diff --git a/final/.arcconfig b/final/.arcconfig
new file mode 100644
index 0000000..bd06ac8
--- /dev/null
+++ b/final/.arcconfig
@@ -0,0 +1,4 @@
+{
+  "repository.callsign" : "OMP",
+  "conduit_uri" : "https://reviews.llvm.org/"
+}
diff --git a/final/.gitignore b/final/.gitignore
new file mode 100644
index 0000000..d4bec15
--- /dev/null
+++ b/final/.gitignore
@@ -0,0 +1,42 @@
+#==============================================================================#
+# This file specifies intentionally untracked files that git should ignore.
+# See: http://www.kernel.org/pub/software/scm/git/docs/gitignore.html
+#
+# This file is intentionally different from the output of `git svn show-ignore`,
+# as most of those are useless.
+#==============================================================================#
+
+#==============================================================================#
+# File extensions to be ignored anywhere in the tree.
+#==============================================================================#
+# Temp files created by most text editors.
+*~
+# Merge files created by git.
+*.orig
+# Byte compiled python modules.
+*.pyc
+# vim swap files
+.*.sw?
+.sw?
+# OS X specific files.
+.DS_store
+
+#==============================================================================#
+# Explicit files to ignore (only matches one).
+#==============================================================================#
+# Various tag programs
+tags
+/TAGS
+/GPATH
+/GRTAGS
+/GSYMS
+/GTAGS
+.gitusers
+
+#==============================================================================#
+# Directories to ignore (do not add trailing '/'s, they skip symlinks).
+#==============================================================================#
+runtime/exports
+
+# Nested build directory
+/build
diff --git a/final/CMakeLists.txt b/final/CMakeLists.txt
new file mode 100644
index 0000000..597eedc
--- /dev/null
+++ b/final/CMakeLists.txt
@@ -0,0 +1,83 @@
+cmake_minimum_required(VERSION 2.8 FATAL_ERROR)
+
+# Prepend this project's cmake/ directory so include() searches it before any previously configured module paths.
+set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})
+
+# Standalone build: OPENMP_STANDALONE_BUILD was preset (llvm/runtimes/ does this) or this is the top-level source directory.
+if (OPENMP_STANDALONE_BUILD OR "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
+  set(OPENMP_STANDALONE_BUILD TRUE)
+  project(openmp C CXX)
+
+  # If CMAKE_BUILD_TYPE is not set (or evaluates to false), default to Release.
+  if (NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release)
+  endif()
+
+  # Common settings, exposed as user-configurable cache entries.
+  set(OPENMP_ENABLE_WERROR FALSE CACHE BOOL
+    "Enable -Werror flags to turn warnings into errors for supporting compilers.")
+  set(OPENMP_LIBDIR_SUFFIX "" CACHE STRING
+    "Suffix of lib installation directory, e.g. 64 => lib64")
+  # Do not use OPENMP_LIBDIR_SUFFIX directly, use OPENMP_INSTALL_LIBDIR (lib plus that suffix).
+  set(OPENMP_INSTALL_LIBDIR "lib${OPENMP_LIBDIR_SUFFIX}")
+
+  # Test settings; the test compilers default to the compilers used for this build.
+  set(OPENMP_TEST_C_COMPILER ${CMAKE_C_COMPILER} CACHE STRING
+    "C compiler to use for testing OpenMP runtime libraries.")
+  set(OPENMP_TEST_CXX_COMPILER ${CMAKE_CXX_COMPILER} CACHE STRING
+    "C++ compiler to use for testing OpenMP runtime libraries.")
+  set(OPENMP_LLVM_TOOLS_DIR "" CACHE PATH "Path to LLVM tools for testing.")
+else()  # In-tree build: derive the settings from the enclosing LLVM configuration.
+  set(OPENMP_ENABLE_WERROR ${LLVM_ENABLE_WERROR})
+  # If building in tree, we honor the same install suffix LLVM uses.
+  set(OPENMP_INSTALL_LIBDIR "lib${LLVM_LIBDIR_SUFFIX}")
+
+  if (NOT MSVC)  # Test with clang/clang++ from LLVM_RUNTIME_OUTPUT_INTDIR; MSVC builds use the .exe names.
+    set(OPENMP_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang)
+    set(OPENMP_TEST_CXX_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++)
+  else()
+    set(OPENMP_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang.exe)
+    set(OPENMP_TEST_CXX_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang++.exe)
+  endif()
+endif()
+
+# Check and set up common compiler flags (modules are located through CMAKE_MODULE_PATH).
+include(config-ix)
+include(HandleOpenMPOptions)
+
+# Set up testing infrastructure; presumably this defines OPENMP_TEST_COMPILER_OPENMP_FLAGS used below (TODO confirm).
+include(OpenMPTesting)
+
+set(OPENMP_TEST_FLAGS "" CACHE STRING
+  "Extra compiler flags to send to the test compiler.")
+set(OPENMP_TEST_OPENMP_FLAGS ${OPENMP_TEST_COMPILER_OPENMP_FLAGS} CACHE STRING
+  "OpenMP compiler flag to use for testing OpenMP runtime libraries.")
+
+
+# Build the host runtime library from the runtime/ subdirectory.
+add_subdirectory(runtime)
+
+
+set(ENABLE_LIBOMPTARGET ON)  # Default value for the OPENMP_ENABLE_LIBOMPTARGET option declared below.
+# Currently libomptarget cannot be compiled on Windows or MacOS X.
+# Since the device plugins are only supported on Linux anyway,
+# there is no point in trying to compile libomptarget on other OSes. C++11 support is also required.
+if (APPLE OR WIN32 OR NOT OPENMP_HAVE_STD_CPP11_FLAG)
+  set(ENABLE_LIBOMPTARGET OFF)
+endif()
+
+option(OPENMP_ENABLE_LIBOMPTARGET "Enable building libomptarget for offloading."
+       ${ENABLE_LIBOMPTARGET})
+if (OPENMP_ENABLE_LIBOMPTARGET)
+  # Check that the library can actually be built; the option may be ON even where the default is OFF.
+  if (APPLE OR WIN32)
+    message(FATAL_ERROR "libomptarget cannot be built on Windows and MacOS X!")
+  elseif (NOT OPENMP_HAVE_STD_CPP11_FLAG)
+    message(FATAL_ERROR "Host compiler must support C++11 to build libomptarget!")
+  endif()
+
+  add_subdirectory(libomptarget)
+endif()
+
+# Now that we have seen all test suites, create the check-openmp target.
+construct_check_openmp_target()
diff --git a/final/CREDITS.txt b/final/CREDITS.txt
new file mode 100644
index 0000000..b14bb9a
--- /dev/null
+++ b/final/CREDITS.txt
@@ -0,0 +1,61 @@
+This file is a partial list of people who have contributed to the LLVM/openmp
+project.  If you have contributed a patch or made some other contribution to
+LLVM/openmp, please submit a patch to this file to add yourself, and it will be
+done!
+
+The list is sorted by surname and formatted to allow easy grepping and
+beautification by scripts.  The fields are: name (N), email (E), web-address
+(W), PGP key ID and fingerprint (P), description (D), and snail-mail address
+(S).
+
+N: Adam Azarchs
+W: 10xgenomics.com
+D: Bug fix for lock code
+
+N: Carlo Bertolli
+W: http://ibm.com
+D: IBM contributor to PowerPC support in CMake files and elsewhere.
+
+N: Diego Caballero
+E: diego.l.caballero@gmail.com
+D: Fork performance improvements
+
+N: Sunita Chandrasekaran
+D: Contributor to testsuite from OpenUH
+
+N: Barbara Chapman
+D: Contributor to testsuite from OpenUH
+
+N: University of Houston
+W: http://web.cs.uh.edu/~openuh/download/
+D: OpenUH test suite
+
+N: Intel Corporation OpenMP runtime team
+W: http://openmprtl.org
+D: Created the runtime.
+
+N: John Mellor-Crummey and other members of the OpenMP Tools Working Group
+E: johnmc@rice.edu
+D: OpenMP Tools Interface (OMPT)
+
+N: Matthias Muller
+D: Contributor to testsuite from OpenUH
+
+N: Tal Nevo
+E: tal@scalemp.com
+D: ScaleMP contributor to improve runtime performance there.
+W: http://scalemp.com
+
+N: Pavel Neytchev
+D: Contributor to testsuite from OpenUH
+
+N: Steven Noonan
+E: steven@uplinklabs.net
+D: Patches for the ARM architecture and removal of several inconsistencies.
+
+N: Alp Toker
+E: alp@nuanti.com
+D: Making build work for FreeBSD.
+
+N: Cheng Wang
+D: Contributor to testsuite from OpenUH
diff --git a/final/LICENSE.txt b/final/LICENSE.txt
new file mode 100644
index 0000000..9907566
--- /dev/null
+++ b/final/LICENSE.txt
@@ -0,0 +1,361 @@
+==============================================================================
+The LLVM Project is under the Apache License v2.0 with LLVM Exceptions:
+==============================================================================
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+    1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+    2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+    3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+    4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+    5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+    6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+    7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+    8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+    9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+    END OF TERMS AND CONDITIONS
+
+    APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+    Copyright [yyyy] [name of copyright owner]
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+
+---- LLVM Exceptions to the Apache 2.0 License ----
+
+As an exception, if, as a result of your compiling your source code, portions
+of this Software are embedded into an Object form of such source code, you
+may redistribute such embedded portions in such Object form without complying
+with the conditions of Sections 4(a), 4(b) and 4(d) of the License.
+
+In addition, if you combine or link compiled forms of this Software with
+software that is licensed under the GPLv2 ("Combined Software") and if a
+court of competent jurisdiction determines that the patent provision (Section
+3), the indemnity provision (Section 9) or other Section of the License
+conflicts with the conditions of the GPLv2, you may retroactively and
+prospectively choose to deem waived or otherwise exclude such Section(s) of
+the License, but only in their entirety and only with respect to the Combined
+Software.
+
+==============================================================================
+Software from third parties included in the LLVM Project:
+==============================================================================
+The LLVM Project contains third party software which is under different license
+terms. All such code will be identified clearly using at least one of two
+mechanisms:
+1) It will be in a separate directory tree with its own `LICENSE.txt` or
+   `LICENSE` file at the top containing the specific license and restrictions
+   which apply to that software, or
+2) It will contain specific license and restriction terms at the top of every
+   file.
+
+==============================================================================
+Legacy LLVM License (https://llvm.org/docs/DeveloperPolicy.html#legacy):
+==============================================================================
+
+The software contained in this directory tree is dual licensed under both the
+University of Illinois "BSD-Like" license and the MIT license.  As a user of
+this code you may choose to use it under either license.  As a contributor,
+you agree to allow your code to be used under both.  The full text of the
+relevant licenses is included below.
+
+In addition, a license agreement from the copyright/patent holders of the
+software contained in this directory tree is included below.
+
+==============================================================================
+
+University of Illinois/NCSA
+Open Source License
+
+Copyright (c) 1997-2019 Intel Corporation
+
+All rights reserved.
+
+Developed by:
+    OpenMP Runtime Team
+    Intel Corporation
+    http://www.openmprtl.org
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal with
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimers.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimers in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the names of Intel Corporation OpenMP Runtime Team nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this Software without specific prior written permission.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+SOFTWARE.
+
+==============================================================================
+
+Copyright (c) 1997-2019 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+==============================================================================
+
+Intel Corporation
+
+Software Grant License Agreement ("Agreement")
+
+Except for the license granted herein to you, Intel Corporation ("Intel") reserves
+all right, title, and interest in and to the Software (defined below).
+
+Definition
+
+"Software" means the code and documentation as well as any original work of
+authorship, including any modifications or additions to an existing work, that
+is intentionally submitted by Intel to llvm.org (http://llvm.org) ("LLVM") for
+inclusion in, or documentation of, any of the products owned or managed by LLVM
+(the "Work"). For the purposes of this definition, "submitted" means any form of
+electronic, verbal, or written communication sent to LLVM or its
+representatives, including but not limited to communication on electronic
+mailing lists, source code control systems, and issue tracking systems that are
+managed by, or on behalf of, LLVM for the purpose of discussing and improving
+the Work, but excluding communication that is conspicuously marked otherwise.
+
+1. Grant of Copyright License. Subject to the terms and conditions of this
+   Agreement, Intel hereby grants to you and to recipients of the Software
+   distributed by LLVM a perpetual, worldwide, non-exclusive, no-charge,
+   royalty-free, irrevocable copyright license to reproduce, prepare derivative
+   works of, publicly display, publicly perform, sublicense, and distribute the
+   Software and such derivative works.
+
+2. Grant of Patent License. Subject to the terms and conditions of this
+   Agreement, Intel hereby grants you and to recipients of the Software
+   distributed by LLVM a perpetual, worldwide, non-exclusive, no-charge,
+   royalty-free, irrevocable (except as stated in this section) patent license
+   to make, have made, use, offer to sell, sell, import, and otherwise transfer
+   the Work, where such license applies only to those patent claims licensable
+   by Intel that are necessarily infringed by Intel's Software alone or by
+   combination of the Software with the Work to which such Software was
+   submitted. If any entity institutes patent litigation against Intel or any
+   other entity (including a cross-claim or counterclaim in a lawsuit) alleging
+   that Intel's Software, or the Work to which Intel has contributed constitutes
+   direct or contributory patent infringement, then any patent licenses granted
+   to that entity under this Agreement for the Software or Work shall terminate
+   as of the date such litigation is filed.
+
+Unless required by applicable law or agreed to in writing, the software is
+provided on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+either express or implied, including, without limitation, any warranties or
+conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+PARTICULAR PURPOSE.
+
+==============================================================================
diff --git a/final/README.rst b/final/README.rst
new file mode 100644
index 0000000..5e28b9a
--- /dev/null
+++ b/final/README.rst
@@ -0,0 +1,339 @@
+========================================
+How to Build the LLVM* OpenMP* Libraries
+========================================
+This repository requires `CMake <http://www.cmake.org/>`_ v2.8.0 or later.  LLVM
+and Clang need a more recent version which also applies for in-tree builds.  For
+more information than available in this document please see
+`LLVM's CMake documentation <http://llvm.org/docs/CMake.html>`_ and the
+`official documentation <https://cmake.org/cmake/help/v2.8.0/cmake.html>`_.
+
+.. contents::
+   :local:
+
+How to Call CMake Initially, then Repeatedly
+============================================
+- When calling CMake for the first time, all needed compiler options must be
+  specified on the command line.  After this initial call to CMake, the compiler
+  definitions must not be included for further calls to CMake.  Other options
+  can be specified on the command line multiple times including all definitions
+  in the build options section below.
+- Example of configuring, building, reconfiguring, rebuilding:
+
+  .. code-block:: console
+
+    $ mkdir build
+    $ cd build
+    $ cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ ..  # Initial configuration
+    $ make
+    ...
+    $ make clean
+    $ cmake -DCMAKE_BUILD_TYPE=Debug ..                               # Second configuration
+    $ make
+    ...
+    $ rm -rf *
+    $ cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ ..        # Third configuration
+    $ make
+
+- Notice in the example how the compiler definitions are only specified for an
+  empty build directory, but other build options are used at any time.
+- The file ``CMakeCache.txt`` which is created after the first call to CMake is
+  a configuration file which holds all values for the build options.  These
+  values can be changed using a text editor to modify ``CMakeCache.txt`` as
+  opposed to using definitions on the command line.
+- To have CMake create a particular type of build generator file simply include
+  the ``-G <Generator name>`` option:
+
+  .. code-block:: console
+
+    $ cmake -G "Unix Makefiles" ...
+
+  You can see a list of generators CMake supports by executing the cmake command
+  with no arguments.
+
+Instructions to Build
+=====================
+.. code-block:: console
+
+ $ cd openmp_top_level/ [ this directory with libomptarget/, runtime/, etc. ]
+ $ mkdir build
+ $ cd build
+
+ [ Unix* Libraries ]
+ $ cmake -DCMAKE_C_COMPILER=<C Compiler> -DCMAKE_CXX_COMPILER=<C++ Compiler> ..
+
+ [ Windows* Libraries ]
+ $ cmake -G <Generator Type> -DCMAKE_C_COMPILER=<C Compiler> -DCMAKE_CXX_COMPILER=<C++ Compiler> -DCMAKE_ASM_MASM_COMPILER=[ml | ml64] -DCMAKE_BUILD_TYPE=Release ..
+
+ $ make
+ $ make install
+
+CMake Options
+=============
+Builds with CMake can be customized by means of options as already seen above.
+One possibility is to pass them via the command line:
+
+.. code-block:: console
+
+  $ cmake -DOPTION=<value> path/to/source
+
+.. note:: The first value listed is the respective default for that option.
+
+Generic Options
+---------------
+For full documentation consult the CMake manual or execute
+``cmake --help-variable VARIABLE_NAME`` to get information about a specific
+variable.
+
+**CMAKE_BUILD_TYPE** = ``Release|Debug|RelWithDebInfo``
+  Build type can be ``Release``, ``Debug``, or ``RelWithDebInfo`` which chooses
+  the optimization level and presence of debugging symbols.
+
+**CMAKE_C_COMPILER** = <C compiler name>
+  Specify the C compiler.
+
+**CMAKE_CXX_COMPILER** = <C++ compiler name>
+  Specify the C++ compiler.
+
+**CMAKE_Fortran_COMPILER** = <Fortran compiler name>
+  Specify the Fortran compiler. This option is only needed when
+  **LIBOMP_FORTRAN_MODULES** is ``ON`` (see below).  So typically, a Fortran
+  compiler is not needed during the build.
+
+**CMAKE_ASM_MASM_COMPILER** = ``ml|ml64``
+  This option is only relevant for Windows*.
+
+Options for all Libraries
+-------------------------
+
+**OPENMP_ENABLE_WERROR** = ``OFF|ON``
+  Treat warnings as errors and fail, if a compiler warning is triggered.
+
+**OPENMP_LIBDIR_SUFFIX** = ``""``
+  Extra suffix to append to the directory where libraries are to be installed.
+
+**OPENMP_TEST_C_COMPILER** = ``${CMAKE_C_COMPILER}``
+  Compiler to use for testing. Defaults to the compiler that was also used for
+  building.
+
+**OPENMP_TEST_CXX_COMPILER** = ``${CMAKE_CXX_COMPILER}``
+  Compiler to use for testing. Defaults to the compiler that was also used for
+  building.
+
+**OPENMP_LLVM_TOOLS_DIR** = ``/path/to/built/llvm/tools``
+  Additional path to search for LLVM tools needed by tests.
+
+**OPENMP_LLVM_LIT_EXECUTABLE** = ``/path/to/llvm-lit``
+  Specify full path to ``llvm-lit`` executable for running tests.  The default
+  is to search the ``PATH`` and the directory in **OPENMP_LLVM_TOOLS_DIR**.
+
+**OPENMP_FILECHECK_EXECUTABLE** = ``/path/to/FileCheck``
+  Specify full path to ``FileCheck`` executable for running tests.  The default
+  is to search the ``PATH`` and the directory in **OPENMP_LLVM_TOOLS_DIR**.
+
+Options for ``libomp``
+----------------------
+
+**LIBOMP_ARCH** = ``aarch64|arm|i386|mic|mips|mips64|ppc64|ppc64le|x86_64``
+  The default value for this option is chosen based on probing the compiler for
+  architecture macros (e.g., is ``__x86_64__`` predefined by compiler?).
+
+**LIBOMP_MIC_ARCH** = ``knc|knf``
+  Intel(R) Many Integrated Core Architecture (Intel(R) MIC Architecture) to
+  build for.  This value is ignored if **LIBOMP_ARCH** does not equal ``mic``.
+
+**LIBOMP_LIB_TYPE** = ``normal|profile|stubs``
+  Library type can be ``normal``, ``profile``, or ``stubs``.
+
+**LIBOMP_USE_VERSION_SYMBOLS** = ``ON|OFF``
+  Use versioned symbols for building the library.  This option only makes sense
+  for ELF based libraries where version symbols are supported (Linux*, some BSD*
+  variants).  It is ``OFF`` by default for Windows* and macOS*, but ``ON`` for
+  other Unix based operating systems.
+
+**LIBOMP_ENABLE_SHARED** = ``ON|OFF``
+  Build a shared library.  If this option is ``OFF``, static OpenMP libraries
+  will be built instead of dynamic ones.
+
+  .. note::
+
+    Static libraries are not supported on Windows*.
+
+**LIBOMP_FORTRAN_MODULES** = ``OFF|ON``
+  Create the Fortran modules (requires Fortran compiler).
+
+macOS* Fat Libraries
+""""""""""""""""""""
+On macOS* machines, it is possible to build universal (or fat) libraries which
+include both i386 and x86_64 architecture objects in a single archive.
+
+.. code-block:: console
+
+  $ cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_OSX_ARCHITECTURES='i386;x86_64' ..
+  $ make
+
+There is also an option **LIBOMP_OSX_ARCHITECTURES** which can be set in case
+this is an LLVM source tree build. It only applies to the ``libomp`` library and
+avoids having the entire LLVM/Clang build produce universal binaries.
+
+Optional Features
+"""""""""""""""""
+
+**LIBOMP_USE_ADAPTIVE_LOCKS** = ``ON|OFF``
+  Include adaptive locks, based on Intel(R) Transactional Synchronization
+  Extensions (Intel(R) TSX).  This feature is x86 specific and turned ``ON``
+  by default for IA-32 architecture and Intel(R) 64 architecture.
+
+**LIBOMP_USE_INTERNODE_ALIGNMENT** = ``OFF|ON``
+  Align certain data structures on 4096-byte boundaries.  This is useful on
+  multi-node systems where a small ``CACHE_LINE`` setting leads to false sharing.
+
+**LIBOMP_OMPT_SUPPORT** = ``ON|OFF``
+  Include support for the OpenMP Tools Interface (OMPT).
+  This option is supported and ``ON`` by default for x86, x86_64, AArch64, and
+  PPC64 on Linux* and macOS*.
+  This option is ``OFF`` if this feature is not supported for the platform.
+
+**LIBOMP_OMPT_OPTIONAL** = ``ON|OFF``
+  Include support for optional OMPT functionality.  This option is ignored if
+  **LIBOMP_OMPT_SUPPORT** is ``OFF``.
+
+**LIBOMP_STATS** = ``OFF|ON``
+  Include stats-gathering code.
+
+**LIBOMP_USE_DEBUGGER** = ``OFF|ON``
+  Include the friendly debugger interface.
+
+**LIBOMP_USE_HWLOC** = ``OFF|ON``
+  Use `OpenMPI's hwloc library <https://www.open-mpi.org/projects/hwloc/>`_ for
+  topology detection and affinity.
+
+**LIBOMP_HWLOC_INSTALL_DIR** = ``/path/to/hwloc/install/dir``
+  Specify install location of hwloc.  The configuration system will look for
+  ``hwloc.h`` in ``${LIBOMP_HWLOC_INSTALL_DIR}/include`` and the library in
+  ``${LIBOMP_HWLOC_INSTALL_DIR}/lib``.  The default is ``/usr/local``.
+  This option is only used if **LIBOMP_USE_HWLOC** is ``ON``.
+
+Additional Compiler Flags
+"""""""""""""""""""""""""
+
+These flags are **appended**, they do not overwrite any of the preset flags.
+
+**LIBOMP_CPPFLAGS** = <space-separated flags>
+  Additional C preprocessor flags.
+
+**LIBOMP_CFLAGS** = <space-separated flags>
+  Additional C compiler flags.
+
+**LIBOMP_CXXFLAGS** = <space-separated flags>
+  Additional C++ compiler flags.
+
+**LIBOMP_ASMFLAGS** = <space-separated flags>
+  Additional assembler flags.
+
+**LIBOMP_LDFLAGS** = <space-separated flags>
+  Additional linker flags.
+
+**LIBOMP_LIBFLAGS** = <space-separated flags>
+  Additional libraries to link.
+
+**LIBOMP_FFLAGS** = <space-separated flags>
+  Additional Fortran compiler flags.
+
+Options for ``libomptarget``
+----------------------------
+
+**LIBOMPTARGET_OPENMP_HEADER_FOLDER** = ``""``
+  Path of the folder that contains ``omp.h``.  This is required for testing
+  out-of-tree builds.
+
+**LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER** = ``""``
+  Path of the folder that contains ``libomp.so``.  This is required for testing
+  out-of-tree builds.
+
+Options for ``NVPTX device RTL``
+--------------------------------
+
+**LIBOMPTARGET_NVPTX_ENABLE_BCLIB** = ``ON|OFF``
+  Enable CUDA LLVM bitcode offloading device RTL. This is used for link time
+  optimization of the OMP runtime and application code. This option is enabled
+  by default if the build system determines that ``CMAKE_C_COMPILER`` is able to
+  compile and link the library.
+
+**LIBOMPTARGET_NVPTX_CUDA_COMPILER** = ``""``
+  Location of a CUDA compiler capable of emitting LLVM bitcode. Currently only
+  the Clang compiler is supported. This is only used when building the CUDA LLVM
+  bitcode offloading device RTL. If unspecified and the CMake C compiler is
+  Clang, then Clang is used.
+
+**LIBOMPTARGET_NVPTX_BC_LINKER** = ``""``
+  Location of a linker capable of linking LLVM bitcode objects. This is only
+  used when building the CUDA LLVM bitcode offloading device RTL. If unspecified
+  and the CMake C compiler is Clang and there exists a llvm-link binary in the
+  directory containing Clang, then this llvm-link binary is used.
+
+**LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER** = ``""``
+  Host compiler to use with NVCC. This compiler is not going to be used to
+  produce any binary. Instead, this is used to overcome the input compiler
+  checks done by NVCC. E.g. if using a default host compiler that is not
+  compatible with NVCC, this option can be used to pass to NVCC a valid compiler
+  to avoid the error.
+
+**LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES** = ``35``
+  List of CUDA compute capabilities that should be supported by the NVPTX
+  device RTL. E.g. for compute capabilities 6.0 and 7.0, the option "60,70"
+  should be used. Compute capability 3.5 is the minimum required.
+
+**LIBOMPTARGET_NVPTX_DEBUG** = ``OFF|ON``
+  Enable printing of debug messages from the NVPTX device RTL.
+
+Example Usages of CMake
+=======================
+
+Typical Invocations
+-------------------
+
+.. code-block:: console
+
+  $ cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ ..
+  $ cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ ..
+  $ cmake -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc ..
+
+Advanced Builds with Various Options
+------------------------------------
+
+- Build the i386 Linux* library using GCC*
+
+  .. code-block:: console
+
+    $ cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DLIBOMP_ARCH=i386 ..
+
+- Build the x86_64 debug Mac library using Clang*
+
+  .. code-block:: console
+
+    $ cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DLIBOMP_ARCH=x86_64 -DCMAKE_BUILD_TYPE=Debug ..
+
+- Build the library (architecture determined by probing compiler) using the
+  Intel(R) C Compiler and the Intel(R) C++ Compiler.  Also, create Fortran
+  modules with the Intel(R) Fortran Compiler.
+
+  .. code-block:: console
+
+    $ cmake -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DCMAKE_Fortran_COMPILER=ifort -DLIBOMP_FORTRAN_MODULES=on ..
+
+- Have CMake find the C/C++ compiler and specify additional flags for the C
+  compiler, preprocessor, and C++ compiler.
+
+  .. code-block:: console
+
+    $ cmake -DLIBOMP_CFLAGS='-specific-flag' -DLIBOMP_CPPFLAGS='-DNEW_FEATURE=1 -DOLD_FEATURE=0' -DLIBOMP_CXXFLAGS='--one-specific-flag --two-specific-flag' ..
+
+- Build the stubs library
+
+  .. code-block:: console
+
+    $ cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DLIBOMP_LIB_TYPE=stubs ..
+
+**Footnotes**
+
+.. [*] Other names and brands may be claimed as the property of others.
diff --git a/final/cmake/DetectTestCompiler/CMakeLists.txt b/final/cmake/DetectTestCompiler/CMakeLists.txt
new file mode 100644
index 0000000..1fd7cc7
--- /dev/null
+++ b/final/cmake/DetectTestCompiler/CMakeLists.txt
@@ -0,0 +1,43 @@
+cmake_minimum_required(VERSION 2.8)
+project(DetectTestCompiler C CXX)  # helper project; enables only the C and C++ compilers
+
+include(CheckCCompilerFlag)    # provides check_c_compiler_flag()
+include(CheckCXXCompilerFlag)  # provides check_cxx_compiler_flag()
+
+# Record "<compiler path>;<id>;<version>;<flags>" for ${lang} in ${lang}CompilerInformation.txt.
+function(write_compiler_information lang)
+  set(record "${CMAKE_${lang}_COMPILER}\\;${CMAKE_${lang}_COMPILER_ID}")
+  set(record "${record}\\;${CMAKE_${lang}_COMPILER_VERSION}\\;${${lang}_FLAGS}")
+  # Expanded unquoted on purpose: every "\;" reaches file(WRITE) as a plain ";".
+  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/${lang}CompilerInformation.txt ${record})
+endfunction()
+
+find_package(OpenMP)  # optional; on success it provides OpenMP_C_FLAGS / OpenMP_CXX_FLAGS
+if (NOT OpenMP_FOUND AND NOT OPENMP_FOUND)  # names are case-sensitive; "OpenMP_Found" is never set
+  set(OpenMP_C_FLAGS "-fopenmp")  # fallback used only when detection failed
+  set(OpenMP_CXX_FLAGS "-fopenmp")
+endif()
+
+set(CMAKE_THREAD_PREFER_PTHREAD TRUE)  # FindThreads hint: prefer the pthread library
+set(THREADS_PREFER_PTHREAD_FLAG TRUE)  # FindThreads hint: prefer the -pthread flag
+find_package(Threads REQUIRED)
+
+set(C_FLAGS "${OpenMP_C_FLAGS} ${CMAKE_THREAD_LIBS_INIT}")  # read back as <lang>_FLAGS by write_compiler_information
+set(CXX_FLAGS "${OpenMP_CXX_FLAGS} ${CMAKE_THREAD_LIBS_INIT}")
+
+# TODO: Implement blockaddress in GlobalISel and remove this flag!
+# Clang only: for each language whose compiler accepts -fno-experimental-isel,
+# prepend that flag to the recorded C_FLAGS / CXX_FLAGS.
+if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+  check_c_compiler_flag("-fno-experimental-isel" C_HAS_EXPERIMENTAL_ISEL_FLAG)
+  check_cxx_compiler_flag("-fno-experimental-isel" CXX_HAS_EXPERIMENTAL_ISEL_FLAG)
+
+  foreach(lang C CXX)
+    if (${lang}_HAS_EXPERIMENTAL_ISEL_FLAG)
+      set(${lang}_FLAGS "-fno-experimental-isel ${${lang}_FLAGS}")
+    endif()
+  endforeach()
+endif()
+
+write_compiler_information(C)  # writes CCompilerInformation.txt into the current binary dir
+write_compiler_information(CXX)  # writes CXXCompilerInformation.txt into the current binary dir
diff --git a/final/cmake/HandleOpenMPOptions.cmake b/final/cmake/HandleOpenMPOptions.cmake
new file mode 100644
index 0000000..97b616e
--- /dev/null
+++ b/final/cmake/HandleOpenMPOptions.cmake
@@ -0,0 +1,19 @@
+if (${OPENMP_STANDALONE_BUILD})
+  function(append_if condition value)  # from HandleLLVMOptions.cmake; remaining arguments name the variables to extend
+    if (NOT ${condition})
+      return()  # condition is false: leave every variable untouched
+    endif()
+    foreach(flags_var ${ARGN})
+      set(${flags_var} "${${flags_var}} ${value}" PARENT_SCOPE)  # append " <value>" in the caller's scope
+    endforeach()
+  endfunction()
+endif()
+
+if (${OPENMP_ENABLE_WERROR})
+  append_if(OPENMP_HAVE_WERROR_FLAG "-Werror" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)  # appended only when OPENMP_HAVE_WERROR_FLAG is true
+endif()
+
+append_if(OPENMP_HAVE_STD_GNUPP11_FLAG "-std=gnu++11" CMAKE_CXX_FLAGS)  # first choice for the C++11 dialect
+if (NOT OPENMP_HAVE_STD_GNUPP11_FLAG)
+  append_if(OPENMP_HAVE_STD_CPP11_FLAG "-std=c++11" CMAKE_CXX_FLAGS)  # tried only when -std=gnu++11 is unavailable
+endif()
diff --git a/final/cmake/OpenMPTesting.cmake b/final/cmake/OpenMPTesting.cmake
new file mode 100644
index 0000000..52e68aa
--- /dev/null
+++ b/final/cmake/OpenMPTesting.cmake
@@ -0,0 +1,191 @@
+# Keep track if we have all dependencies; the checks in this file reset it to FALSE when one is missing.
+set(ENABLE_CHECK_TARGETS TRUE)
+
+# Find what standalone testing needs (Python, llvm-lit, FileCheck); on the first miss, warn and set ENABLE_CHECK_TARGETS to FALSE in the caller's scope.
+function(find_standalone_test_dependencies)
+  include(FindPythonInterp)  # sets PYTHONINTERP_FOUND; PYTHON_EXECUTABLE is used later to run lit
+
+  if (NOT PYTHONINTERP_FOUND)
+    message(STATUS "Could not find Python.")
+    message(WARNING "The check targets will not be available!")
+    set(ENABLE_CHECK_TARGETS FALSE PARENT_SCOPE)  # functions have their own scope, so publish to the caller
+    return()
+  endif()
+
+  # Find executables. OPENMP_LLVM_TOOLS_DIR is passed as an extra search path.
+  find_program(OPENMP_LLVM_LIT_EXECUTABLE
+    NAMES llvm-lit lit.py lit
+    PATHS ${OPENMP_LLVM_TOOLS_DIR})
+  if (NOT OPENMP_LLVM_LIT_EXECUTABLE)
+    message(STATUS "Cannot find llvm-lit.")
+    message(STATUS "Please put llvm-lit in your PATH, set OPENMP_LLVM_LIT_EXECUTABLE to its full path, or point OPENMP_LLVM_TOOLS_DIR to its directory.")
+    message(WARNING "The check targets will not be available!")
+    set(ENABLE_CHECK_TARGETS FALSE PARENT_SCOPE)
+    return()  # skip the FileCheck lookup once lit is known to be missing
+  endif()
+
+  find_program(OPENMP_FILECHECK_EXECUTABLE
+    NAMES FileCheck
+    PATHS ${OPENMP_LLVM_TOOLS_DIR})
+  if (NOT OPENMP_FILECHECK_EXECUTABLE)
+    message(STATUS "Cannot find FileCheck.")
+    message(STATUS "Please put FileCheck in your PATH, set OPENMP_FILECHECK_EXECUTABLE to its full path, or point OPENMP_LLVM_TOOLS_DIR to its directory.")
+    message(WARNING "The check targets will not be available!")
+    set(ENABLE_CHECK_TARGETS FALSE PARENT_SCOPE)
+    return()
+  endif()
+endfunction()
+
+if (${OPENMP_STANDALONE_BUILD})
+  find_standalone_test_dependencies()
+  # Make sure we can use the console pool for recent CMake and Ninja > 1.5.
+  if (NOT CMAKE_VERSION VERSION_LESS 3.1.20141117)
+    set(cmake_3_2_USES_TERMINAL USES_TERMINAL)
+  else()
+    set(cmake_3_2_USES_TERMINAL)
+  endif()
+
+  # Set lit arguments; MSVC and Xcode additionally get --no-progress-bar.
+  if (MSVC OR XCODE)
+    set(DEFAULT_LIT_ARGS "-sv --show-unsupported --show-xfail --no-progress-bar")
+  else()
+    set(DEFAULT_LIT_ARGS "-sv --show-unsupported --show-xfail")
+  endif()
+  set(OPENMP_LIT_ARGS "${DEFAULT_LIT_ARGS}" CACHE STRING "Options for lit.")
+  separate_arguments(OPENMP_LIT_ARGS)
+else()
+  set(OPENMP_FILECHECK_EXECUTABLE ${LLVM_RUNTIME_OUTPUT_INTDIR}/FileCheck)
+endif()
+
+# Macro (no own scope, so results land in the caller) that reads <file> and sets OPENMP_TEST_<lang>_COMPILER_PATH, _ID, _VERSION and _OPENMP_FLAGS.
+macro(extract_test_compiler_information lang file)
+  file(READ ${file} information)  # the content is then indexed as a CMake list with four fields
+  list(GET information 0 path)
+  list(GET information 1 id)
+  list(GET information 2 version)
+  list(GET information 3 openmp_flags)
+
+  set(OPENMP_TEST_${lang}_COMPILER_PATH ${path})  # the temporaries above stay visible in the caller as well
+  set(OPENMP_TEST_${lang}_COMPILER_ID ${id})
+  set(OPENMP_TEST_${lang}_COMPILER_VERSION ${version})
+  set(OPENMP_TEST_${lang}_COMPILER_OPENMP_FLAGS ${openmp_flags})
+endmacro()
+
+# Read the C and C++ compiler records found in <dir> and publish the common test compiler details.
+function(set_test_compiler_information dir)
+  extract_test_compiler_information(C ${dir}/CCompilerInformation.txt)
+  extract_test_compiler_information(CXX ${dir}/CXXCompilerInformation.txt)
+  if ("${OPENMP_TEST_C_COMPILER_ID}" STREQUAL "${OPENMP_TEST_CXX_COMPILER_ID}" AND
+      "${OPENMP_TEST_C_COMPILER_VERSION}" STREQUAL "${OPENMP_TEST_CXX_COMPILER_VERSION}")
+    # ID and version agree for both languages: export the C compiler's values to the caller.
+    set(OPENMP_TEST_COMPILER_ID "${OPENMP_TEST_C_COMPILER_ID}" PARENT_SCOPE)
+    set(OPENMP_TEST_COMPILER_VERSION "${OPENMP_TEST_C_COMPILER_VERSION}" PARENT_SCOPE)
+    set(OPENMP_TEST_COMPILER_OPENMP_FLAGS "${OPENMP_TEST_C_COMPILER_OPENMP_FLAGS}" PARENT_SCOPE)
+    # Derive the "major" and "major.minor" prefixes of the version string.
+    string(REGEX MATCH "[0-9]+" ver_major "${OPENMP_TEST_C_COMPILER_VERSION}")
+    string(REGEX MATCH "[0-9]+\\.[0-9]+" ver_major_minor "${OPENMP_TEST_C_COMPILER_VERSION}")
+    set(OPENMP_TEST_COMPILER_VERSION_MAJOR "${ver_major}" PARENT_SCOPE)
+    set(OPENMP_TEST_COMPILER_VERSION_MAJOR_MINOR "${ver_major_minor}" PARENT_SCOPE)
+  else()
+    message(STATUS "Test compilers for C and C++ don't match.")
+    message(WARNING "The check targets will not be available!")
+    set(ENABLE_CHECK_TARGETS FALSE PARENT_SCOPE)
+  endif()
+endfunction()
+
+if (${OPENMP_STANDALONE_BUILD})
+  # Detect compiler that should be used for testing.
+  # We cannot use ExternalProject_Add() because its configuration runs when this
+  # project is built which is too late for detecting the compiler...
+  file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/DetectTestCompiler)
+  execute_process(
+    COMMAND ${CMAKE_COMMAND} -G${CMAKE_GENERATOR} ${CMAKE_CURRENT_LIST_DIR}/DetectTestCompiler
+      -DCMAKE_C_COMPILER=${OPENMP_TEST_C_COMPILER}
+      -DCMAKE_CXX_COMPILER=${OPENMP_TEST_CXX_COMPILER}
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/DetectTestCompiler
+    RESULT_VARIABLE DETECT_COMPILER_RESULT
+    OUTPUT_VARIABLE DETECT_COMPILER_OUT
+    ERROR_VARIABLE DETECT_COMPILER_ERR)
+  if (NOT DETECT_COMPILER_RESULT)
+    set_test_compiler_information(${CMAKE_CURRENT_BINARY_DIR}/DetectTestCompiler)
+  else()
+    message(STATUS "Could not detect test compilers.")
+    message(WARNING "The check targets will not be available!")
+    set(ENABLE_CHECK_TARGETS FALSE)
+  endif()
+else()
+  # Set the information that we know.
+  set(OPENMP_TEST_COMPILER_ID "Clang")
+  # Cannot use CLANG_VERSION because we are not guaranteed that this is already set.
+  set(OPENMP_TEST_COMPILER_VERSION "${LLVM_VERSION}")
+  set(OPENMP_TEST_COMPILER_VERSION_MAJOR "${LLVM_MAJOR_VERSION}")
+  set(OPENMP_TEST_COMPILER_VERSION_MAJOR_MINOR "${LLVM_MAJOR_VERSION}.${LLVM_MINOR_VERSION}")
+  # Unfortunately the top-level cmake/config-ix.cmake file mangles CMake's
+  # CMAKE_THREAD_LIBS_INIT variable from the FindThreads package, so work
+  # around that, until it is fixed there: a bare -lpthread is replaced by
+  # -pthread, any other value is passed through unchanged.
+  set(OPENMP_TEST_COMPILER_THREAD_FLAGS "${CMAKE_THREAD_LIBS_INIT}")
+  if("${CMAKE_THREAD_LIBS_INIT}" STREQUAL "-lpthread")
+    set(OPENMP_TEST_COMPILER_THREAD_FLAGS "-pthread")
+  endif()
+  # TODO: Implement blockaddress in GlobalISel and remove this flag!
+  set(OPENMP_TEST_COMPILER_OPENMP_FLAGS "-fopenmp ${OPENMP_TEST_COMPILER_THREAD_FLAGS} -fno-experimental-isel")
+endif()
+
+# Function to set compiler features for use in lit.
+function(set_test_compiler_features)
+  # Default: the lowercase compiler ID; GNU and Intel are reported as gcc and icc instead.
+  string(TOLOWER "${OPENMP_TEST_COMPILER_ID}" comp)
+  if ("${OPENMP_TEST_COMPILER_ID}" STREQUAL "GNU")
+    set(comp "gcc")
+  elseif ("${OPENMP_TEST_COMPILER_ID}" STREQUAL "Intel")
+    set(comp "icc")
+  endif()
+  set(versioned "'${comp}-${OPENMP_TEST_COMPILER_VERSION_MAJOR}', '${comp}-${OPENMP_TEST_COMPILER_VERSION_MAJOR_MINOR}', '${comp}-${OPENMP_TEST_COMPILER_VERSION}'")
+  set(OPENMP_TEST_COMPILER_FEATURES "['${comp}', ${versioned}]" PARENT_SCOPE)
+endfunction()
+set_test_compiler_features()
+
+# Add a lit testsuite target: add_openmp_testsuite(<target> <comment> <testsuite>... [DEPENDS <dep>...] [ARGS <lit-arg>...]).
+function(add_openmp_testsuite target comment)
+  if (NOT ENABLE_CHECK_TARGETS)
+    add_custom_target(${target}
+      COMMAND ${CMAKE_COMMAND} -E echo "${target} does nothing, dependencies not found.")  # placeholder target that only prints a notice
+    message(STATUS "${target} does nothing.")
+    return()
+  endif()
+
+  cmake_parse_arguments(ARG "" "" "DEPENDS;ARGS" ${ARGN})  # yields ARG_DEPENDS, ARG_ARGS and ARG_UNPARSED_ARGUMENTS (the testsuites)
+  # EXCLUDE_FROM_ALL excludes the test ${target} out of check-openmp.
+  if (NOT EXCLUDE_FROM_ALL)  # read from the calling scope; construct_check_openmp_target() sets it before calling
+    # Register the testsuites and depends for the check-openmp rule.
+    set_property(GLOBAL APPEND PROPERTY OPENMP_LIT_TESTSUITES ${ARG_UNPARSED_ARGUMENTS})
+    set_property(GLOBAL APPEND PROPERTY OPENMP_LIT_DEPENDS ${ARG_DEPENDS})
+  endif()
+
+  if (${OPENMP_STANDALONE_BUILD})
+    set(LIT_ARGS ${OPENMP_LIT_ARGS} ${ARG_ARGS})  # cached defaults first, then the per-suite arguments
+    add_custom_target(${target}
+      COMMAND ${PYTHON_EXECUTABLE} ${OPENMP_LLVM_LIT_EXECUTABLE} ${LIT_ARGS} ${ARG_UNPARSED_ARGUMENTS}
+      COMMENT ${comment}
+      DEPENDS ${ARG_DEPENDS}
+      ${cmake_3_2_USES_TERMINAL}  # expands to USES_TERMINAL or to nothing, depending on the CMake version
+    )
+  else()
+    add_lit_testsuite(${target}  # not defined in this file; presumably provided by the enclosing LLVM build
+      ${comment}
+      ${ARG_UNPARSED_ARGUMENTS}
+      DEPENDS clang clang-resource-headers FileCheck ${ARG_DEPENDS}
+      ARGS ${ARG_ARGS}
+    )
+  endif()
+endfunction()
+
+function(construct_check_openmp_target)  # defines check-openmp from every registered testsuite
+  # We already added the testsuites themselves, no need to do that again.
+  set(EXCLUDE_FROM_ALL True)
+
+  get_property(all_suites GLOBAL PROPERTY OPENMP_LIT_TESTSUITES)
+  get_property(all_depends GLOBAL PROPERTY OPENMP_LIT_DEPENDS)
+  add_openmp_testsuite(check-openmp "Running OpenMP tests" ${all_suites} DEPENDS ${all_depends})
+endfunction()
diff --git a/final/cmake/config-ix.cmake b/final/cmake/config-ix.cmake
new file mode 100644
index 0000000..13eace9
--- /dev/null
+++ b/final/cmake/config-ix.cmake
@@ -0,0 +1,7 @@
+include(CheckCCompilerFlag)    # provides check_c_compiler_flag()
+include(CheckCXXCompilerFlag)  # provides check_cxx_compiler_flag()
+
+check_c_compiler_flag(-Werror OPENMP_HAVE_WERROR_FLAG)  # probed with the C compiler only
+
+check_cxx_compiler_flag(-std=gnu++11 OPENMP_HAVE_STD_GNUPP11_FLAG)
+check_cxx_compiler_flag(-std=c++11 OPENMP_HAVE_STD_CPP11_FLAG)
diff --git a/final/libomptarget/CMakeLists.txt b/final/libomptarget/CMakeLists.txt
new file mode 100644
index 0000000..a953662
--- /dev/null
+++ b/final/libomptarget/CMakeLists.txt
@@ -0,0 +1,84 @@
+##===----------------------------------------------------------------------===##
+# 
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# 
+##===----------------------------------------------------------------------===##
+#
+# Build offloading library and related plugins.
+#
+##===----------------------------------------------------------------------===##
+
+if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")  # true when this directory is the top of the configured tree
+  message(FATAL_ERROR "Direct configuration not supported, please use parent directory!")
+endif()
+
+# Prepend this project's cmake/Modules directory so include() finds LibomptargetUtils and LibomptargetGetDependencies.
+set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules ${CMAKE_MODULE_PATH})
+
+if(OPENMP_STANDALONE_BUILD)
+  # Build all libraries into a common place so that tests can find them.
+  set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+  set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+endif()
+
+# Message utilities.
+include(LibomptargetUtils)
+
+# Get dependencies for the different components of the project.
+include(LibomptargetGetDependencies)
+
+# This is a list of all the targets that are supported/tested right now.
+foreach(triple
+    aarch64-unknown-linux-gnu powerpc64le-ibm-linux-gnu powerpc64-ibm-linux-gnu
+    x86_64-pc-linux-gnu nvptx64-nvidia-cuda)
+  set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} ${triple}")
+endforeach()
+
+# Starts empty: once the plugins for the different targets are validated, they
+# will be added to the list of supported targets in the current system.
+set (LIBOMPTARGET_SYSTEM_TARGETS "")
+
+# Debug builds allow dumping progress messages at runtime by default; for other
+# build types this is enabled independently via LIBOMPTARGET_ENABLE_DEBUG.
+string( TOLOWER "${CMAKE_BUILD_TYPE}" LIBOMPTARGET_CMAKE_BUILD_TYPE)
+if(LIBOMPTARGET_CMAKE_BUILD_TYPE MATCHES debug)
+  set(LIBOMPTARGET_ENABLE_DEBUG_DEFAULT ON)
+else()
+  set(LIBOMPTARGET_ENABLE_DEBUG_DEFAULT OFF)
+endif()
+option(LIBOMPTARGET_ENABLE_DEBUG "Allow debug output with the environment variable LIBOMPTARGET_DEBUG=1" ${LIBOMPTARGET_ENABLE_DEBUG_DEFAULT})
+if(LIBOMPTARGET_ENABLE_DEBUG)
+  add_definitions(-DOMPTARGET_DEBUG)
+endif()
+
+include_directories(include)  # directory-scoped, so it also applies to the subdirectories added below
+
+# Build target agnostic offloading library.
+add_subdirectory(src)
+
+# Retrieve the path to the resulting library so that it can be used for
+# testing. The omptarget target is presumably created by src/ - confirm.
+get_target_property(LIBOMPTARGET_LIBRARY_DIR omptarget LIBRARY_OUTPUT_DIRECTORY)
+if(NOT LIBOMPTARGET_LIBRARY_DIR)  # property not set: fall back to this build directory
+  set(LIBOMPTARGET_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
+endif()
+
+# Definitions for testing, for reuse when testing libomptarget-nvptx. Standalone builds expose both folders as cache entries.
+if(OPENMP_STANDALONE_BUILD)
+  set(LIBOMPTARGET_OPENMP_HEADER_FOLDER "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src" CACHE STRING
+    "Path to folder containing omp.h")
+  set(LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src" CACHE STRING
+    "Path to folder containing libomp.so")
+else()  # NOTE(review): LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER is not set on this path - confirm that is intended
+  set(LIBOMPTARGET_OPENMP_HEADER_FOLDER "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src")
+endif()
+
+
+# Build offloading plugins and device RTLs if they are available.
+add_subdirectory(plugins)
+add_subdirectory(deviceRTLs)
+
+# Add tests. This comes last, after the library, plugins and device RTLs have been added.
+add_subdirectory(test)
diff --git a/final/libomptarget/README.txt b/final/libomptarget/README.txt
new file mode 100644
index 0000000..8c0a837
--- /dev/null
+++ b/final/libomptarget/README.txt
@@ -0,0 +1,73 @@
+
+    README for the LLVM* OpenMP* Offloading Runtime Library (libomptarget)
+    ======================================================================
+
+How to Build the LLVM* OpenMP* Offloading Runtime Library (libomptarget)
+========================================================================
+In-tree build:
+
+$ cd where-you-want-to-live
+Check out openmp (libomptarget lives under ./libomptarget) into llvm/projects
+$ cd where-you-want-to-build
+$ mkdir build && cd build
+$ cmake path/to/llvm -DCMAKE_C_COMPILER=<C compiler> -DCMAKE_CXX_COMPILER=<C++ compiler>
+$ make omptarget
+
+Out-of-tree build:
+
+$ cd where-you-want-to-live
+Check out openmp (libomptarget lives under ./libomptarget)
+$ cd where-you-want-to-live/openmp/libomptarget
+$ mkdir build && cd build
+$ cmake path/to/openmp -DCMAKE_C_COMPILER=<C compiler> -DCMAKE_CXX_COMPILER=<C++ compiler>
+$ make
+
+For details about building, please look at README.rst in the parent directory.
+
+Architectures Supported
+=======================
+The current library has only been tested on the Linux operating system and the
+following host architectures:
+* Intel(R) 64 architecture
+* IBM(R) Power architecture (big endian)
+* IBM(R) Power architecture (little endian)
+* ARM(R) AArch64 architecture (little endian)
+
+The currently supported offloading device architectures are:
+* Intel(R) 64 architecture (generic 64-bit plugin - mostly for testing purposes)
+* IBM(R) Power architecture (big endian) (generic 64-bit plugin - mostly for testing purposes)
+* IBM(R) Power architecture (little endian) (generic 64-bit plugin - mostly for testing purposes)
+* ARM(R) AArch64 architecture (little endian) (generic 64-bit plugin - mostly for testing purposes)
+* CUDA(R) enabled 64-bit NVIDIA(R) GPU architectures
+
+Supported RTL Build Configurations
+==================================
+Supported Architectures: Intel(R) 64, IBM(R) Power 7 and Power 8
+
+              ---------------------------
+              |   gcc      |   clang    |
+--------------|------------|------------|
+| Linux* OS   |  Yes(1)    |  Yes(2)    |
+-----------------------------------------
+
+(1) gcc version 4.8.2 or later is supported.
+(2) clang version 3.7 or later is supported.
+
+
+Front-end Compilers that work with this RTL
+===========================================
+
+The following compilers are known to do compatible code generation for
+this RTL:
+  - clang (from https://github.com/clang-ykt )
+  - clang (development branch at http://clang.llvm.org - several features still
+    under development)
+
+-----------------------------------------------------------------------
+
+Notices
+=======
+This library and related compiler support are still under development, so the
+employed interface is likely to change in the future.
+
+*Other names and brands may be claimed as the property of others.
diff --git a/final/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake b/final/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake
new file mode 100644
index 0000000..dbf8c38
--- /dev/null
+++ b/final/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake
@@ -0,0 +1,192 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# Try to detect in the system several dependencies required by the different
+# components of libomptarget. These are the dependencies we have:
+#
+# libelf : required by some targets to handle the ELF files at runtime.
+# libffi : required to launch target kernels given function and argument 
+#          pointers.
+# CUDA : required to control offloading to NVIDIA GPUs.
+
+include (FindPackageHandleStandardArgs)
+
+################################################################################
+# Looking for libelf...
+################################################################################
+
+# Locate libelf.h in the usual system prefixes and in the directories listed in
+# the CPATH environment variable, also trying a "libelf" subdirectory of each.
+find_path(LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR
+  NAMES libelf.h
+  PATHS
+    /usr/include
+    /usr/local/include
+    /opt/local/include
+    /sw/include
+    ENV CPATH
+  PATH_SUFFIXES libelf)
+
+find_library(LIBOMPTARGET_DEP_LIBELF_LIBRARIES
+  NAMES elf
+  PATHS
+    /usr/lib
+    /usr/local/lib
+    /opt/local/lib
+    /sw/lib
+    ENV LIBRARY_PATH
+    ENV LD_LIBRARY_PATH)
+
+# Expose the header location under the plural *_INCLUDE_DIRS name and set
+# LIBOMPTARGET_DEP_LIBELF_FOUND only if both header and library were found.
+set(LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS ${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR})
+find_package_handle_standard_args(LIBOMPTARGET_DEP_LIBELF DEFAULT_MSG
+  LIBOMPTARGET_DEP_LIBELF_LIBRARIES
+  LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS)
+
+# Hide the results from the default cache view. The find_path() result
+# (*_INCLUDE_DIR) is the actual cache entry, so it is listed explicitly.
+mark_as_advanced(
+  LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR
+  LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS
+  LIBOMPTARGET_DEP_LIBELF_LIBRARIES)
+  
+################################################################################
+# Looking for libffi...
+################################################################################
+# Ask pkg-config (when available) where libffi lives. The answers are used here
+# only as HINTS for the searches below, so a missing pkg-config is not an error.
+find_package(PkgConfig)
+pkg_check_modules(LIBOMPTARGET_SEARCH_LIBFFI QUIET libffi)
+
+find_path(LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR
+  NAMES ffi.h
+  HINTS
+    ${LIBOMPTARGET_SEARCH_LIBFFI_INCLUDEDIR}
+    ${LIBOMPTARGET_SEARCH_LIBFFI_INCLUDE_DIRS}
+  PATHS
+    /usr/include
+    /usr/local/include
+    /opt/local/include
+    /sw/include
+    ENV CPATH)
+
+# Don't bother looking for the library if the header files were not found.
+if (LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR)
+  find_library(LIBOMPTARGET_DEP_LIBFFI_LIBRARIES
+    NAMES ffi
+    HINTS
+      ${LIBOMPTARGET_SEARCH_LIBFFI_LIBDIR}
+      ${LIBOMPTARGET_SEARCH_LIBFFI_LIBRARY_DIRS}
+    PATHS
+      /usr/lib
+      /usr/local/lib
+      /opt/local/lib
+      /sw/lib
+      ENV LIBRARY_PATH
+      ENV LD_LIBRARY_PATH)
+endif()
+
+# Expose the header location under the plural *_INCLUDE_DIRS name and set
+# LIBOMPTARGET_DEP_LIBFFI_FOUND only if both header and library were found.
+set(LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR})
+find_package_handle_standard_args(LIBOMPTARGET_DEP_LIBFFI DEFAULT_MSG
+  LIBOMPTARGET_DEP_LIBFFI_LIBRARIES
+  LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS)
+
+# Hide the results from the default cache view. The find_path() result
+# (*_INCLUDE_DIR) is the actual cache entry, so it is listed explicitly.
+mark_as_advanced(
+  LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR
+  LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS
+  LIBOMPTARGET_DEP_LIBFFI_LIBRARIES)
+  
+################################################################################
+# Looking for CUDA...
+################################################################################
+# Record whether CUDA_TOOLKIT_ROOT_DIR already had a value before FindCUDA runs;
+# the libdevice lookup further down must not override such a value.
+if (CUDA_TOOLKIT_ROOT_DIR)
+  set(LIBOMPTARGET_CUDA_TOOLKIT_ROOT_DIR_PRESET TRUE)
+endif()
+find_package(CUDA QUIET)
+
+# Re-export the FindCUDA results under libomptarget's own variable names.
+set(LIBOMPTARGET_DEP_CUDA_FOUND ${CUDA_FOUND})
+set(LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS})
+mark_as_advanced(LIBOMPTARGET_DEP_CUDA_FOUND LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS)
+
+################################################################################
+# Looking for CUDA Driver API... (needed for CUDA plugin)
+################################################################################
+
+# Look for the CUDA driver library (libcuda) in the default search locations
+# and in /lib64.
+find_library(LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES
+  NAMES cuda
+  PATHS /lib64)
+
+# There is a libcuda.so in lib64/stubs that can be used for linking.
+if (CUDA_FOUND AND NOT LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES)
+  # Since CMake 3.3 FindCUDA.cmake defaults to using static libraries. In this
+  # case CUDA_LIBRARIES contains additional linker arguments which breaks
+  # get_filename_component below. Fortunately, since that change the module
+  # exports CUDA_cudart_static_LIBRARY which points to a single file in the
+  # right directory.
+  if (DEFINED CUDA_cudart_static_LIBRARY)
+    set(cuda_library ${CUDA_cudart_static_LIBRARY})
+  else()
+    set(cuda_library ${CUDA_LIBRARIES})
+  endif()
+  get_filename_component(CUDA_LIBDIR ${cuda_library} DIRECTORY)
+
+  # The first search left the variable as NOTFOUND, so find_library() searches
+  # again here, this time with the toolkit's stubs directory as a hint.
+  find_library(LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES
+    NAMES cuda
+    HINTS "${CUDA_LIBDIR}/stubs")
+endif()
+
+# Report the outcome and set LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND accordingly.
+find_package_handle_standard_args(LIBOMPTARGET_DEP_CUDA_DRIVER DEFAULT_MSG
+  LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES)
+
+# Keep the cache entry out of the default (non-advanced) cache view.
+mark_as_advanced(LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES)
+
+################################################################################
+# Looking for CUDA libdevice subdirectory
+#
+# Special case for Debian/Ubuntu to have nvidia-cuda-toolkit work
+# out of the box. More info on http://bugs.debian.org/882505
+################################################################################
+
+set(LIBOMPTARGET_CUDA_LIBDEVICE_SUBDIR nvvm/libdevice)
+
+# Don't alter CUDA_TOOLKIT_ROOT_DIR if the user specified it, if a value was
+# already cached for it, or if it already has libdevice.  Otherwise, on
+# Debian/Ubuntu, look where the nvidia-cuda-toolkit package normally installs
+# libdevice.
+if (NOT LIBOMPTARGET_CUDA_TOOLKIT_ROOT_DIR_PRESET AND
+    NOT EXISTS
+      "${CUDA_TOOLKIT_ROOT_DIR}/${LIBOMPTARGET_CUDA_LIBDEVICE_SUBDIR}")
+  find_program(LSB_RELEASE lsb_release)
+  if (LSB_RELEASE)
+    execute_process(COMMAND ${LSB_RELEASE} -is
+      OUTPUT_VARIABLE LSB_RELEASE_ID
+      OUTPUT_STRIP_TRAILING_WHITESPACE)
+    set(candidate_dir /usr/lib/cuda)
+    if ((LSB_RELEASE_ID STREQUAL "Debian" OR LSB_RELEASE_ID STREQUAL "Ubuntu")
+        AND EXISTS "${candidate_dir}/${LIBOMPTARGET_CUDA_LIBDEVICE_SUBDIR}")
+      set(CUDA_TOOLKIT_ROOT_DIR "${candidate_dir}" CACHE PATH
+          "Toolkit location." FORCE)
+    endif()
+  endif()
+endif()
diff --git a/final/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake b/final/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake
new file mode 100644
index 0000000..f03567a
--- /dev/null
+++ b/final/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake
@@ -0,0 +1,111 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# Pick the CUDA-to-bitcode compiler and the bitcode linker: prefer the ones given
+# by the user, else the Clang used to build libomptarget, else give up.
+set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED FALSE)
+
+# Quote the operands so an unset or empty variable cannot break the if().
+if (NOT "${LIBOMPTARGET_NVPTX_CUDA_COMPILER}" STREQUAL "")
+  set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${LIBOMPTARGET_NVPTX_CUDA_COMPILER})
+elseif ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")
+  set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER})
+else()
+  return()
+endif()
+
+# Get compiler directory to try to locate a suitable linker.
+get_filename_component(compiler_dir ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} DIRECTORY)
+set(llvm_link "${compiler_dir}/llvm-link")
+if (NOT "${LIBOMPTARGET_NVPTX_BC_LINKER}" STREQUAL "")
+  set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${LIBOMPTARGET_NVPTX_BC_LINKER})
+elseif (EXISTS "${llvm_link}")
+  # Use llvm-link from the compiler directory.
+  set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER "${llvm_link}")
+else()
+  return()
+endif()
+
+# try_compile_bitcode(<output> <source> [<flag>...])
+# Writes <source> to a scratch .cu file, compiles it with the selected CUDA
+# compiler plus the given flags, and sets <output> in the caller's scope to
+# TRUE when the compiler exits with status 0 and to FALSE otherwise.
+function(try_compile_bitcode output source)
+  set(scratch_dir ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY})
+  file(WRITE ${scratch_dir}/src.cu "${source}\n")
+
+  # FIXME: Don't hardcode GPU version. This is currently required because
+  #        Clang refuses to compile its default of sm_20 with CUDA 9.
+  execute_process(
+    COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${ARGN}
+      --cuda-gpu-arch=sm_35 -c ${scratch_dir}/src.cu -o ${scratch_dir}/out.bc
+    RESULT_VARIABLE exit_status OUTPUT_QUIET ERROR_QUIET)
+  set(${output} FALSE PARENT_SCOPE)
+  if (exit_status EQUAL 0)
+    set(${output} TRUE PARENT_SCOPE)
+  endif()
+endfunction()
+
+# Discard cached check results when they were obtained with a different compiler.
+if (DEFINED LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER AND
+    NOT "${LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER}" STREQUAL "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}")
+  set(discard_cached TRUE)
+else()
+  set(discard_cached FALSE)
+endif()
+set(LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}" CACHE INTERNAL "" FORCE)
+
+# Runs try_compile_bitcode() with the flags in ARGN unless a cached <output> is still valid.
+function(check_bitcode_compilation output source)
+  if (${discard_cached} OR NOT DEFINED ${output})
+    message(STATUS "Performing Test ${output}")
+    try_compile_bitcode(probe_ok "${source}" ${ARGN})
+    set(${output} ${probe_ok} CACHE INTERNAL "" FORCE)
+    set(verdict "Failed")
+    if (probe_ok)
+      set(verdict "Success")
+    endif()
+    message(STATUS "Performing Test ${output} - ${verdict}")
+  endif()
+endfunction()
+
+# These flags are required to emit LLVM Bitcode. We check them together because
+# if any of them are not supported, there is no point in finding out which are.
+set(compiler_flags_required -emit-llvm -O1 --cuda-device-only --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
+set(compiler_flags_required_src "extern \"C\" __device__ int thread() { return threadIdx.x; }")
+check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED "${compiler_flags_required_src}" ${compiler_flags_required})
+
+# It makes no sense to continue given that the compiler doesn't support
+# emitting basic LLVM Bitcode
+if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED)
+  return()
+endif()
+
+set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS ${compiler_flags_required})
+
+# Declaring external shared device variables might need an additional flag
+# since Clang 7.0 and was entirely unsupported since version 4.0.
+set(extern_device_shared_src "extern __device__ __shared__ int test;")
+
+check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED "${extern_device_shared_src}" ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS})
+if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED)
+  set(compiler_flag_fcuda_rdc -fcuda-rdc)
+  set(compiler_flag_fcuda_rdc_full ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} ${compiler_flag_fcuda_rdc})
+  check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC "${extern_device_shared_src}" ${compiler_flag_fcuda_rdc_full})
+
+  if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC)
+    return()
+  endif()
+
+  set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS "${compiler_flag_fcuda_rdc_full}")
+endif()
+
+# We can compile LLVM Bitcode from CUDA source code!
+set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED TRUE)
diff --git a/final/libomptarget/cmake/Modules/LibomptargetUtils.cmake b/final/libomptarget/cmake/Modules/LibomptargetUtils.cmake
new file mode 100644
index 0000000..7339cc0
--- /dev/null
+++ b/final/libomptarget/cmake/Modules/LibomptargetUtils.cmake
@@ -0,0 +1,27 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# libomptarget_say(<message_to_user>): prints the message at STATUS level with a
+# "LIBOMPTARGET: " prefix. A function, so the text is not evaluated a second time.
+function(libomptarget_say message_to_user)
+  message(STATUS "LIBOMPTARGET: ${message_to_user}")
+endfunction()
+
+# libomptarget_warning_say(<message_to_user>): prints the message as a WARNING with
+# a "LIBOMPTARGET: " prefix. A function, so the text is not evaluated a second time.
+function(libomptarget_warning_say message_to_user)
+  message(WARNING "LIBOMPTARGET: ${message_to_user}")
+endfunction()
+
+# libomptarget_error_say(<message_to_user>): prints the message as a FATAL_ERROR
+# (which stops cmake) with a "LIBOMPTARGET: " prefix. A function, not a macro.
+function(libomptarget_error_say message_to_user)
+  message(FATAL_ERROR "LIBOMPTARGET: ${message_to_user}")
+endfunction()
diff --git a/final/libomptarget/deviceRTLs/CMakeLists.txt b/final/libomptarget/deviceRTLs/CMakeLists.txt
new file mode 100644
index 0000000..9723fb8
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/CMakeLists.txt
@@ -0,0 +1,13 @@
+##===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# ##===----------------------------------------------------------------------===##
+#
+# Build a device RTL for each available machine.
+#
+##===----------------------------------------------------------------------===##
+
+add_subdirectory(nvptx)
diff --git a/final/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/final/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
new file mode 100644
index 0000000..c20339c
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -0,0 +1,185 @@
+##===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+#
+# Build the NVPTX (CUDA) Device RTL if the CUDA tools are available
+#
+##===----------------------------------------------------------------------===##
+
+set(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER "" CACHE STRING
+  "Path to alternate NVCC host compiler to be used by the NVPTX device RTL.")
+if(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER)
+  find_program(ALTERNATE_CUDA_HOST_COMPILER NAMES ${LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER})
+  if(NOT ALTERNATE_CUDA_HOST_COMPILER)
+    libomptarget_say("Not building CUDA offloading device RTL: invalid NVPTX alternate host compiler.")
+    return()  # Never force a -NOTFOUND value into CUDA_HOST_COMPILER.
+  endif()
+  set(CUDA_HOST_COMPILER ${ALTERNATE_CUDA_HOST_COMPILER} CACHE FILEPATH "" FORCE)
+endif()
+
+# We can't use clang as nvcc host preprocessor, so we attempt to replace it with
+# a gcc located by find_program().
+if(CUDA_HOST_COMPILER MATCHES clang)
+  find_program(LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER NAMES gcc)
+  if(NOT LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER)
+    # No replacement is available: tell the user how to provide one and stop
+    # processing this directory.
+    libomptarget_say("Not building CUDA offloading device RTL: clang is not supported as NVCC host compiler.")
+    libomptarget_say("Please include gcc in your path or set LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER to the full path of a valid compiler.")
+    return()
+  endif()
+  set(CUDA_HOST_COMPILER "${LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER}" CACHE FILEPATH "" FORCE)
+endif()
+
+if(LIBOMPTARGET_DEP_CUDA_FOUND)
+  libomptarget_say("Building CUDA offloading device RTL.")
+
+  # We really don't have any host code, so we don't need to care about
+  # propagating host flags.
+  set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+
+  set(cuda_src_files
+      src/cancel.cu
+      src/critical.cu
+      src/data_sharing.cu
+      src/libcall.cu
+      src/loop.cu
+      src/omptarget-nvptx.cu
+      src/parallel.cu
+      src/reduction.cu
+      src/sync.cu
+      src/task.cu
+  )
+
+  set(omp_data_objects src/omp_data.cu)
+
+  # Get the compute capabilities the user requested, or use SM_35 (clang's default).
+  set(default_capabilities 35)
+  if (DEFINED LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY)
+    set(default_capabilities ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY})
+    libomptarget_warning_say("LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY is deprecated, please use LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES")
+  endif()
+  set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${default_capabilities} CACHE STRING
+    "List of CUDA Compute Capabilities to be used to compile the NVPTX device RTL.")
+  # Accept "35,60" as well as "35;60". The input is quoted because string(REPLACE)
+  # concatenates multiple inputs, which would fuse an unquoted ;-list into "3560".
+  string(REPLACE "," ";" nvptx_sm_list "${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES}")
+  foreach(sm ${nvptx_sm_list})
+    set(CUDA_ARCH ${CUDA_ARCH} -gencode arch=compute_${sm},code=sm_${sm})
+  endforeach()
+
+  # Activate RTL message dumps if requested by the user.
+  set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL
+    "Activate NVPTX device RTL debug messages.")
+  if(${LIBOMPTARGET_NVPTX_DEBUG})
+    set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1 -g --ptxas-options=-v)
+  endif()
+
+  # NVPTX runtime library has to be statically linked. Dynamic linking is not
+  # yet supported by the CUDA toolchain on the device.
+  set(BUILD_SHARED_LIBS OFF)
+  set(CUDA_SEPARABLE_COMPILATION ON)
+
+  cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} ${omp_data_objects}
+      OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG})
+
+  # Install device RTL under the lib destination folder.
+  install(TARGETS omptarget-nvptx ARCHIVE DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+
+  target_link_libraries(omptarget-nvptx ${CUDA_LIBRARIES})
+
+
+  # Check if we can create an LLVM bitcode implementation of the runtime library
+  # that could be inlined in the user application. For that we need to find
+  # a Clang compiler capable of compiling our CUDA files to LLVM bitcode and
+  # an LLVM linker.
+  set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING
+    "Location of a CUDA compiler capable of emitting LLVM bitcode.")
+  set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING
+    "Location of a linker capable of linking LLVM bitcode objects.")
+
+  include(LibomptargetNVPTXBitcodeLibrary)
+
+  set(bclib_default FALSE)
+  if (${LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED})
+    set(bclib_default TRUE)
+  endif()
+  set(LIBOMPTARGET_NVPTX_ENABLE_BCLIB ${bclib_default} CACHE BOOL
+    "Enable CUDA LLVM bitcode offloading device RTL.")
+  if (${LIBOMPTARGET_NVPTX_ENABLE_BCLIB})
+    if (NOT ${LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED})
+      libomptarget_error_say("Cannot build CUDA LLVM bitcode offloading device RTL!")
+    endif()
+    libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.")
+
+    # Set flags for LLVM Bitcode compilation.
+    set(bc_flags ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS})
+    if(${LIBOMPTARGET_NVPTX_DEBUG})
+      set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=-1)
+    else()
+      set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=0)
+    endif()
+
+    # CUDA 9 header files use the nv_weak attribute which clang is not yet prepared
+    # to handle. Therefore, we use 'weak' instead. We are compiling only for the
+    # device, so it should be equivalent.
+    if(CUDA_VERSION_MAJOR GREATER 8)
+      set(bc_flags ${bc_flags} -Dnv_weak=weak)
+    endif()
+
+    # Create target to build all Bitcode libraries.
+    add_custom_target(omptarget-nvptx-bc)
+
+    # Generate a Bitcode library for all the compute capabilities the user requested.
+    foreach(sm ${nvptx_sm_list})
+      set(cuda_arch --cuda-gpu-arch=sm_${sm})
+
+      # Compile CUDA files to bitcode.
+      set(bc_files "")
+      foreach(src ${cuda_src_files})
+        get_filename_component(infile ${src} ABSOLUTE)
+        get_filename_component(outfile ${src} NAME)
+
+        add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc
+          COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch}
+            -c ${infile} -o ${outfile}-sm_${sm}.bc
+          DEPENDS ${infile}
+          IMPLICIT_DEPENDS CXX ${infile}
+          COMMENT "Building LLVM bitcode ${outfile}-sm_${sm}.bc"
+          VERBATIM
+        )
+        set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc)
+
+        list(APPEND bc_files ${outfile}-sm_${sm}.bc)
+      endforeach()
+
+      # Link the per-source bitcode files into one library for this compute capability.
+      add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc
+          COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER}
+            -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc ${bc_files}
+          DEPENDS ${bc_files}
+          COMMENT "Linking LLVM bitcode libomptarget-nvptx-sm_${sm}.bc"
+          VERBATIM)
+      set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx-sm_${sm}.bc)
+
+      add_custom_target(omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc)
+      add_dependencies(omptarget-nvptx-bc omptarget-nvptx-${sm}-bc)
+
+      # Copy library to destination (the directory holding the omptarget-nvptx archive).
+      add_custom_command(TARGET omptarget-nvptx-${sm}-bc POST_BUILD
+                         COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc
+                         $<TARGET_FILE_DIR:omptarget-nvptx> VERBATIM)
+
+      # Install bitcode library under the lib destination folder.
+      install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+    endforeach()
+  endif()
+
+  add_subdirectory(test)
+else()
+  libomptarget_say("Not building CUDA offloading device RTL: CUDA tools not found in the system.")
+endif()
diff --git a/final/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt b/final/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt
new file mode 100644
index 0000000..989a01f
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt
@@ -0,0 +1,523 @@
+
+**Design document for OpenMP reductions on the GPU** 
+
+//Abstract: //In this document we summarize the new design for an OpenMP
+implementation of reductions on NVIDIA GPUs.  This document comprises
+* a succinct background review,
+* an introduction to the decoupling of reduction algorithm and
+    data-structure-specific processing routines,
+* detailed illustrations of reduction algorithms used and
+* a brief overview of steps we have made beyond the last implementation.
+
+**Problem Review**
+
+Consider a typical OpenMP program with reduction pragma.
+
+```
+    double foo, bar;
+    #pragma omp parallel for reduction(+:foo, bar)
+    for (int i = 0; i < N; i++) {
+      foo+=A[i]; bar+=B[i];
+    }
+```
+where 'foo' and 'bar' are reduced across all threads in the parallel region.
+Our primary goal is to efficiently aggregate the values of foo and bar in
+such manner that
+* makes the compiler logically concise.
+* efficiently reduces within warps, threads, blocks and the device.
+
+**Introduction to Decoupling**
+In this section we address the problem of making the compiler
+//logically concise// by partitioning the task of reduction into two broad
+categories: data-structure specific routines and algorithmic routines.
+
+The previous reduction implementation was highly coupled with
+the specificity of the reduction element data structures (e.g., sizes, data
+types) and operators of the reduction (e.g., addition, multiplication). In
+our implementation we strive to decouple them. In our final implementations,
+we could remove all template functions in our runtime system.
+
+The (simplified) pseudo code generated by LLVM is as follows:
+
+```
+    1. Create private copies of variables: foo_p, bar_p
+    2. Each thread reduces the chunk of A and B assigned to it and writes
+       to foo_p and bar_p respectively.
+    3. ret = kmpc_nvptx_reduce_nowait(..., reduceData, shuffleReduceFn, 
+               interWarpCpyFn)
+        where:
+        struct ReduceData {
+          double *foo;
+          double *bar;
+        } reduceData
+        reduceData.foo = &foo_p
+        reduceData.bar = &bar_p
+
+        shuffleReduceFn and interWarpCpyFn are two auxiliary functions
+        generated to aid the runtime performing algorithmic steps
+        while being data-structure agnostic about ReduceData.
+
+        In particular, shuffleReduceFn is a function that takes the following
+        inputs:
+        a. local copy of ReduceData
+        b. its lane_id
+        c. the offset of the lane_id which hosts a remote ReduceData
+                relative to the current one
+        d. an algorithm version parameter determining which reduction
+                algorithm to use.
+        This shuffleReduceFn retrieves the remote ReduceData through shuffle
+        intrinsics and reduces, using the algorithm specified by the 4th
+        parameter, the local ReduceData with the remote ReduceData element-wise,
+        and places the resultant values into the local ReduceData.
+
+        Different reduction algorithms are implemented with different runtime
+        functions, but they all make calls to this same shuffleReduceFn to
+        perform the essential reduction step. Therefore, based on the 4th
+        parameter, this shuffleReduceFn will behave slightly differently to
+        cooperate with the runtime function to ensure correctness under
+        different circumstances.
+
+        InterWarpCpyFn, as the name suggests, is a function that copies data
+        across warps. Its function is to tunnel all the thread private
+        ReduceData that is already reduced within a warp to a lane in the first
+        warp with minimal shared memory footprint. This is an essential step to
+        prepare for the last step of a block reduction.
+
+        (Warp, block, device level reduction routines that utilize these
+        auxiliary functions will be discussed in the next section.)
+
+    4. if ret == 1:
+        The master thread stores the reduced result in the globals.
+        foo += reduceData.foo; bar += reduceData.bar
+```
+
+**Reduction Algorithms**
+
+On the warp level, we have three versions of the algorithms:
+
+1. Full Warp Reduction
+
+```
+gpu_regular_warp_reduce(void *reduce_data,
+                        kmp_ShuffleReductFctPtr ShuffleReduceFn) {
+  for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
+    ShuffleReduceFn(reduce_data, 0, offset, 0);
+}
+```
+ShuffleReduceFn is called here with lane_id set to 0 because this version does
+not use it; we therefore save instructions by not retrieving lane_id from the
+corresponding special registers. The 4th parameter, which represents the version
+of the algorithm being used, is set to 0 to signify full warp reduction.
+
+In this version specified (=0), the ShuffleReduceFn behaves, per element, as
+follows:
+
+```
+//reduce_elem refers to an element in the local ReduceData
+//remote_elem is retrieved from a remote lane
+remote_elem = shuffle_down(reduce_elem, offset, 32);
+reduce_elem = reduce_elem @ remote_elem;
+
+```
+
+An illustration of this algorithm operating on a hypothetical 8-lane full-warp
+would be:
+{F74}
+The coloring invariant is that elements with the same color will be
+combined and reduced in the next reduction step. As can be observed, no overhead
+is present: exactly log2(N) steps are needed for N lanes.
+
+2. Contiguous Partial Warp Reduction
+```
+gpu_irregular_warp_reduce(void *reduce_data,
+                          kmp_ShuffleReductFctPtr ShuffleReduceFn, int size,
+                          int lane_id) {
+  int curr_size;
+  int offset;
+    curr_size = size;
+    offset = curr_size/2;
+    while (offset>0) {
+      ShuffleReduceFn(reduce_data, lane_id, offset, 1);
+      curr_size = (curr_size+1)/2;
+      offset = curr_size/2;
+    }
+}
+```
+
+In this version specified (=1), the ShuffleReduceFn behaves, per element, as
+follows:
+```
+//reduce_elem refers to an element in the local ReduceData
+//remote_elem is retrieved from a remote lane
+remote_elem = shuffle_down(reduce_elem, offset, 32);
+if (lane_id < offset) {
+    reduce_elem = reduce_elem @ remote_elem
+} else {
+    reduce_elem = remote_elem
+}
+```
+
+An important invariant (also a restriction on the starting state of the
+reduction) is that this algorithm assumes that all unused ReduceData are
+located in a contiguous subset of threads in a warp starting from lane 0.
+
+With the presence of a trailing active lane with an odd-numbered lane
+id, its value will not be aggregated with any other lane. Therefore,
+in order to preserve the invariant, such ReduceData is copied to the first lane
+whose thread-local ReduceData has already been used in a previous reduction
+and would therefore be useless otherwise.
+
+An illustration of this algorithm operating on a hypothetical 8-lane partial
+warp would be:
+{F75}
+
+As illustrated, this version of the algorithm introduces overhead whenever
+we have odd number of participating lanes in any reduction step to
+copy data between lanes.
+
+3. Dispersed Partial Warp Reduction
+```
+gpu_irregular_simt_reduce(void *reduce_data,
+                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
+  int size, remote_id;
+  int logical_lane_id = find_number_of_dispersed_active_lanes_before_me() * 2;
+  do {
+      remote_id = find_the_next_active_lane_id_right_after_me();
+      // the above function returns 0 if no active lane
+      // is present right after the current thread.
+      size = get_number_of_active_lanes_in_this_warp();
+      logical_lane_id /= 2;
+      ShuffleReduceFn(reduce_data, logical_lane_id, remote_id-1-threadIdx.x, 2);
+  } while (logical_lane_id % 2 == 0 && size > 1);
+```
+
+There is no assumption made about the initial state of the reduction.
+Any number of lanes (>=1) could be active at any position. The reduction
+result is kept in the first active lane.
+
+In this version specified (=2), the ShuffleReduceFn behaves, per element, as
+follows:
+```
+//reduce_elem refers to an element in the local ReduceData
+//remote_elem is retrieved from a remote lane
+remote_elem = shuffle_down(reduce_elem, offset, 32);
+if (LaneId % 2 == 0 && Offset > 0) {
+    reduce_elem = reduce_elem @ remote_elem
+} else {
+    reduce_elem = remote_elem
+}
+```
+We proceed with a brief explanation of some of the arguments passed in.
+Note that, in this section, we introduce the concept of
+logical_lane_id, and it is important to distinguish it from the
+physical lane_id as defined by NVIDIA.
+1. //logical_lane_id//: as the name suggests, it refers to the calculated
+    lane_id (instead of the physical one defined by nvidia) that would make
+    our algorithm logically concise. A thread with logical_lane_id k means
+    there are (k-1) threads before it.
+2. //remote_id-1-threadIdx.x//: remote_id is indeed the nvidia-defined lane
+    id of the remote lane from which we will retrieve the ReduceData. We
+    subtract (threadIdx+1) from it because we would like to maintain only one
+    underlying shuffle intrinsic (which is used to communicate among lanes in a
+    warp). This particular version of shuffle intrinsic we take accepts only
+    offsets, instead of absolute lane_id. Therefore the subtraction is performed
+    on the absolute lane_id we calculated to obtain the offset.
+
+This algorithm is slightly different in 2 ways and it is not, conceptually, a
+generalization of the above algorithms.
+1. It reduces elements close to each other. For instance, values in the 0th lane
+    are to be combined with those of the 1st lane; values in the 2nd lane are to
+    be combined with those of the 3rd lane. We did not use the previous algorithm
+    where the first half of the (partial) warp is reduced with the second half
+    of the (partial) warp. This is because, the mapping
+    f(x): logical_lane_id -> physical_lane_id;
+    can be easily calculated whereas its inverse
+    f^-1(x): physical_lane_id -> logical_lane_id
+    cannot and performing such reduction requires the inverse to be known.
+2. Because this algorithm is agnostic about the positions of the lanes that are
+    active, we do not need to perform the copying step as in the second
+    algorithm.
+An illustrative run would look like
+{F76}
+As observed, overhead is high because in each and every step of reduction,
+logical_lane_id is recalculated; so is the remote_id.
+
+On a block level, we have implemented the following block reduce algorithm:
+
+```
+gpu_irregular_block_reduce(void *reduce_data,
+              kmp_ShuffleReductFctPtr shuflReduceFn,
+              kmp_InterWarpCopyFctPtr interWarpCpyFn,
+              int size) {
+
+  int wid = threadIdx.x/WARPSIZE;
+  int lane_id = threadIdx.x%WARPSIZE;
+
+  int warp_needed = (size+WARPSIZE-1)/WARPSIZE; //ceiling of division
+
+  unsigned tnum = __ballot(1);
+  int thread_num = __popc(tnum);
+
+    //full warp reduction
+    if (thread_num == WARPSIZE) {
+      gpu_regular_warp_reduce(reduce_data, shuflReduceFn);
+    }
+    //partial warp reduction
+    if (thread_num < WARPSIZE) {
+        gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, thread_num,
+                                  lane_id);
+    }
+    //Gather all the reduced values from each warp
+    //to the first warp
+    //named_barrier inside this function to ensure
+    //correctness. It is effectively a sync_thread
+    //that won't deadlock.
+    interWarpCpyFn(reduce_data, warp_needed);
+
+    //This is to reduce data gathered from each "warp master".
+    if (wid==0) {
+        gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, warp_needed,
+                                  lane_id);
+    }
+
+  return;
+}
+```
+In this function, no ShuffleReduceFn is directly called as it makes calls
+to various versions of the warp-reduction functions. It first reduces
+ReduceData warp by warp; in the end, we end up with the number of
+ReduceData equal to the number of warps present in this thread
+block. We then proceed to gather all such ReduceData to the first warp.
+
+As observed, in this algorithm we make use of the function InterWarpCpyFn,
+which copies data from each of the "warp masters" (0th lane of each warp, where
+a warp-reduced ReduceData is held) to the 0th warp. This step reduces (in a
+mathematical sense) the problem of reduction across warp masters in a block to
+the problem of warp reduction which we already have solutions to.
+
+We can thus completely avoid the use of atomics to reduce in a threadblock.
+
+**Efficient Cross Block Reduce**
+
+The next challenge is to reduce values across threadblocks.  We aim to do this
+without atomics or critical sections.
+
+Let a kernel be started with TB threadblocks.
+Let the GPU have S SMs.
+There can be at most N active threadblocks per SM at any time.
+
+Consider a threadblock tb (tb < TB) running on SM s (s < S).  'tb' is one of
+at most 'N' active threadblocks on SM s.  Let each threadblock active on an SM
+be given an instance identifier id (0 <= id < N).  Therefore, the tuple (s, id)
+uniquely identifies an active threadblock on the GPU.
+
+To efficiently implement cross block reduce, we first allocate an array for
+each value to be reduced of size S*N (which is the maximum number of active
+threadblocks at any time on the device).
+
+Each threadblock reduces its value to slot [s][id].  This can be done without
+locking since no other threadblock can write to the same slot concurrently.
+
+As a final stage, we reduce the values in the array as follows:
+
+```
+// Compiler generated wrapper function for each target region with a reduction
+clause.
+target_function_wrapper(map_args, reduction_array)  <--- start with 1 team and 1
+   thread.
+  // Use dynamic parallelism to launch M teams, N threads as requested by the
+  user to execute the target region.
+
+  target_function<<M, N>>(map_args)
+
+  Reduce values in reduction_array
+
+```
+
+**Comparison with Last Version**
+
+
+The (simplified) pseudo code generated by LLVM on the host is as follows:
+
+
+```
+    1. Create private copies of variables: foo_p, bar_p
+    2. Each thread reduces the chunk of A and B assigned to it and writes
+       to foo_p and bar_p respectively.
+    3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock)
+        where:
+        struct ReduceData {
+          double *foo;
+          double *bar;
+        } reduceData
+        reduceData.foo = &foo_p
+        reduceData.bar = &bar_p
+
+        reduceFn is a pointer to a function that takes in two inputs
+        of type ReduceData, "reduces" them element wise, and places the
+        result in the first input:
+        reduceFn(ReduceData *a, ReduceData *b)
+          a = a @ b
+
+        Every thread in the parallel region calls kmpc_reduce_nowait with
+        its private copy of reduceData.  The runtime reduces across the
+        threads (using tree reduction on the operator 'reduceFn') and stores
+        the final result in the master thread if successful.
+    4. if ret == 1:
+        The master thread stores the reduced result in the globals.
+        foo += reduceData.foo; bar += reduceData.bar
+    5. else if ret == 2:
+        In this case kmpc_reduce_nowait() could not use tree reduction,
+        so use atomics instead:
+        each thread atomically writes to foo
+        each thread atomically writes to bar
+```
+
+On a GPU, a similar reduction may need to be performed across SIMT threads,
+warps, and threadblocks.  The challenge is to do so efficiently in a fashion
+that is compatible with the LLVM OpenMP implementation.
+
+In the previously released 0.1 version of the LLVM OpenMP compiler for GPUs,
+the salient steps of the code generated are as follows:
+
+
+```
+    1. Create private copies of variables: foo_p, bar_p
+    2. Each thread reduces the chunk of A and B assigned to it and writes
+       to foo_p and bar_p respectively.
+    3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock)
+        status = can_block_reduce()
+        if status == 1:
+          reduce efficiently to thread 0 using shuffles and shared memory.
+          return 1
+        else
+          cannot use efficient block reduction, fallback to atomics
+          return 2
+    4. if ret == 1:
+        The master thread stores the reduced result in the globals.
+        foo += reduceData.foo; bar += reduceData.bar
+    5. else if ret == 2:
+        In this case kmpc_reduce_nowait() could not use tree reduction,
+        so use atomics instead:
+        each thread atomically writes to foo
+        each thread atomically writes to bar
+```
+
+The function can_block_reduce() is defined as follows:
+
+
+```
+int32_t can_block_reduce() {
+  int tid = GetThreadIdInTeam();
+  int nt = GetNumberOfOmpThreads(tid);
+  if (nt != blockDim.x)
+    return 0;
+  unsigned tnum = __ballot(1);
+  if (tnum != (~0x0)) {
+    return 0;
+  }
+  return 1;
+}
+```
+
+This function permits the use of the efficient block reduction algorithm
+using shuffles and shared memory (return 1) only if (a) all SIMT threads in
+a warp are active (i.e., number of threads in the parallel region is a
+multiple of 32) and (b) the number of threads in the parallel region
+(set by the num_threads clause) equals blockDim.x.
+
+If either of these preconditions is not true, each thread in the threadblock
+updates the global value using atomics.
+
+Atomics and compare-and-swap operations are expensive on many-threaded
+architectures such as GPUs and we must avoid them completely.
+
+
+**Appendix: Implementation Details**
+
+
+```
+// Compiler generated function.
+reduceFn(ReduceData *a, ReduceData *b)
+  a->foo = a->foo + b->foo
+  a->bar = a->bar + b->bar
+
+// Compiler generated function.
+swapAndReduceFn(ReduceData *thread_private, int lane)
+  ReduceData *remote = new ReduceData()
+  remote->foo = shuffle_double(thread_private->foo, lane)
+  remote->bar = shuffle_double(thread_private->bar, lane)
+  reduceFn(thread_private, remote)
+
+// OMP runtime function.
+warpReduce_regular(ReduceData *thread_private, Fn *swapAndReduceFn):
+  offset = 16
+  while (offset > 0)
+    swapAndReduceFn(thread_private, offset)
+    offset /= 2
+
+// OMP runtime function.
+warpReduce_irregular():
+  ...
+
+// OMP runtime function.
+kmpc_reduce_warp(reduceData, swapAndReduceFn)
+  if all_lanes_active:
+    warpReduce_regular(reduceData, swapAndReduceFn)
+  else:
+    warpReduce_irregular(reduceData, swapAndReduceFn)
+  if in_simd_region:
+    // all done, reduce to global in simd lane 0
+    return 1
+  else if in_parallel_region:
+    // done reducing to one value per warp, now reduce across warps
+    return 3
+
+// OMP runtime function; one for each basic type.
+kmpc_reduce_block_double(double *a)
+  if lane == 0:
+    shared[wid] = *a
+  named_barrier(1, num_threads)
+  if wid == 0
+    block_reduce(shared)
+  if lane == 0
+    *a = shared[0]
+  named_barrier(1, num_threads)
+  if wid == 0 and lane == 0
+    return 1  // write back reduced result
+  else
+    return 0  // don't do anything
+
+```
+
+
+
+```
+// Compiler generated code.
+    1. Create private copies of variables: foo_p, bar_p
+    2. Each thread reduces the chunk of A and B assigned to it and writes
+       to foo_p and bar_p respectively.
+    3. ret = kmpc_reduce_warp(reduceData, swapAndReduceFn)
+    4. if ret == 1:
+        The master thread stores the reduced result in the globals.
+        foo += reduceData.foo; bar += reduceData.bar
+    5. else if ret == 3:
+        ret = kmpc_reduce_block_double(reduceData.foo)
+        if ret == 1:
+          foo += reduceData.foo
+        ret = kmpc_reduce_block_double(reduceData.bar)
+        if ret == 1:
+          bar += reduceData.bar
+```
+
+**Notes**
+
+    1. This scheme requires that the CUDA OMP runtime can call llvm generated
+       functions. This functionality now works.
+    2. If the user inlines the CUDA OMP runtime bitcode, all of the machinery
+       (including calls through function pointers) is optimized away.
+    3. If we are reducing multiple variables in a parallel region,
+       the reduce operations are all performed in warpReduce_[ir]regular(). This
+       results in more instructions in the loop and should result in fewer
+       stalls due to data dependencies.  Unfortunately we cannot do the same in
+       kmpc_reduce_block_double() without increasing shared memory usage.
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/cancel.cu b/final/libomptarget/deviceRTLs/nvptx/src/cancel.cu
new file mode 100644
index 0000000..93fc5da
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/cancel.cu
@@ -0,0 +1,27 @@
+//===------ cancel.cu - NVPTX OpenMP cancel interface ------------ CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Interface to be used in the implementation of OpenMP cancel.
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget-nvptx.h"
+
+EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid,
+                                        int32_t cancelVal) {
+  PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n",
+        static_cast<int>(cancelVal));
+  return FALSE; // Cancellation is disabled: unconditionally returns FALSE.
+}
+
+EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid,
+                             int32_t cancelVal) {
+  PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n",
+        static_cast<int>(cancelVal));
+  return FALSE; // Cancellation is disabled: unconditionally returns FALSE.
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/critical.cu b/final/libomptarget/deviceRTLs/nvptx/src/critical.cu
new file mode 100644
index 0000000..2eb94f5
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/critical.cu
@@ -0,0 +1,29 @@
+//===------ critical.cu - NVPTX OpenMP critical ------------------ CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of critical with KMPC interface
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdio.h>
+
+#include "omptarget-nvptx.h"
+
+EXTERN void __kmpc_critical(kmp_Ident *loc, int32_t global_tid,
+                            kmp_CriticalName *lck) {
+  PRINT0(LD_IO, "call to kmpc_critical()\n");
+  omp_lock_t *Lock = (omp_lock_t *)lck; // the name storage is used as the lock
+  omp_set_lock(Lock);
+}
+
+EXTERN void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid,
+                                kmp_CriticalName *lck) {
+  PRINT0(LD_IO, "call to kmpc_end_critical()\n");
+  omp_lock_t *Lock = (omp_lock_t *)lck; // the name storage is used as the lock
+  omp_unset_lock(Lock);
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/final/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
new file mode 100644
index 0000000..50b8654
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -0,0 +1,581 @@
+//===----- data_sharing.cu - NVPTX OpenMP data sharing ----------- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of data sharing environments.
+//
+//===----------------------------------------------------------------------===//
+#include "omptarget-nvptx.h"
+#include <stdio.h>
+
+// Warp index of the calling thread within its CUDA block.
+INLINE static unsigned getWarpId() { return threadIdx.x / WARPSIZE; }
+// Lane index of the calling thread within its CUDA warp.
+INLINE static unsigned getLaneId() { return threadIdx.x % WARPSIZE; }
+
+// Return true if no active lane of this warp has a lower lane id than the
+// calling thread, i.e. the caller is the first active thread in its warp.
+INLINE static bool IsWarpMasterActiveThread() {
+  const unsigned long long Active = __ACTIVEMASK();
+  const unsigned long long Lane = GetThreadIdInBlock() % WARPSIZE;
+  // Bits [0, Lane) of the active mask correspond to the lower-numbered lanes.
+  return (Active & ((1ull << Lane) - 1)) == 0;
+}
+// True when not in SPMD mode and the caller is thread GetMasterThreadID().
+INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
+  return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock();
+}
+
+/// Round \p Val up to the next multiple of the pointer size.
+///
+/// Values that are already a multiple of sizeof(void *) are returned
+/// unchanged. The mask arithmetic relies on the pointer size being a power
+/// of two.
+INLINE static size_t AlignVal(size_t Val) {
+  const size_t LowBits = sizeof(void *) - 1;
+  return (Val + LowBits) & ~LowBits;
+}
+
+#define DSFLAG 0      // Flag passed to DSPRINT* by the begin/end/frame calls.
+#define DSFLAG_INIT 0 // Flag passed to DSPRINT* by the initialization call.
+#define DSPRINT(_flag, _str, _args...)                                         \
+  {                                                                            \
+    if (_flag) {                                                               \
+      /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x, _args);*/          \
+    }                                                                          \
+  }
+#define DSPRINT0(_flag, _str)                                                  \
+  {                                                                            \
+    if (_flag) {                                                               \
+      /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x);*/                 \
+    }                                                                          \
+  }
+
+// Set up the data sharing state of the calling thread's warp. This is expected
+// to be called for the master thread and warp masters. \param rootS: Not used;
+// the root slot is taken from the team descriptor instead. \param
+// InitialDataSize: The initial size of the data in the slot (only traced).
+EXTERN void
+__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS,
+                                           size_t InitialDataSize) {
+  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
+  DSPRINT0(DSFLAG_INIT,
+           "Entering __kmpc_initialize_data_sharing_environment\n");
+
+  const unsigned Warp = getWarpId();
+  DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", Warp);
+
+  omptarget_nvptx_TeamDescr &Team =
+      omptarget_nvptx_threadPrivateContext->TeamContext();
+  __kmpc_data_sharing_slot *const Root =
+      Team.RootS(Warp, IsMasterThread(isSPMDMode()));
+
+  DataSharingState.SlotPtr[Warp] = Root;
+  DataSharingState.StackPtr[Warp] = (void *)&Root->Data[0];
+
+  // The frame pointer and the active-thread mask are left untouched here.
+
+  DSPRINT(DSFLAG_INIT, "Initial data size: %08x \n", (unsigned)InitialDataSize);
+  DSPRINT(DSFLAG_INIT, "Root slot at: %016llx \n", (unsigned long long)Root);
+  DSPRINT(DSFLAG_INIT, "Root slot data-end at: %016llx \n",
+          (unsigned long long)Root->DataEnd);
+  DSPRINT(DSFLAG_INIT, "Root slot next at: %016llx \n",
+          (unsigned long long)Root->Next);
+  DSPRINT(DSFLAG_INIT, "Shared slot ptr at: %016llx \n",
+          (unsigned long long)DataSharingState.SlotPtr[Warp]);
+  DSPRINT(DSFLAG_INIT, "Shared stack ptr at: %016llx \n",
+          (unsigned long long)DataSharingState.StackPtr[Warp]);
+
+  DSPRINT0(DSFLAG_INIT, "Exiting __kmpc_initialize_data_sharing_environment\n");
+}
+
+// Open a data sharing environment for the calling warp: save the warp's slot,
+// stack, frame and active-thread state into the Saved* outputs, let the first
+// active lane reserve SharingDataSize bytes, and return the warp's frame.
+EXTERN void *__kmpc_data_sharing_environment_begin(
+    __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
+    void **SavedSharedFrame, int32_t *SavedActiveThreads,
+    size_t SharingDataSize, size_t SharingDefaultDataSize,
+    int16_t IsOMPRuntimeInitialized) {
+  DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_begin\n");
+
+  // If the runtime has been elided, use __shared__ memory for master-worker
+  // data sharing.
+  if (!IsOMPRuntimeInitialized)
+    return (void *)&DataSharingState;
+
+  DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize);
+  DSPRINT(DSFLAG, "Default Data Size %016llx\n",
+          (unsigned long long)SharingDefaultDataSize);
+
+  unsigned WID = getWarpId();
+  unsigned CurActiveThreads = __ACTIVEMASK();
+
+  __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
+  void *&StackP = DataSharingState.StackPtr[WID];
+  void * volatile &FrameP = DataSharingState.FramePtr[WID];
+  int32_t &ActiveT = DataSharingState.ActiveThreads[WID];
+
+  DSPRINT0(DSFLAG, "Save current slot/stack values.\n");
+  // Hand the current values to the caller so the matching end call can
+  // restore them.
+  *SavedSharedSlot = SlotP;
+  *SavedSharedStack = StackP;
+  *SavedSharedFrame = FrameP;
+  *SavedActiveThreads = ActiveT;
+
+  DSPRINT(DSFLAG, "Warp ID: %u\n", WID);
+  DSPRINT(DSFLAG, "Saved slot ptr at: %016llx \n", (unsigned long long)SlotP);
+  DSPRINT(DSFLAG, "Saved stack ptr at: %016llx \n", (unsigned long long)StackP);
+  DSPRINT(DSFLAG, "Saved frame ptr at: %016llx \n", (unsigned long long)FrameP);
+  DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT);
+
+  // Only the warp active master needs to grow the stack.
+  if (IsWarpMasterActiveThread()) {
+    // Save the current active threads.
+    ActiveT = CurActiveThreads;
+
+    // Make sure we use aligned sizes to avoid rematerialization of data.
+    SharingDataSize = AlignVal(SharingDataSize);
+    // FIXME: The default data size can be assumed to be aligned?
+    SharingDefaultDataSize = AlignVal(SharingDefaultDataSize);
+
+    // Check if we have room for the data in the current slot.
+    const uintptr_t CurrentStartAddress = (uintptr_t)StackP;
+    const uintptr_t CurrentEndAddress = (uintptr_t)SlotP->DataEnd;
+    const uintptr_t RequiredEndAddress =
+        CurrentStartAddress + (uintptr_t)SharingDataSize;
+
+    DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize);
+    DSPRINT(DSFLAG, "Default Data Size %016llx\n",
+            (unsigned long long)SharingDefaultDataSize);
+    DSPRINT(DSFLAG, "Current Start Address %016llx\n",
+            (unsigned long long)CurrentStartAddress);
+    DSPRINT(DSFLAG, "Current End Address %016llx\n",
+            (unsigned long long)CurrentEndAddress);
+    DSPRINT(DSFLAG, "Required End Address %016llx\n",
+            (unsigned long long)RequiredEndAddress);
+    DSPRINT(DSFLAG, "Active Threads %08x\n", (unsigned)ActiveT);
+
+    // If the data does not fit in the current slot, switch to a new slot
+    // (reusing the next one if it is large enough, else allocating one) and
+    // point slot, stack and frame at it. Otherwise just bump stack and frame.
+    if (CurrentEndAddress < RequiredEndAddress) {
+      size_t NewSize = (SharingDataSize > SharingDefaultDataSize)
+                           ? SharingDataSize
+                           : SharingDefaultDataSize;
+      __kmpc_data_sharing_slot *NewSlot = 0;
+
+      // Attempt to reuse an existing slot.
+      if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) {
+        uintptr_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
+                                     (uintptr_t)(&ExistingSlot->Data[0]);
+        if (ExistingSlotSize >= NewSize) {
+          DSPRINT(DSFLAG, "Reusing stack slot %016llx\n",
+                  (unsigned long long)ExistingSlot);
+          NewSlot = ExistingSlot;
+        } else {
+          DSPRINT(DSFLAG, "Cleaning up -failed reuse - %016llx\n",
+                  (unsigned long long)SlotP->Next);
+          free(ExistingSlot);
+        }
+      }
+
+      if (!NewSlot) {
+        NewSlot = (__kmpc_data_sharing_slot *)malloc(
+            sizeof(__kmpc_data_sharing_slot) + NewSize);
+        DSPRINT(DSFLAG, "New slot allocated %016llx (data size=%016llx)\n",
+                (unsigned long long)NewSlot, (unsigned long long)NewSize);
+      }
+
+      NewSlot->Next = 0;
+      NewSlot->DataEnd = &NewSlot->Data[NewSize];
+
+      SlotP->Next = NewSlot;
+      SlotP = NewSlot;
+      StackP = &NewSlot->Data[SharingDataSize];
+      FrameP = &NewSlot->Data[0];
+    } else {
+      // Free any stale follow-up slot. Slots are not freed by their producers
+      // when they are left, because they may still be used to return data.
+      if (SlotP->Next) {
+        DSPRINT(DSFLAG, "Cleaning up - old not required - %016llx\n",
+                (unsigned long long)SlotP->Next);
+        free(SlotP->Next);
+        SlotP->Next = 0;
+      }
+
+      FrameP = StackP;
+      StackP = (void *)RequiredEndAddress;
+    }
+  }
+
+  // FIXME: Need to see the impact of doing it here.
+  __threadfence_block();
+
+  DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_begin\n");
+
+  // All the threads in this warp get the frame they should work with.
+  return FrameP;
+}
+
+// Close a data sharing environment. For an entry point only stale follow-up
+// slots are freed; otherwise the first active lane restores the saved slot,
+// stack, frame and active-thread state once no other lanes remain inside.
+EXTERN void __kmpc_data_sharing_environment_end(
+    __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
+    void **SavedSharedFrame, int32_t *SavedActiveThreads,
+    int32_t IsEntryPoint) {
+  DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n");
+
+  unsigned WID = getWarpId();
+
+  if (IsEntryPoint) {
+    if (IsWarpMasterActiveThread()) {
+      DSPRINT0(DSFLAG, "Doing clean up\n");
+
+      // The master thread cleans up behind the saved slot, since this
+      // environment is only for the master; others use their current slot.
+      __kmpc_data_sharing_slot *S = IsMasterThread(isSPMDMode())
+                                        ? *SavedSharedSlot
+                                        : DataSharingState.SlotPtr[WID];
+
+      if (S->Next) {
+        free(S->Next);
+        S->Next = 0;
+      }
+    }
+
+    DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_end\n");
+    return;
+  }
+
+  int32_t CurActive = __ACTIVEMASK();
+
+  // Only the warp master can restore the stack and frame information, and only
+  // if there are no other threads left behind in this environment (i.e. the
+  // warp diverged and returns in different places). This only works if we
+  // assume that threads will converge right after the call site that started
+  // the environment.
+  if (IsWarpMasterActiveThread()) {
+    int32_t &ActiveT = DataSharingState.ActiveThreads[WID];
+
+    DSPRINT0(DSFLAG, "Before restoring the stack\n");
+    // Zero the bits in the mask. If it is still different from zero, then we
+    // have other threads that will return after the current ones.
+    ActiveT &= ~CurActive;
+
+    DSPRINT(DSFLAG, "Active threads: %08x; New mask: %08x\n",
+            (unsigned)CurActive, (unsigned)ActiveT);
+
+    if (!ActiveT) {
+      // No other active threads? Great, let's restore the stack.
+      __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
+      void *&StackP = DataSharingState.StackPtr[WID];
+      void * volatile &FrameP = DataSharingState.FramePtr[WID];
+
+      SlotP = *SavedSharedSlot;
+      StackP = *SavedSharedStack;
+      FrameP = *SavedSharedFrame;
+      ActiveT = *SavedActiveThreads;
+
+      DSPRINT(DSFLAG, "Restored slot ptr at: %016llx \n",
+              (unsigned long long)SlotP);
+      DSPRINT(DSFLAG, "Restored stack ptr at: %016llx \n",
+              (unsigned long long)StackP);
+      DSPRINT(DSFLAG, "Restored frame ptr at: %016llx \n",
+              (unsigned long long)FrameP);
+      DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT);
+    }
+  }
+
+  // FIXME: Need to see the impact of doing it here.
+  __threadfence_block();
+
+  DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_end\n");
+}
+
+EXTERN void *
+__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
+                                          int16_t IsOMPRuntimeInitialized) {
+  DSPRINT0(DSFLAG, "Entering __kmpc_get_data_sharing_environment_frame\n");
+
+  // Without an initialized runtime (it has been elided) master-worker data
+  // sharing goes through __shared__ memory: hand back the statically
+  // allocated state object that standard data sharing uses as well.
+  if (!IsOMPRuntimeInitialized)
+    return (void *)&DataSharingState;
+
+  // Otherwise look up the frame of the warp that owns the requested thread.
+
+  const unsigned OwnerWarp = SourceThreadID / WARPSIZE;
+
+  DSPRINT(DSFLAG, "Source  warp: %u\n", OwnerWarp);
+
+  void * volatile Frame = DataSharingState.FramePtr[OwnerWarp];
+  DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
+  return Frame;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Runtime functions for trunk data sharing scheme.
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE static void data_sharing_init_stack_common() {
+  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
+  omptarget_nvptx_TeamDescr &Team =
+      omptarget_nvptx_threadPrivateContext->TeamContext();
+  // Point each warp's slot and stack pointer at its preallocated root slot.
+  for (int Warp = 0; Warp != WARPSIZE; ++Warp) {
+    __kmpc_data_sharing_slot *Root = Team.GetPreallocatedSlotAddr(Warp);
+    DataSharingState.SlotPtr[Warp] = Root;
+    DataSharingState.StackPtr[Warp] = (void *)&Root->Data[0];
+  }
+}
+
+// Initialize the data sharing stack. This function needs to be called once at
+// the beginning of a data sharing context (coincides with the kernel
+// initialization). It is called only by the MASTER thread of each team in
+// non-SPMD mode.
+EXTERN void __kmpc_data_sharing_init_stack() {
+  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
+  // Point every warp's slot and stack pointer at its preallocated slot (per the
+  // original note: statically allocated shared memory slots of 256 bytes each;
+  // not verifiable from here), then call Init() on omptarget_nvptx_globalArgs.
+  data_sharing_init_stack_common();
+  omptarget_nvptx_globalArgs.Init();
+}
+
+// Initialize the data sharing stack. This function needs to be called once at
+// the beginning of a data sharing context (coincides with the kernel
+// initialization). This function is called in SPMD mode only.
+EXTERN void __kmpc_data_sharing_init_stack_spmd() {
+  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
+  // Only thread 0 of the block points the per-warp slot and stack pointers at
+  // the preallocated slots (see data_sharing_init_stack_common); every calling
+  // thread then executes the block-level fence below.
+  if (threadIdx.x == 0)
+    data_sharing_init_stack_common();
+
+  __threadfence_block();
+}
+
+INLINE static void* data_sharing_push_stack_common(size_t PushSize) {
+  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
+  // Push a frame of PushSize bytes for the calling warp and return its start
+  // to the active lanes. Only lane 0 of a warp manages the warp's stack.
+  bool IsWarpMaster = (GetThreadIdInBlock() % WARPSIZE) == 0;
+
+  // Round PushSize up to a multiple of Alignment so that future stack
+  // allocations are correctly aligned.
+  const size_t Alignment = 8;
+  PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;
+
+  // Frame pointer must be visible to all workers in the same warp.
+  const unsigned WID = getWarpId();
+  void *FrameP = 0;
+  int32_t CurActive = __ACTIVEMASK();
+
+  if (IsWarpMaster) {
+    // SlotP will point to either the shared memory slot or an existing
+    // global memory slot.
+    __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
+    void *&StackP = DataSharingState.StackPtr[WID];
+
+    // Check if we have room for the data in the current slot.
+    const uintptr_t StartAddress = (uintptr_t)StackP;
+    const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd;
+    const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize;
+
+    // If more data is requested than fits in the rest of the current slot,
+    // a new slot is allocated and linked in after the current one.
+    // NOTE(review): an existing SlotP->Next is overwritten, not reused, here.
+    if (EndAddress < RequestedEndAddress) {
+      __kmpc_data_sharing_slot *NewSlot = 0;
+      size_t NewSize = PushSize;
+
+      // Allocate at least the default size for each type of slot.
+      // Master is a special case and even though there is only one thread,
+      // it can share more things with the workers. For uniformity, it uses
+      // the full size of a worker warp slot.
+      size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size;
+      if (DefaultSlotSize > NewSize)
+        NewSize = DefaultSlotSize;
+      NewSlot = (__kmpc_data_sharing_slot *) SafeMalloc(
+          sizeof(__kmpc_data_sharing_slot) + NewSize,
+          "Global memory slot allocation.");
+
+      NewSlot->Next = 0;
+      NewSlot->Prev = SlotP;
+      NewSlot->PrevSlotStackPtr = StackP;
+      NewSlot->DataEnd = &NewSlot->Data[0] + NewSize;
+
+      // Make previous slot point to the newly allocated slot.
+      SlotP->Next = NewSlot;
+      // The current slot becomes the new slot.
+      SlotP = NewSlot;
+      // The stack pointer always points to the next free stack frame.
+      StackP = &NewSlot->Data[0] + PushSize;
+      // The frame pointer always points to the beginning of the frame.
+      FrameP = DataSharingState.FramePtr[WID] = &NewSlot->Data[0];
+    } else {
+      // Add the data chunk to the current slot. The frame pointer is set to
+      // point to the start of the new frame held in StackP.
+      FrameP = DataSharingState.FramePtr[WID] = StackP;
+      // Advance the stack pointer past the newly pushed frame.
+      StackP = (void *)RequestedEndAddress;
+    }
+  }
+  // Broadcast the frame address from lane 0 in 32-bit pieces.
+  ((int *)&FrameP)[0] = __SHFL_SYNC(CurActive, ((int *)&FrameP)[0], 0);
+  if (sizeof(FrameP) == 8)
+    ((int *)&FrameP)[1] = __SHFL_SYNC(CurActive, ((int *)&FrameP)[1], 0);
+
+  return FrameP;
+}
+
+EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
+                                                      int16_t UseSharedMemory) {
+  return data_sharing_push_stack_common(DataSize); // UseSharedMemory unused
+}
+
+// Push one DataSize-sized frame per warp lane onto the data sharing stack of
+// the calling warp and return the address of the calling lane's frame.
+//
+// If the runtime is not initialized, or the caller is the master thread, only
+// a single DataSize frame is pushed instead. UseSharedMemory is not consulted
+// by this implementation.
+//
+// TODO: change WARPSIZE to the number of active threads in the warp.
+EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize,
+                                            int16_t UseSharedMemory) {
+  // The master thread requires a stack only for itself. A worker thread
+  // (which at this point is a warp master) requires space for the variables
+  // of each thread in the warp, i.e. one DataSize chunk per warp lane.
+  const bool NeedsSingleFrame =
+      isRuntimeUninitialized() || IsMasterThread(isSPMDMode());
+  const size_t PushSize = NeedsSingleFrame ? DataSize : WARPSIZE * DataSize;
+
+  // data_sharing_push_stack_common yields the start of the warp's frame ...
+  const uintptr_t WarpFrameStart =
+      (uintptr_t)data_sharing_push_stack_common(PushSize);
+
+  // ... and each lane then selects its own DataSize-sized chunk inside it.
+  const uintptr_t LaneOffset = (uintptr_t)(getLaneId() * DataSize);
+  return (void *)(WarpFrameStart + LaneOffset);
+}
+
+// Pop the data sharing stack back to FrameStart and release the current slot
+// when that leaves it empty.
+//
+// A slot is freed only if the stack pointer has returned to the start of its
+// data area and the slot has a predecessor. The predecessor then becomes the
+// current slot, and the stack pointer recorded in the freed slot's
+// PrevSlotStackPtr is restored.
+EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
+  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
+
+  __threadfence_block();
+
+  // Only threads whose id within the block is a multiple of WARPSIZE update
+  // the per-warp stack state.
+  if (GetThreadIdInBlock() % WARPSIZE != 0)
+    return;
+
+  const unsigned WarpIdx = getWarpId();
+
+  // References into the shared state, so the assignments below update it.
+  __kmpc_data_sharing_slot *&CurSlot = DataSharingState.SlotPtr[WarpIdx];
+  void *&StackTop = DataSharingState.StackPtr[WarpIdx];
+
+  // Pop the frame.
+  StackTop = FrameStart;
+
+  // Nothing more to do unless the slot is now empty and is not the first one.
+  if (StackTop != &CurSlot->Data[0] || !CurSlot->Prev)
+    return;
+
+  // The stack pointer has to be reset before the slot is removed.
+  StackTop = CurSlot->PrevSlotStackPtr;
+
+  // Step back to the previous slot and free the one that was just emptied.
+  CurSlot = CurSlot->Prev;
+  SafeFree(CurSlot->Next, "Free slot.");
+  CurSlot->Next = 0;
+}
+
+// Open a data sharing context: size the list of references to shared
+// variables for nArgs entries and return that list through GlobalArgs. The
+// list is subsequently passed to one or more threads.
+// In L0 data sharing this is called by master thread.
+// In L1 data sharing this is called by active warp master thread.
+EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
+  omptarget_nvptx_globalArgs.EnsureSize(nArgs);
+  void **SharedArgList = omptarget_nvptx_globalArgs.GetArgs();
+  *GlobalArgs = SharedArgList;
+}
+
+// End a data sharing context. There is no need to have a list of refs
+// to shared variables because the context in which those variables were
+// shared has now ended. This should clean-up the list of references only
+// without affecting the actual global storage of the variables.
+// In L0 data sharing this is called by master thread.
+// In L1 data sharing this is called by active warp master thread.
+EXTERN void __kmpc_end_sharing_variables() {
+  // All of the clean-up is delegated to the argument list's DeInit().
+  omptarget_nvptx_globalArgs.DeInit();
+}
+
+// Hand the workers the list of references to the globalized variables. The
+// entries of this list are passed on to the outlined parallel function in the
+// same order.
+// Called by all workers.
+EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
+  void **SharedArgList = omptarget_nvptx_globalArgs.GetArgs();
+  *GlobalArgs = SharedArgList;
+}
+
+// Set up the static memory manager for a team. The manager handles statically
+// allocated global memory that the compiler reserves in order to implement
+// globalization of variables in target, teams and distribute regions.
+//
+// When is_shared is set, buf itself is stored into *frame. Otherwise the frame
+// is acquired from omptarget_nvptx_simpleMemoryManager: by thread 0 of the
+// block in SPMD mode, or by the calling thread in non-SPMD mode (where the
+// caller is asserted to be the target master thread).
+EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
+                                          const void *buf, size_t size,
+                                          int16_t is_shared,
+                                          const void **frame) {
+  if (is_shared) {
+    *frame = buf;
+    return;
+  }
+
+  if (!isSPMDExecutionMode) {
+    ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
+            "Must be called only in the target master thread.");
+    *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
+    __threadfence();
+    return;
+  }
+
+  // SPMD mode: a single thread acquires the memory, after which all threads
+  // reach __SYNCTHREADS().
+  if (GetThreadIdInBlock() == 0)
+    *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
+  // FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
+  __SYNCTHREADS();
+}
+
+// Release the memory held by omptarget_nvptx_simpleMemoryManager. Nothing is
+// done when is_shared is set. In SPMD mode all threads first reach
+// __SYNCTHREADS() and thread 0 of the block then performs the release; in
+// non-SPMD mode the calling thread releases after a __threadfence() (the
+// caller is asserted to be the target master thread).
+EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
+                                              int16_t is_shared) {
+  if (is_shared)
+    return;
+
+  if (!isSPMDExecutionMode) {
+    __threadfence();
+    ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
+            "Must be called only in the target master thread.");
+    omptarget_nvptx_simpleMemoryManager.Release();
+    return;
+  }
+
+  // FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
+  __SYNCTHREADS();
+  if (GetThreadIdInBlock() == 0)
+    omptarget_nvptx_simpleMemoryManager.Release();
+}
+
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/debug.h b/final/libomptarget/deviceRTLs/nvptx/src/debug.h
new file mode 100644
index 0000000..f2fcc1d
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/debug.h
@@ -0,0 +1,288 @@
+//===------------- debug.h - NVPTX OpenMP debug macros ----------- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains debug macros to be used in the application.
+//
+//   Usage guide
+//
+//   PRINT0(flag, str)        : if debug flag is on, print (no arguments)
+//   PRINT(flag, str, args)   : if debug flag is on, print (arguments)
+//   DON(flag)                : return true if debug flag is on
+//
+//   ASSERT(flag, cond, str, args): if test flag is on, test the condition
+//                                  if the condition is false, print str+args
+//          and assert.
+//          CAUTION: cond may be evaluated twice
+//   AON(flag)                     : return true if test flag is on
+//
+//   WARNING(flag, str, args)      : if warning flag is on, print the warning
+//   WON(flag)                     : return true if warning flag is on
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OMPTARGET_NVPTX_DEBUG_H_
+#define _OMPTARGET_NVPTX_DEBUG_H_
+
+////////////////////////////////////////////////////////////////////////////////
+// set desired level of debugging
+////////////////////////////////////////////////////////////////////////////////
+
+#define LD_SET_NONE 0ULL /* none */
+#define LD_SET_ALL -1ULL /* all */
+
+// pos 1
+#define LD_SET_LOOP 0x1ULL  /* basic loop */
+#define LD_SET_LOOPD 0x2ULL /* basic loop */
+#define LD_SET_PAR 0x4ULL   /* basic parallel */
+#define LD_SET_PARD 0x8ULL  /* basic parallel */
+
+// pos 2
+#define LD_SET_SYNC 0x10ULL  /* sync info */
+#define LD_SET_SYNCD 0x20ULL /* sync info */
+#define LD_SET_WAIT 0x40ULL  /* state when waiting */
+#define LD_SET_TASK 0x80ULL  /* print task info (high level) */
+
+// pos 3
+#define LD_SET_IO 0x100ULL     /* big region io (excl atomic) */
+#define LD_SET_IOD 0x200ULL    /* big region io (excl atomic) */
+#define LD_SET_ENV 0x400ULL    /* env info */
+#define LD_SET_CANCEL 0x800ULL /* print cancel info */
+
+// pos 4
+#define LD_SET_MEM 0x1000ULL /* malloc / free */
+
+////////////////////////////////////////////////////////////////////////////////
+// set the desired flags to print selected output.
+
+// these are some examples of possible definitions that can be used for
+// debugging.
+//#define OMPTARGET_NVPTX_DEBUG (LD_SET_ALL)
+//#define OMPTARGET_NVPTX_DEBUG (LD_SET_LOOP) // limit to loop printfs to save
+// on cuda buffer
+//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO)
+//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO | LD_SET_ENV)
+//#define OMPTARGET_NVPTX_DEBUG (LD_SET_PAR)
+
+#ifndef OMPTARGET_NVPTX_DEBUG
+#define OMPTARGET_NVPTX_DEBUG LD_SET_NONE
+#elif OMPTARGET_NVPTX_DEBUG
+#warning debug is used, not good for measurements
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// set desired level of asserts
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+// available flags
+
+#define LT_SET_NONE 0x0 /* unsafe */
+#define LT_SET_SAFETY                                                          \
+  0x1 /* check malloc type of stuff, input at creation, cheap */
+#define LT_SET_INPUT 0x2 /* check also all runtime inputs */
+#define LT_SET_FUSSY 0x4 /* fussy checks, expensive */
+
+////////////////////////////////////////////////////////////////////////////////
+// set the desired flags
+
+#ifndef OMPTARGET_NVPTX_TEST
+#if OMPTARGET_NVPTX_DEBUG
+#define OMPTARGET_NVPTX_TEST (LT_SET_FUSSY)
+#else
+#define OMPTARGET_NVPTX_TEST (LT_SET_SAFETY)
+#endif
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// set desired level of warnings
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+// available flags
+
+#define LW_SET_ALL -1
+#define LW_SET_NONE 0x0
+#define LW_SET_ENV 0x1
+#define LW_SET_INPUT 0x2
+#define LW_SET_FUSSY 0x4
+
+////////////////////////////////////////////////////////////////////////////////
+// set the desired flags
+
+#if OMPTARGET_NVPTX_DEBUG
+#define OMPTARGET_NVPTX_WARNING (LW_SET_NONE)
+#else
+#define OMPTARGET_NVPTX_WARNING (LW_SET_FUSSY)
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// implementation for debug
+////////////////////////////////////////////////////////////////////////////////
+
+#if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING
+#include <stdio.h>
+#include "option.h"
+
+// Print a message through printf. fmt must begin with four integer
+// conversions, which receive the block index, the thread index, the thread
+// index divided by WARPSIZE and the low five bits of the thread index, in
+// that order; the caller's parameters follow.
+template <typename... Arguments>
+NOINLINE static void log(const char *fmt, Arguments... parameters) {
+  const int Block = (int)blockIdx.x;
+  const int Thread = (int)threadIdx.x;
+  const int Warp = (int)(threadIdx.x / WARPSIZE);
+  const int Lane = (int)(threadIdx.x & 0x1F);
+  printf(fmt, Block, Thread, Warp, Lane, parameters...);
+}
+
+#endif
+#if OMPTARGET_NVPTX_TEST
+#include <assert.h>
+
+// Assert cond. When it does not hold, fmt is printed first; fmt must begin
+// with four integer conversions, which receive the block index, the thread
+// index, the thread index divided by WARPSIZE and the low five bits of the
+// thread index, followed by the caller's parameters.
+template <typename... Arguments>
+NOINLINE static void check(bool cond, const char *fmt,
+                           Arguments... parameters) {
+  if (!cond) {
+    const int Block = (int)blockIdx.x;
+    const int Thread = (int)threadIdx.x;
+    const int Warp = (int)(threadIdx.x / WARPSIZE);
+    const int Lane = (int)(threadIdx.x & 0x1F);
+    printf(fmt, Block, Thread, Warp, Lane, parameters...);
+  }
+  assert(cond);
+}
+
+// Message-free overload: only asserts cond.
+NOINLINE static void check(bool cond) {
+  assert(cond);
+}
+#endif
+
+// set flags that are tested (inclusion properties)
+
+#define LD_ALL (LD_SET_ALL)
+
+#define LD_LOOP (LD_SET_LOOP | LD_SET_LOOPD)
+#define LD_LOOPD (LD_SET_LOOPD)
+#define LD_PAR (LD_SET_PAR | LD_SET_PARD)
+#define LD_PARD (LD_SET_PARD)
+
+// pos 2
+#define LD_SYNC (LD_SET_SYNC | LD_SET_SYNCD)
+#define LD_SYNCD (LD_SET_SYNCD)
+#define LD_WAIT (LD_SET_WAIT)
+#define LD_TASK (LD_SET_TASK)
+
+// pos 3
+#define LD_IO (LD_SET_IO | LD_SET_IOD)
+#define LD_IOD (LD_SET_IOD)
+#define LD_ENV (LD_SET_ENV)
+#define LD_CANCEL (LD_SET_CANCEL)
+
+// pos 4
+#define LD_MEM (LD_SET_MEM)
+
+// implement
+#if OMPTARGET_NVPTX_DEBUG
+
+// DON: nonzero when _flag overlaps the compile-time OMPTARGET_NVPTX_DEBUG
+// mask. The mask is converted to unsigned before the bitwise AND.
+#define DON(_flag) ((unsigned)(OMPTARGET_NVPTX_DEBUG) & (_flag))
+
+// PRINT0: log _str (no extra arguments) when both the run-time
+// omptarget_device_environment.debug_level and DON(_flag) are nonzero. The
+// "<b, t, w, l>" prefix is filled in by log().
+#define PRINT0(_flag, _str)                                                    \
+  {                                                                            \
+    if (omptarget_device_environment.debug_level && DON(_flag)) {              \
+      log("<b %2d, t %4d, w %2d, l %2d>: " _str);                              \
+    }                                                                          \
+  }
+
+// PRINT: same as PRINT0, with _args forwarded to log() after the prefix
+// values.
+#define PRINT(_flag, _str, _args...)                                           \
+  {                                                                            \
+    if (omptarget_device_environment.debug_level && DON(_flag)) {              \
+      log("<b %2d, t %4d, w %2d, l %2d>: " _str, _args);                       \
+    }                                                                          \
+  }
+#else
+
+// Debugging disabled: DON is FALSE and the print macros expand to nothing.
+#define DON(_flag) (FALSE)
+#define PRINT0(flag, str)
+#define PRINT(flag, str, _args...)
+
+#endif
+
+// for printing without worrying about precision, pointers...
+#define P64(_x) ((unsigned long long)(_x))
+
+////////////////////////////////////////////////////////////////////////////////
+// early defs for test
+////////////////////////////////////////////////////////////////////////////////
+
+#define LT_SAFETY (LT_SET_SAFETY | LT_SET_INPUT | LT_SET_FUSSY)
+#define LT_INPUT (LT_SET_INPUT | LT_SET_FUSSY)
+#define LT_FUSSY (LT_SET_FUSSY)
+
+// TON(_flag) is nonzero when _flag overlaps the compile-time
+// OMPTARGET_NVPTX_TEST mask. ASSERT0/ASSERT test _cond only when TON(_flag)
+// holds; ASSERT0 takes a plain message, ASSERT additionally takes arguments.
+#if OMPTARGET_NVPTX_TEST == LT_SET_SAFETY
+
+// Safety-only level: the condition is asserted through check(_cond), while
+// _str and _args are dropped, so no message is printed.
+#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag))
+#define ASSERT0(_flag, _cond, _str)                                            \
+  {                                                                            \
+    if (TON(_flag)) {                                                          \
+      check(_cond);                                                            \
+    }                                                                          \
+  }
+#define ASSERT(_flag, _cond, _str, _args...)                                   \
+  {                                                                            \
+    if (TON(_flag)) {                                                          \
+      check(_cond);                                                            \
+    }                                                                          \
+  }
+
+#elif OMPTARGET_NVPTX_TEST >= LT_SET_INPUT
+
+// Input/fussy levels: on failure check() prints the "<b, t, w, l>" prefix
+// followed by _str (and _args for ASSERT) before asserting. Both macros use
+// the same prefix so that the lane id is printed with a field width of 2.
+#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag))
+#define ASSERT0(_flag, _cond, _str)                                            \
+  {                                                                            \
+    if (TON(_flag)) {                                                          \
+      check((_cond), "<b %3d, t %4d, w %2d, l %2d> ASSERT: " _str "\n");       \
+    }                                                                          \
+  }
+#define ASSERT(_flag, _cond, _str, _args...)                                   \
+  {                                                                            \
+    if (TON(_flag)) {                                                          \
+      check((_cond), "<b %3d, t %4d, w %2d, l %2d> ASSERT: " _str "\n",        \
+            _args);                                                            \
+    }                                                                          \
+  }
+
+#else
+
+// Any other value (e.g. LT_SET_NONE): TON is FALSE and the assert macros
+// expand to nothing, so _cond is not evaluated.
+#define TON(_flag) (FALSE)
+#define ASSERT0(_flag, _cond, _str)
+#define ASSERT(_flag, _cond, _str, _args...)
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// early defs for warning
+
+#define LW_ALL (LW_SET_ALL)
+#define LW_ENV (LW_SET_FUSSY | LW_SET_INPUT | LW_SET_ENV)
+#define LW_INPUT (LW_SET_FUSSY | LW_SET_INPUT)
+#define LW_FUSSY (LW_SET_FUSSY)
+
+#if OMPTARGET_NVPTX_WARNING
+
+// WON: nonzero when _flag overlaps the compile-time OMPTARGET_NVPTX_WARNING
+// mask.
+#define WON(_flag) ((OMPTARGET_NVPTX_WARNING) & (_flag))
+// WARNING0: log _str (no extra arguments) with a "WARNING:" prefix when
+// WON(_flag) holds.
+#define WARNING0(_flag, _str)                                                  \
+  {                                                                            \
+    if (WON(_flag)) {                                                          \
+      log("<b %2d, t %4d, w %2d, l %2d> WARNING: " _str);                      \
+    }                                                                          \
+  }
+// WARNING: same as WARNING0, with _args forwarded to log().
+#define WARNING(_flag, _str, _args...)                                         \
+  {                                                                            \
+    if (WON(_flag)) {                                                          \
+      log("<b %2d, t %4d, w %2d, l %2d> WARNING: " _str, _args);               \
+    }                                                                          \
+  }
+
+#else
+
+// Warnings disabled: WON is FALSE and the warning macros expand to nothing.
+#define WON(_flag) (FALSE)
+#define WARNING0(_flag, _str)
+#define WARNING(_flag, _str, _args...)
+
+#endif
+
+#endif
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/interface.h b/final/libomptarget/deviceRTLs/nvptx/src/interface.h
new file mode 100644
index 0000000..b2a13a4
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -0,0 +1,532 @@
+//===------- interface.h - NVPTX OpenMP interface definitions ---- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Interface definitions for the NVPTX OpenMP device runtime.
+//
+//  This file contains all the definitions that are relevant to
+//  the interface. The first section contains the interface as
+//  declared by OpenMP.  The second section includes the compiler
+//  specific interfaces.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _INTERFACES_H_
+#define _INTERFACES_H_
+
+#include "option.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// OpenMP interface
+////////////////////////////////////////////////////////////////////////////////
+
+typedef uint32_t omp_lock_t;      /* arbitrary type of the right length */
+typedef uint64_t omp_nest_lock_t; /* arbitrary type of the right length */
+
+/* Schedule kinds exchanged through omp_get_schedule / omp_set_schedule. */
+typedef enum omp_sched_t {
+  omp_sched_static = 1,  /* chunkSize >0 */
+  omp_sched_dynamic = 2, /* chunkSize >0 */
+  omp_sched_guided = 3,  /* chunkSize >0 */
+  omp_sched_auto = 4,    /* no chunkSize */
+} omp_sched_t;
+
+/* Binding policy values; this is the return type of omp_get_proc_bind. */
+typedef enum omp_proc_bind_t {
+  omp_proc_bind_false = 0,
+  omp_proc_bind_true = 1,
+  omp_proc_bind_master = 2,
+  omp_proc_bind_close = 3,
+  omp_proc_bind_spread = 4
+} omp_proc_bind_t;
+
+EXTERN double omp_get_wtick(void);
+EXTERN double omp_get_wtime(void);
+
+EXTERN void omp_set_num_threads(int num);
+EXTERN int omp_get_num_threads(void);
+EXTERN int omp_get_max_threads(void);
+EXTERN int omp_get_thread_limit(void);
+EXTERN int omp_get_thread_num(void);
+EXTERN int omp_get_num_procs(void);
+EXTERN int omp_in_parallel(void);
+EXTERN int omp_in_final(void);
+EXTERN void omp_set_dynamic(int flag);
+EXTERN int omp_get_dynamic(void);
+EXTERN void omp_set_nested(int flag);
+EXTERN int omp_get_nested(void);
+EXTERN void omp_set_max_active_levels(int level);
+EXTERN int omp_get_max_active_levels(void);
+EXTERN int omp_get_level(void);
+EXTERN int omp_get_active_level(void);
+EXTERN int omp_get_ancestor_thread_num(int level);
+EXTERN int omp_get_team_size(int level);
+
+EXTERN void omp_init_lock(omp_lock_t *lock);
+EXTERN void omp_init_nest_lock(omp_nest_lock_t *lock);
+EXTERN void omp_destroy_lock(omp_lock_t *lock);
+EXTERN void omp_destroy_nest_lock(omp_nest_lock_t *lock);
+EXTERN void omp_set_lock(omp_lock_t *lock);
+EXTERN void omp_set_nest_lock(omp_nest_lock_t *lock);
+EXTERN void omp_unset_lock(omp_lock_t *lock);
+EXTERN void omp_unset_nest_lock(omp_nest_lock_t *lock);
+EXTERN int omp_test_lock(omp_lock_t *lock);
+EXTERN int omp_test_nest_lock(omp_nest_lock_t *lock);
+
+EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier);
+EXTERN void omp_set_schedule(omp_sched_t kind, int modifier);
+EXTERN omp_proc_bind_t omp_get_proc_bind(void);
+EXTERN int omp_get_cancellation(void);
+EXTERN void omp_set_default_device(int deviceId);
+EXTERN int omp_get_default_device(void);
+EXTERN int omp_get_num_devices(void);
+EXTERN int omp_get_num_teams(void);
+EXTERN int omp_get_team_num(void);
+EXTERN int omp_is_initial_device(void);
+EXTERN int omp_get_initial_device(void);
+EXTERN int omp_get_max_task_priority(void);
+
+////////////////////////////////////////////////////////////////////////////////
+// file below is swiped from kmpc host interface
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+// kmp specific types
+////////////////////////////////////////////////////////////////////////////////
+
+typedef enum kmp_sched_t {
+  kmp_sched_static_chunk = 33,
+  kmp_sched_static_nochunk = 34,
+  kmp_sched_dynamic = 35,
+  kmp_sched_guided = 36,
+  kmp_sched_runtime = 37,
+  kmp_sched_auto = 38,
+
+  kmp_sched_static_balanced_chunk = 45,
+
+  kmp_sched_static_ordered = 65,
+  kmp_sched_static_nochunk_ordered = 66,
+  kmp_sched_dynamic_ordered = 67,
+  kmp_sched_guided_ordered = 68,
+  kmp_sched_runtime_ordered = 69,
+  kmp_sched_auto_ordered = 70,
+
+  kmp_sched_distr_static_chunk = 91,
+  kmp_sched_distr_static_nochunk = 92,
+  kmp_sched_distr_static_chunk_sched_static_chunkone = 93,
+
+  kmp_sched_default = kmp_sched_static_nochunk,
+  kmp_sched_unordered_first = kmp_sched_static_chunk,
+  kmp_sched_unordered_last = kmp_sched_auto,
+  kmp_sched_ordered_first = kmp_sched_static_ordered,
+  kmp_sched_ordered_last = kmp_sched_auto_ordered,
+  kmp_sched_distribute_first = kmp_sched_distr_static_chunk,
+  kmp_sched_distribute_last =
+      kmp_sched_distr_static_chunk_sched_static_chunkone,
+
+  /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers.
+   * Since we need to distinguish the three possible cases (no modifier,
+   * monotonic modifier, nonmonotonic modifier), we need separate bits for
+   * each modifier. The absence of monotonic does not imply nonmonotonic,
+   * especially since 4.5 says that the behaviour of the "no modifier" case
+   * is implementation defined in 4.5, but will become "nonmonotonic" in 5.0.
+   *
+   * Since we're passing a full 32 bit value, we can use a couple of high
+   * bits for these flags; out of paranoia we avoid the sign bit.
+   *
+   * These modifiers can be or-ed into non-static schedules by the compiler
+   * to pass the additional information. They will be stripped early in the
+   * processing in __kmp_dispatch_init when setting up schedules, so
+   * most of the code won't ever see schedules with these bits set.
+   */
+  kmp_sched_modifier_monotonic = (1 << 29),
+  /**< Set if the monotonic schedule modifier was present */
+  kmp_sched_modifier_nonmonotonic = (1 << 30),
+/**< Set if the nonmonotonic schedule modifier was present */
+
+#define SCHEDULE_WITHOUT_MODIFIERS(s)                                          \
+  (enum kmp_sched_t)(                                                          \
+      (s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic))
+#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sched_modifier_monotonic) != 0)
+#define SCHEDULE_HAS_NONMONOTONIC(s)                                           \
+  (((s)&kmp_sched_modifier_nonmonotonic) != 0)
+#define SCHEDULE_HAS_NO_MODIFIERS(s)                                           \
+  (((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \
+   0)
+
+} kmp_sched_t;
+
+/*!
+ * Enum for accessing the reserved_2 field of the ident_t struct below.
+ */
+enum {
+  /*! Bit set to 1 when in SPMD mode. */
+  KMP_IDENT_SPMD_MODE = 0x01,
+  /*! Bit set to 1 when a simplified runtime is used. */
+  KMP_IDENT_SIMPLE_RT_MODE = 0x02,
+};
+
+/*!
+ * The ident structure that describes a source location.
+ * The struct is identical to the one in the kmp.h file.
+ * We maintain the same data structure for compatibility.
+ */
+typedef int kmp_int32;
+typedef struct ident {
+  kmp_int32 reserved_1; /**<  might be used in Fortran; see above  */
+  kmp_int32 flags; /**<  also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC
+                      identifies this union member  */
+  kmp_int32 reserved_2; /**<  not really used in Fortran any more; see above */
+  kmp_int32 reserved_3; /**<  source[4] in Fortran, do not use for C++  */
+  char const *psource; /**<  String describing the source location.
+                       The string is composed of semi-colon separated fields
+                       which describe the source file, the function and a pair
+                       of line numbers that delimit the construct. */
+} ident_t;
+
+// parallel defs
+typedef ident_t kmp_Ident;
+typedef void (*kmp_ParFctPtr)(int32_t *global_tid, int32_t *bound_tid, ...);
+typedef void (*kmp_ReductFctPtr)(void *lhsData, void *rhsData);
+typedef void (*kmp_InterWarpCopyFctPtr)(void *src, int32_t warp_num);
+typedef void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id,
+                                        int16_t lane_offset,
+                                        int16_t shortCircuit);
+typedef void (*kmp_CopyToScratchpadFctPtr)(void *reduceData, void *scratchpad,
+                                           int32_t index, int32_t width);
+typedef void (*kmp_LoadReduceFctPtr)(void *reduceData, void *scratchpad,
+                                     int32_t index, int32_t width,
+                                     int32_t reduce);
+typedef void (*kmp_ListGlobalFctPtr)(void *buffer, int idx, void *reduce_data);
+
+// task defs
+typedef struct kmp_TaskDescr kmp_TaskDescr;
+typedef int32_t (*kmp_TaskFctPtr)(int32_t global_tid, kmp_TaskDescr *taskDescr);
+typedef struct kmp_TaskDescr {
+  void *sharedPointerTable;   // ptr to a table of shared var ptrs
+  kmp_TaskFctPtr sub;         // task subroutine
+  int32_t partId;             // unused
+  kmp_TaskFctPtr destructors; // destructor of c++ first private
+} kmp_TaskDescr;
+
+// sync defs
+typedef int32_t kmp_CriticalName[8];
+
+////////////////////////////////////////////////////////////////////////////////
+// external interface
+////////////////////////////////////////////////////////////////////////////////
+
+// parallel
+EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc);
+EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t global_tid,
+                                    int32_t num_threads);
+// simd
+EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t global_tid,
+                                   int32_t simd_limit);
+// aee ... not supported
+// EXTERN void __kmpc_fork_call(kmp_Ident *loc, int32_t argc, kmp_ParFctPtr
+// microtask, ...);
+EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid);
+EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
+                                           uint32_t global_tid);
+EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid);
+
+// proc bind
+EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t global_tid,
+                                  int proc_bind);
+EXTERN int omp_get_num_places(void);
+EXTERN int omp_get_place_num_procs(int place_num);
+EXTERN void omp_get_place_proc_ids(int place_num, int *ids);
+EXTERN int omp_get_place_num(void);
+EXTERN int omp_get_partition_num_places(void);
+EXTERN void omp_get_partition_place_nums(int *place_nums);
+
+// for static (no chunk or chunk)
+EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid,
+                                     int32_t sched, int32_t *plastiter,
+                                     int32_t *plower, int32_t *pupper,
+                                     int32_t *pstride, int32_t incr,
+                                     int32_t chunk);
+EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid,
+                                      int32_t sched, int32_t *plastiter,
+                                      uint32_t *plower, uint32_t *pupper,
+                                      int32_t *pstride, int32_t incr,
+                                      int32_t chunk);
+EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid,
+                                     int32_t sched, int32_t *plastiter,
+                                     int64_t *plower, int64_t *pupper,
+                                     int64_t *pstride, int64_t incr,
+                                     int64_t chunk);
+EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid,
+                                      int32_t sched, int32_t *plastiter1,
+                                      uint64_t *plower, uint64_t *pupper,
+                                      int64_t *pstride, int64_t incr,
+                                      int64_t chunk);
+EXTERN
+void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid,
+                                          int32_t sched, int32_t *plastiter,
+                                          int32_t *plower, int32_t *pupper,
+                                          int32_t *pstride, int32_t incr,
+                                          int32_t chunk);
+EXTERN
+void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
+                                           int32_t sched, int32_t *plastiter,
+                                           uint32_t *plower, uint32_t *pupper,
+                                           int32_t *pstride, int32_t incr,
+                                           int32_t chunk);
+EXTERN
+void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid,
+                                          int32_t sched, int32_t *plastiter,
+                                          int64_t *plower, int64_t *pupper,
+                                          int64_t *pstride, int64_t incr,
+                                          int64_t chunk);
+EXTERN
+void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
+                                           int32_t sched, int32_t *plastiter1,
+                                           uint64_t *plower, uint64_t *pupper,
+                                           int64_t *pstride, int64_t incr,
+                                           int64_t chunk);
+EXTERN
+void __kmpc_for_static_init_4_simple_generic(kmp_Ident *loc,
+                                             int32_t global_tid, int32_t sched,
+                                             int32_t *plastiter,
+                                             int32_t *plower, int32_t *pupper,
+                                             int32_t *pstride, int32_t incr,
+                                             int32_t chunk);
+EXTERN
+void __kmpc_for_static_init_4u_simple_generic(
+    kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter,
+    uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr,
+    int32_t chunk);
+EXTERN
+void __kmpc_for_static_init_8_simple_generic(kmp_Ident *loc,
+                                             int32_t global_tid, int32_t sched,
+                                             int32_t *plastiter,
+                                             int64_t *plower, int64_t *pupper,
+                                             int64_t *pstride, int64_t incr,
+                                             int64_t chunk);
+EXTERN
+void __kmpc_for_static_init_8u_simple_generic(
+    kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter1,
+    uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr,
+    int64_t chunk);
+
+EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid);
+
+// for dynamic
+EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t global_tid,
+                                   int32_t sched, int32_t lower, int32_t upper,
+                                   int32_t incr, int32_t chunk);
+EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t global_tid,
+                                    int32_t sched, uint32_t lower,
+                                    uint32_t upper, int32_t incr,
+                                    int32_t chunk);
+EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t global_tid,
+                                   int32_t sched, int64_t lower, int64_t upper,
+                                   int64_t incr, int64_t chunk);
+EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t global_tid,
+                                    int32_t sched, uint64_t lower,
+                                    uint64_t upper, int64_t incr,
+                                    int64_t chunk);
+
+EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t global_tid,
+                                  int32_t *plastiter, int32_t *plower,
+                                  int32_t *pupper, int32_t *pstride);
+EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t global_tid,
+                                   int32_t *plastiter, uint32_t *plower,
+                                   uint32_t *pupper, int32_t *pstride);
+EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t global_tid,
+                                  int32_t *plastiter, int64_t *plower,
+                                  int64_t *pupper, int64_t *pstride);
+EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t global_tid,
+                                   int32_t *plastiter, uint64_t *plower,
+                                   uint64_t *pupper, int64_t *pstride);
+
+EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t global_tid);
+EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t global_tid);
+EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t global_tid);
+EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t global_tid);
+
+// Support for reducing conditional lastprivate variables
+EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc,
+                                                  int32_t global_tid,
+                                                  int32_t varNum, void *array);
+
+// reduction
+EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid);
+EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
+EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait(
+    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
+EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
+    kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
+    void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
+    kmp_InterWarpCopyFctPtr cpyFct);
+EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
+    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
+EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic(
+    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
+EXTERN int32_t __kmpc_nvptx_simd_reduce_nowait(
+    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
+EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
+    kmp_Ident *loc, int32_t global_tid, void *global_buffer,
+    int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
+    kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct,
+    kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct,
+    kmp_ListGlobalFctPtr glredFct);
+EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait(
+    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
+    kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct);
+EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd(
+    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
+    kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct);
+EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic(
+    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
+    kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct);
+EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc,
+                                                       int32_t global_tid,
+                                                       kmp_CriticalName *crit);
+EXTERN void __kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc,
+                                                        int32_t global_tid,
+                                                        kmp_CriticalName *crit);
+EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
+EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
+
+// sync barrier
+EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid);
+EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid);
+EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid);
+EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc, int32_t global_tid);
+
+// single
+EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid);
+EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid);
+
+// sync
+EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid);
+EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid);
+EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t global_tid);
+EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t global_tid);
+EXTERN void __kmpc_critical(kmp_Ident *loc, int32_t global_tid,
+                            kmp_CriticalName *crit);
+EXTERN void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid,
+                                kmp_CriticalName *crit);
+EXTERN void __kmpc_flush(kmp_Ident *loc);
+
+// vote
+EXTERN int32_t __kmpc_warp_active_thread_mask();
+
+// tasks
+EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(kmp_Ident *loc,
+                                            uint32_t global_tid, int32_t flag,
+                                            size_t sizeOfTaskInclPrivate,
+                                            size_t sizeOfSharedTable,
+                                            kmp_TaskFctPtr sub);
+EXTERN int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid,
+                               kmp_TaskDescr *newLegacyTaskDescr);
+EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid,
+                                         kmp_TaskDescr *newLegacyTaskDescr,
+                                         int32_t depNum, void *depList,
+                                         int32_t noAliasDepNum,
+                                         void *noAliasDepList);
+EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid,
+                                      kmp_TaskDescr *newLegacyTaskDescr);
+EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid,
+                                         kmp_TaskDescr *newLegacyTaskDescr);
+EXTERN void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid,
+                                 int32_t depNum, void *depList,
+                                 int32_t noAliasDepNum, void *noAliasDepList);
+EXTERN void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid);
+EXTERN void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid);
+EXTERN int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid,
+                                    int end_part);
+EXTERN int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid);
+EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid,
+                            kmp_TaskDescr *newKmpTaskDescr, int if_val,
+                            uint64_t *lb, uint64_t *ub, int64_t st, int nogroup,
+                            int32_t sched, uint64_t grainsize, void *task_dup);
+
+// cancel
+EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid,
+                                        int32_t cancelVal);
+EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid,
+                             int32_t cancelVal);
+
+// non standard
+EXTERN void __kmpc_kernel_init_params(void *ReductionScratchpadPtr);
+EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime);
+EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
+EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
+                                    int16_t RequiresDataSharing);
+EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit();
+EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
+EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
+                                           int16_t IsOMPRuntimeInitialized);
+EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
+                                   int16_t IsOMPRuntimeInitialized);
+EXTERN void __kmpc_kernel_end_parallel();
+EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
+                                              bool *IsFinal,
+                                              int32_t *LaneSource);
+EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer);
+EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
+                                          bool *IsFinal, int32_t *LaneSource,
+                                          int32_t *LaneId, int32_t *NumLanes);
+EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer);
+
+
+EXTERN void __kmpc_data_sharing_init_stack();
+EXTERN void __kmpc_data_sharing_init_stack_spmd();
+EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
+    int16_t UseSharedMemory);
+EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory);
+EXTERN void __kmpc_data_sharing_pop_stack(void *a);
+EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
+EXTERN void __kmpc_end_sharing_variables();
+EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs);
+
+// The slot used for data sharing by the master and worker threads. We use a
+// complete (default size) version and an incomplete one so that we allow sizes
+// greater than the default.
+struct __kmpc_data_sharing_slot {
+  __kmpc_data_sharing_slot *Next;
+  __kmpc_data_sharing_slot *Prev;
+  void *PrevSlotStackPtr;
+  void *DataEnd;
+  char Data[];
+};
+EXTERN void
+__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *RootS,
+                                           size_t InitialDataSize);
+EXTERN void *__kmpc_data_sharing_environment_begin(
+    __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
+    void **SavedSharedFrame, int32_t *SavedActiveThreads,
+    size_t SharingDataSize, size_t SharingDefaultDataSize,
+    int16_t IsOMPRuntimeInitialized);
+EXTERN void __kmpc_data_sharing_environment_end(
+    __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
+    void **SavedSharedFrame, int32_t *SavedActiveThreads, int32_t IsEntryPoint);
+
+EXTERN void *
+__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
+                                          int16_t IsOMPRuntimeInitialized);
+
+// SPMD execution mode interrogation function.
+EXTERN int8_t __kmpc_is_spmd_exec_mode();
+
+EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
+                                          const void *buf, size_t size,
+                                          int16_t is_shared, const void **res);
+
+EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
+                                              int16_t is_shared);
+
+#endif
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/libcall.cu b/final/libomptarget/deviceRTLs/nvptx/src/libcall.cu
new file mode 100644
index 0000000..9580d75
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/libcall.cu
@@ -0,0 +1,440 @@
+//===------------ libcall.cu - NVPTX OpenMP user calls ----------- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the OpenMP runtime functions that can be
+// invoked by the user in an OpenMP region
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget-nvptx.h"
+
+// Timer precision is 1ns
+#define TIMER_PRECISION ((double)1E-9)
+
+EXTERN double omp_get_wtick(void) {
+  PRINT(LD_IO, "omp_get_wtick() returns %g\n", TIMER_PRECISION);
+  return TIMER_PRECISION;
+}
+
+EXTERN double omp_get_wtime(void) {
+  unsigned long long nsecs; // volatile asm: each call must re-read the timer
+  asm volatile("mov.u64  %0, %%globaltimer;" : "=l"(nsecs));
+  double rc = (double)nsecs * TIMER_PRECISION;
+  PRINT(LD_IO, "call omp_get_wtime() returns %g\n", rc);
+  return rc;
+}
+
+EXTERN void omp_set_num_threads(int num) {
+  // SPMD mode ignores the request (and returns before it is logged).
+  if (isSPMDMode())
+    return;
+  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
+  PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num);
+  if (num <= 0) {
+    WARNING0(LW_INPUT, "expected positive num; ignore\n");
+  } else if (parallelLevel[GetWarpId()] == 0) {
+    nThreads = num; // stored only while this warp is at parallel level 0
+  }
+}
+
+EXTERN int omp_get_num_threads(void) {
+  int rc = GetNumberOfOmpThreads(isSPMDMode());
+  PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_get_max_threads(void) {
+  if (parallelLevel[GetWarpId()] > 0)
+    // Already inside a parallel region; this path returns without logging.
+    return 1; // default: 1 thread available
+  // Outside a parallel region: report nThreads (set by omp_set_num_threads).
+  int rc = 1;
+  if (parallelLevel[GetWarpId()] == 0)
+    rc = nThreads;
+  ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads");
+  PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_get_thread_limit(void) {
+  if (isSPMDMode())
+    return GetNumberOfThreadsInBlock();
+  int rc = threadLimit;
+  PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_get_thread_num() {
+  // Query the execution mode once and feed it to both id lookups.
+  bool spmd = isSPMDMode();
+  int rc = GetOmpThreadId(GetLogicalThreadIdInBlock(spmd), spmd);
+  PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_get_num_procs(void) {
+  int rc = GetNumberOfProcsInDevice(isSPMDMode());
+  PRINT(LD_IO, "call omp_get_num_procs() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_in_parallel(void) {
+  int rc = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0;
+  PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_in_final(void) {
+  // Treat all tasks as final. The spec may expect the runtime to track
+  // precisely whether a task was made final by the user; this is not
+  // explicitly specified, so we act as if the runtime may actively decide
+  // to turn a non-final task into a final one.
+  int rc = 1;
+  PRINT(LD_IO, "call omp_in_final() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN void omp_set_dynamic(int flag) {
+  PRINT(LD_IO, "call omp_set_dynamic(%d) is ignored (no support)\n", flag);
+}
+
+EXTERN int omp_get_dynamic(void) {
+  int rc = 0;
+  PRINT(LD_IO, "call omp_get_dynamic() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN void omp_set_nested(int flag) {
+  PRINT(LD_IO, "call omp_set_nested(%d) is ignored (no nested support)\n",
+        flag);
+}
+
+EXTERN int omp_get_nested(void) {
+  int rc = 0;
+  PRINT(LD_IO, "call omp_get_nested() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN void omp_set_max_active_levels(int level) {
+  PRINT(LD_IO,
+        "call omp_set_max_active_levels(%d) is ignored (no nested support)\n",
+        level);
+}
+
+EXTERN int omp_get_max_active_levels(void) {
+  int rc = 1;
+  PRINT(LD_IO, "call omp_get_max_active_levels() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_get_level(void) {
+  int level = parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1);
+  PRINT(LD_IO, "call omp_get_level() returns %d\n", level);
+  return level;
+}
+
+EXTERN int omp_get_active_level(void) { // 0 or 1: at most one active level
+  int level = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0;
+  PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level);
+  return level;
+}
+
+EXTERN int omp_get_ancestor_thread_num(int level) {
+  if (isSPMDMode())
+    return level == 1 ? GetThreadIdInBlock() : 0;
+  int rc = -1;
+  // rc stays -1 for an unrecognized level; inactive levels report 0 below.
+  unsigned parLevel = parallelLevel[GetWarpId()];
+  if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) {
+    int totLevel = omp_get_level();
+    if (level <= totLevel) {
+      omptarget_nvptx_TaskDescr *currTaskDescr =
+          getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false);
+      int steps = totLevel - level;
+      PRINT(LD_IO, "backtrack %d steps\n", steps);
+      ASSERT0(LT_FUSSY, currTaskDescr,
+              "do not expect fct to be called in a non-active thread");
+      do {
+        if (DON(LD_IOD)) {
+          // print current state
+          omp_sched_t sched = currTaskDescr->GetRuntimeSched();
+          PRINT(LD_ALL,
+                "task descr %s %d: %s, in par %d, rt sched %d,"
+                " chunk %" PRIu64 "; tid %d, tnum %d, nthreads %d\n",
+                "ancestor", steps,
+                (currTaskDescr->IsParallelConstruct() ? "par" : "task"),
+                (int)currTaskDescr->InParallelRegion(), (int)sched,
+                currTaskDescr->RuntimeChunkSize(),
+                (int)currTaskDescr->ThreadId(), (int)threadsInTeam,
+                (int)nThreads);
+        }
+
+        if (currTaskDescr->IsParallelConstruct()) {
+          // parallel construct: the requested level, or one step closer to it
+          if (!steps) {
+            rc = currTaskDescr->ThreadId();
+            break;
+          }
+          steps--;
+        }
+        currTaskDescr = currTaskDescr->GetPrevTaskDescr();
+      } while (currTaskDescr);
+      ASSERT0(LT_FUSSY, !steps, "expected to find all steps");
+    }
+  } else if (level == 0 ||
+             (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL &&
+              level <= parLevel) ||
+             (level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL &&
+              level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) {
+    rc = 0;
+  }
+  PRINT(LD_IO, "call omp_get_ancestor_thread_num(level %d) returns %d\n", level,
+        rc);
+  return rc;
+}
+
+EXTERN int omp_get_team_size(int level) {
+  if (isSPMDMode())
+    return level == 1 ? GetNumberOfThreadsInBlock() : 1;
+  int rc = -1;
+  unsigned parLevel = parallelLevel[GetWarpId()];
+  // rc stays -1 for an unrecognized level; inactive levels have team size 1.
+  if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) {
+    rc = threadsInTeam;
+  } else if (level == 0 ||
+             (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL &&
+              level <= parLevel) ||
+             (level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL &&
+              level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) {
+    rc = 1;
+  }
+  PRINT(LD_IO, "call omp_get_team_size(level %d) returns %d\n", level, rc);
+  return rc;
+}
+
+EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier) {
+  if (isRuntimeUninitialized()) { // runtime not set up: report fixed values
+    ASSERT0(LT_FUSSY, isSPMDMode(),
+            "Expected SPMD mode only with uninitialized runtime.");
+    *kind = omp_sched_static; // fixed answer: static schedule,
+    *modifier = 1;            // with modifier 1
+  } else {
+    omptarget_nvptx_TaskDescr *currTaskDescr =
+        getMyTopTaskDescriptor(isSPMDMode());
+    *kind = currTaskDescr->GetRuntimeSched();
+    *modifier = currTaskDescr->RuntimeChunkSize(); // stored into an int
+  }
+  PRINT(LD_IO, "call omp_get_schedule returns sched %d and modif %d\n",
+        (int)*kind, *modifier);
+}
+
+EXTERN void omp_set_schedule(omp_sched_t kind, int modifier) {
+  PRINT(LD_IO, "call omp_set_schedule(sched %d, modif %d)\n", (int)kind,
+        modifier);
+  if (isRuntimeUninitialized()) {
+    ASSERT0(LT_FUSSY, isSPMDMode(),
+            "Expected SPMD mode only with uninitialized runtime.");
+    return;
+  }
+  // Kinds outside [omp_sched_static, omp_sched_auto) are silently ignored.
+  if (kind < omp_sched_static || kind >= omp_sched_auto)
+    return;
+  omptarget_nvptx_TaskDescr *taskDescr =
+      getMyTopTaskDescriptor(isSPMDMode());
+  taskDescr->SetRuntimeSched(kind);
+  taskDescr->RuntimeChunkSize() = modifier;
+  PRINT(LD_IOD, "omp_set_schedule did set sched %d & modif %" PRIu64 "\n",
+        (int)taskDescr->GetRuntimeSched(), taskDescr->RuntimeChunkSize());
+}
+
+EXTERN omp_proc_bind_t omp_get_proc_bind(void) {
+  PRINT0(LD_IO, "call omp_get_proc_bind() is true, regardless on state\n");
+  return omp_proc_bind_true; // fixed answer; no runtime state is consulted
+}
+
+EXTERN int omp_get_num_places(void) {
+  PRINT0(LD_IO, "call omp_get_num_places() returns 0\n");
+  return 0;
+}
+
+EXTERN int omp_get_place_num_procs(int place_num) {
+  PRINT0(LD_IO, "call omp_get_place_num_procs() returns 0\n");
+  return 0;
+}
+
+EXTERN void omp_get_place_proc_ids(int place_num, int *ids) {
+  PRINT0(LD_IO, "call to omp_get_place_proc_ids()\n");
+}
+
+EXTERN int omp_get_place_num(void) {
+  PRINT0(LD_IO, "call to omp_get_place_num() returns 0\n");
+  return 0;
+}
+
+EXTERN int omp_get_partition_num_places(void) {
+  PRINT0(LD_IO, "call to omp_get_partition_num_places() returns 0\n");
+  return 0;
+}
+
+EXTERN void omp_get_partition_place_nums(int *place_nums) {
+  PRINT0(LD_IO, "call to omp_get_partition_place_nums()\n");
+}
+
+EXTERN int omp_get_cancellation(void) {
+  int rc = FALSE; // currently false only
+  PRINT(LD_IO, "call omp_get_cancellation() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN void omp_set_default_device(int deviceId) { // deviceId is ignored
+  PRINT0(LD_IO, "call omp_set_default_device() is undef on device\n");
+}
+
+EXTERN int omp_get_default_device(void) {
+  PRINT0(LD_IO,
+         "call omp_get_default_device() is undef on device, returns 0\n");
+  return 0;
+}
+
+EXTERN int omp_get_num_devices(void) {
+  PRINT0(LD_IO, "call omp_get_num_devices() is undef on device, returns 0\n");
+  return 0;
+}
+
+EXTERN int omp_get_num_teams(void) {
+  int rc = GetNumberOfOmpTeams();
+  PRINT(LD_IO, "call omp_get_num_teams() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_get_team_num() {
+  int rc = GetOmpTeamId();
+  PRINT(LD_IO, "call omp_get_team_num() returns %d\n", rc);
+  return rc;
+}
+
+EXTERN int omp_is_initial_device(void) {
+  PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n");
+  return 0; // 0 by def on device
+}
+
+// Unspecified on the device.
+EXTERN int omp_get_initial_device(void) {
+  PRINT0(LD_IO, "call omp_get_initial_device() returns 0\n");
+  return 0;
+}
+
+// Unused for now.
+EXTERN int omp_get_max_task_priority(void) {
+  PRINT0(LD_IO, "call omp_get_max_task_priority() returns 0\n");
+  return 0;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// locks
+////////////////////////////////////////////////////////////////////////////////
+
+#define __OMP_SPIN 1000
+#define UNSET 0
+#define SET 1
+
+EXTERN void omp_init_lock(omp_lock_t *lock) {
+  omp_unset_lock(lock);
+  PRINT0(LD_IO, "call omp_init_lock()\n");
+}
+
+EXTERN void omp_destroy_lock(omp_lock_t *lock) {
+  omp_unset_lock(lock);
+  PRINT0(LD_IO, "call omp_destroy_lock()\n");
+}
+
+EXTERN void omp_set_lock(omp_lock_t *lock) {
+  // Acquire by retrying atomicCAS(lock, UNSET, SET) until it returns UNSET,
+  // i.e. until this thread is the one that flipped the lock to SET.
+
+  // TODO: not sure spinning is a good idea here..
+  while (atomicCAS(lock, UNSET, SET) != UNSET) {
+    clock_t start = clock();
+    clock_t now;
+    for (;;) {
+      now = clock();
+      clock_t cycles = now > start ? now - start : now + (0xffffffff - start);
+      if (cycles >= __OMP_SPIN * blockIdx.x) { // back-off scales with block id
+        break;
+      }
+    }
+  } // retry the CAS once the back-off delay has elapsed
+
+  PRINT0(LD_IO, "call omp_set_lock()\n");
+}
+
+EXTERN void omp_unset_lock(omp_lock_t *lock) {
+  (void)atomicExch(lock, UNSET);
+
+  PRINT0(LD_IO, "call omp_unset_lock()\n");
+}
+
+EXTERN int omp_test_lock(omp_lock_t *lock) {
+  // Try once to move the lock UNSET -> SET; atomicCAS returns the old value,
+  // so the lock was acquired by this call iff that old value was UNSET.
+  int ret = atomicCAS(lock, UNSET, SET) == UNSET;
+
+  PRINT(LD_IO, "call omp_test_lock() return %d\n", ret);
+
+  return ret;
+}
+
+// for xlf Fortran
+// In Fortran, the return is LOGICAL type
+
+#define FLOGICAL long
+EXTERN FLOGICAL __xlf_omp_is_initial_device_i8() {
+  // Normalize the C result to a Fortran LOGICAL: zero stays 0, anything
+  // else becomes 1.
+  if (omp_is_initial_device() != 0)
+    return (FLOGICAL)1;
+  return (FLOGICAL)0;
+}
+
+EXTERN int __xlf_omp_is_initial_device_i4() {
+  // Collapse any nonzero answer from omp_is_initial_device() to exactly 1;
+  // zero is passed through unchanged.
+  if (omp_is_initial_device() != 0)
+    return 1;
+  return 0;
+}
+
+EXTERN long __xlf_omp_get_team_num_i4() {
+  // Widen the int team number to long for the xlf caller.
+  return (long)omp_get_team_num();
+}
+
+EXTERN long __xlf_omp_get_num_teams_i4() {
+  // Widen the int team count to long for the xlf caller.
+  return (long)omp_get_num_teams();
+}
+
+EXTERN void xlf_debug_print_int(int *p) {
+  printf("xlf DEBUG %d): %p %d\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
+}
+
+EXTERN void xlf_debug_print_long(long *p) {
+  printf("xlf DEBUG %d): %p %ld\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
+}
+
+EXTERN void xlf_debug_print_float(float *p) {
+  printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
+}
+
+EXTERN void xlf_debug_print_double(double *p) {
+  printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
+}
+
+EXTERN void xlf_debug_print_addr(void *p) {
+  printf("xlf DEBUG %d): %p \n", omp_get_team_num(), p);
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/loop.cu b/final/libomptarget/deviceRTLs/nvptx/src/loop.cu
new file mode 100644
index 0000000..c255137
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -0,0 +1,807 @@
+//===------------ loop.cu - NVPTX OpenMP loop constructs --------- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the KMPC interface
+// for the loop construct plus other worksharing constructs that use the same
+// interface as loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget-nvptx.h"
+
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+// template class that encapsulates all the helper functions
+//
+// T is loop iteration type (32 | 64)  (unsigned | signed)
+// ST is the signed version of T
+////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+template <typename T, typename ST> class omptarget_nvptx_LoopSupport {
+public:
+  ////////////////////////////////////////////////////////////////////////////////
+  // Loop with static scheduling with chunk
+
+  // Generic implementation of OMP loop scheduling with static policy
+  /*! \brief Calculate initial bounds for static loop and stride
+   *  @param[in] loc location in code of the call (not used here)
+   *  @param[in] global_tid global thread id
+   *  @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
+   *  @param[out] plastiter pointer to the last-iteration flag
+   *  @param[in,out] plower pointer to loop lower bound. It will contain value
+   *  of lower bound of first chunk
+   *  @param[in,out] pupper pointer to loop upper bound. It will contain value
+   *  of upper bound of first chunk
+   *  @param[in,out] pstride pointer to loop stride. It will contain value of
+   *  stride between two successive chunks executed by the same thread
+   *  @param[in] incr loop increment bump
+   *  @param[in] chunk chunk size
+   */
+
+  // helper function for static chunk
+  // Static schedule with an explicit chunk size. On entry lb/ub are the
+  // inclusive loop bounds; on exit they bound the first chunk owned by
+  // `entityId`, `stride` is the distance to that entity's next chunk and
+  // `last` tells whether the entity owns the final chunk of the loop.
+  INLINE static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride,
+                                    ST chunk, T entityId, T numberOfEntities) {
+    // Every entity runs a series of equally sized chunks; only the very last
+    // chunk of the loop may be shorter.
+    const T loopUb = ub; // inclusive upper bound of the whole loop
+
+    // Distance between two successive chunks of the same entity.
+    stride = numberOfEntities * chunk;
+    // First chunk of this entity (Clang uses i <= ub, so ub is inclusive).
+    lb = lb + entityId * chunk;
+    ub = lb + chunk - 1;
+
+    // The final chunk starts at loopUb rounded down to a multiple of chunk.
+    // Whoever reaches that start from its own lb in whole strides owns it.
+    T lastChunkStart = loopUb - (loopUb % chunk);
+    last = ((lastChunkStart - lb) % stride) == 0;
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Loop with static scheduling without chunk
+
+  // helper function for static no chunk
+  // Static schedule without a chunk size: the iteration space is split into
+  // one nearly equal piece per entity. On exit lb/ub bound this entity's
+  // piece, `chunk` holds its length, `stride` is the full trip count and
+  // `last` tells whether the piece contains the loop's upper bound.
+  INLINE static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride,
+                                      ST &chunk, T entityId,
+                                      T numberOfEntities) {
+    const T tripCount = ub - lb + 1;
+    const T loopUb = ub;
+
+    // Base piece size, plus the iterations that do not divide evenly.
+    chunk = tripCount / numberOfEntities;
+    const T remainder = tripCount - chunk * numberOfEntities;
+
+    // The first `remainder` entities take one extra iteration each; the
+    // others start after all of those enlarged pieces.
+    const bool takesExtra = entityId < remainder;
+    if (takesExtra)
+      chunk++;
+    lb = lb + entityId * chunk;
+    if (!takesExtra)
+      lb = lb + remainder;
+
+    ub = lb + chunk - 1; // Clang uses i <= ub
+    last = (lb <= loopUb) && (loopUb <= ub);
+    stride = tripCount; // make sure we only do 1 chunk per warp
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Support for Static Init
+
+  // Computes the caller's first chunk of a statically scheduled loop and
+  // writes the result back through the pointer arguments.
+  //
+  //  gtid        id of the calling OpenMP thread
+  //  schedtype   schedule kind; modifier bits are stripped before dispatching
+  //  plastiter   [out] receives the "owns the last iteration" flag
+  //  plower      [in,out] loop lower bound in, first chunk lower bound out
+  //  pupper      [in,out] loop upper bound in, first chunk upper bound out
+  //  pstride     [in,out] receives the distance between the caller's chunks
+  //  chunk       requested chunk size; for the chunked schedules a value <= 0
+  //              falls through to the matching no-chunk schedule
+  //  IsSPMDExecutionMode  forwarded to GetNumberOfOmpThreads
+  INLINE static void for_static_init(int32_t gtid, int32_t schedtype,
+                                     int32_t *plastiter, T *plower, T *pupper,
+                                     ST *pstride, ST chunk,
+                                     bool IsSPMDExecutionMode) {
+    // The number of participating threads is derived from the execution mode
+    // supplied by the caller.
+
+    // Assume we are in teams region or that we use a single block
+    // per target region
+    ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(IsSPMDExecutionMode);
+
+    // All warps that are in excess of the maximum requested, do
+    // not execute the loop
+    PRINT(LD_LOOP,
+          "OMP Thread %d: schedule type %d, chunk size = %lld, mytid "
+          "%d, num tids %d\n",
+          (int)gtid, (int)schedtype, (long long)chunk, (int)gtid,
+          (int)numberOfActiveOMPThreads);
+    ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
+            "current thread is not needed here; error");
+
+    // copy
+    int lastiter = 0;
+    T lb = *plower;
+    T ub = *pupper;
+    ST stride = *pstride;
+    // init
+    // NOTE: several cases below intentionally fall through to the next case
+    // when chunk <= 0; the case order is therefore significant.
+    switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
+    case kmp_sched_static_chunk: {
+      if (chunk > 0) {
+        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
+                       numberOfActiveOMPThreads);
+        break;
+      }
+    } // note: if chunk <=0, use nochunk
+    case kmp_sched_static_balanced_chunk: {
+      if (chunk > 0) {
+        // round up to make sure the chunk is enough to cover all iterations
+        T tripCount = ub - lb + 1; // +1 because ub is inclusive
+        T span = (tripCount + numberOfActiveOMPThreads - 1) /
+                 numberOfActiveOMPThreads;
+        // perform chunk adjustment
+        // NOTE(review): the mask arithmetic rounds span up to a multiple of
+        // chunk only when chunk is a power of two - confirm callers ensure it.
+        chunk = (span + chunk - 1) & ~(chunk - 1);
+
+        ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
+        T oldUb = ub;
+        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
+                       numberOfActiveOMPThreads);
+        // Clip the chunk to the loop's upper bound.
+        if (ub > oldUb)
+          ub = oldUb;
+        break;
+      }
+    } // note: if chunk <=0, use nochunk
+    case kmp_sched_static_nochunk: {
+      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
+                       numberOfActiveOMPThreads);
+      break;
+    }
+    case kmp_sched_distr_static_chunk: {
+      // Distribute schedules split the loop across teams instead of threads.
+      if (chunk > 0) {
+        ForStaticChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(),
+                       GetNumberOfOmpTeams());
+        break;
+      } // note: if chunk <=0, use nochunk
+    }
+    case kmp_sched_distr_static_nochunk: {
+      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(),
+                       GetNumberOfOmpTeams());
+      break;
+    }
+    case kmp_sched_distr_static_chunk_sched_static_chunkone: {
+      // Chunks are dealt out over all threads of all teams, using a combined
+      // (team, thread) index as the entity id.
+      ForStaticChunk(lastiter, lb, ub, stride, chunk,
+                     numberOfActiveOMPThreads * GetOmpTeamId() + gtid,
+                     GetNumberOfOmpTeams() * numberOfActiveOMPThreads);
+      break;
+    }
+    default: {
+      ASSERT(LT_FUSSY, FALSE, "unknown schedtype %d", (int)schedtype);
+      PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n",
+            (int)schedtype);
+      ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
+                     numberOfActiveOMPThreads);
+      break;
+    }
+    }
+    // copy back
+    *plastiter = lastiter;
+    *plower = lb;
+    *pupper = ub;
+    *pstride = stride;
+    PRINT(LD_LOOP,
+          "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last "
+          "%d\n",
+          (int)numberOfActiveOMPThreads, (int)GetNumberOfWorkersInTeam(),
+          (long long)(*plower), (long long)(*pupper), (long long)(*pstride),
+          (int)lastiter);
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Support for dispatch Init
+
+  // Returns non-zero when `schedule` lies in the inclusive range
+  // [kmp_sched_ordered_first, kmp_sched_ordered_last].
+  INLINE static int OrderedSchedule(kmp_sched_t schedule) {
+    if (schedule < kmp_sched_ordered_first)
+      return 0;
+    return schedule <= kmp_sched_ordered_last;
+  }
+
+  // Sets up the per-thread dispatch state for a loop over the inclusive range
+  // [lb, ub] that will be consumed through dispatch_next.
+  //
+  //  loc       source location descriptor; queried for the runtime and SPMD
+  //            mode flags
+  //  threadId  OpenMP thread id of the caller
+  //  schedule  requested schedule (modifier bits are stripped)
+  //  lb, ub    inclusive loop bounds
+  //  st        loop stride (not referenced by this implementation)
+  //  chunk     requested chunk size
+  //
+  // The selected schedule, chunk, loop upper bound, next lower bound and
+  // stride are stored in omptarget_nvptx_threadPrivateContext. Nothing is
+  // stored when the runtime is uninitialized.
+  INLINE static void dispatch_init(kmp_Ident *loc, int32_t threadId,
+                                   kmp_sched_t schedule, T lb, T ub, ST st,
+                                   ST chunk) {
+    if (checkRuntimeUninitialized(loc)) {
+      // In SPMD mode no need to check parallelism level - dynamic scheduling
+      // may appear only in L2 parallel regions with lightweight runtime.
+      // The condition requires SPMD mode, so the message must say so.
+      ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected SPMD mode.");
+      return;
+    }
+    int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
+    omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
+    T tnum = GetNumberOfOmpThreads(checkSPMDMode(loc));
+    T tripCount = ub - lb + 1; // +1 because ub is inclusive
+    ASSERT0(LT_FUSSY, threadId < tnum,
+            "current thread is not needed here; error");
+
+    /* Currently just ignore the monotonic and non-monotonic modifiers
+     * (the compiler isn't producing them yet anyway).
+     * When it is we'll want to look at them somewhere here and use that
+     * information to add to our schedule choice. We shouldn't need to pass
+     * them on, they merely affect which schedule we can legally choose for
+     * various dynamic cases. (In particular, whether or not a stealing scheme
+     * is legal).
+     */
+    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
+
+    // Process schedule.
+    if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
+      if (OrderedSchedule(schedule))
+        __kmpc_barrier(loc, threadId);
+      PRINT(LD_LOOP,
+            "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n",
+            (long)tnum, (long long)tripCount, (int)schedule);
+      schedule = kmp_sched_static_chunk;
+      chunk = tripCount; // one thread gets the whole loop
+      // An empty loop (trip count < 1) must not produce a zero or negative
+      // chunk: ForStaticChunk evaluates `ub % chunk` and strides by
+      // tnum * chunk. With a unit chunk every thread's first lower bound is
+      // already past ub, so dispatch_next reports the loop as finished.
+      if (tripCount < 1)
+        chunk = 1;
+    } else if (schedule == kmp_sched_runtime) {
+      // process runtime
+      omp_sched_t rtSched = currTaskDescr->GetRuntimeSched();
+      chunk = currTaskDescr->RuntimeChunkSize();
+      switch (rtSched) {
+      case omp_sched_static: {
+        if (chunk > 0)
+          schedule = kmp_sched_static_chunk;
+        else
+          schedule = kmp_sched_static_nochunk;
+        break;
+      }
+      case omp_sched_auto: {
+        schedule = kmp_sched_static_chunk;
+        chunk = 1;
+        break;
+      }
+      case omp_sched_dynamic:
+      case omp_sched_guided: {
+        // Guided is served by the plain dynamic implementation.
+        schedule = kmp_sched_dynamic;
+        break;
+      }
+      }
+      PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", (int)schedule,
+            (long long)chunk);
+    } else if (schedule == kmp_sched_auto) {
+      schedule = kmp_sched_static_chunk;
+      chunk = 1;
+      PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", (int)schedule,
+            (long long)chunk);
+    } else {
+      PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", (int)schedule,
+            (long long)chunk);
+      ASSERT(LT_FUSSY,
+             schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
+             "unknown schedule %d & chunk %lld\n", (int)schedule,
+             (long long)chunk);
+    }
+
+    // init schedules
+    if (schedule == kmp_sched_static_chunk) {
+      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
+      // save sched state
+      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+      // save ub
+      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
+      // compute static chunk
+      ST stride;
+      int lastiter = 0;
+      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
+      // save computed params
+      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
+      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
+      omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
+      PRINT(LD_LOOP,
+            "dispatch init (static chunk) : num threads = %d, ub =  %" PRId64
+            ", next lower bound = %llu, stride = %llu\n",
+            (int)tnum,
+            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
+            (unsigned long long)
+                omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
+            (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
+                tid));
+    } else if (schedule == kmp_sched_static_balanced_chunk) {
+      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
+      // save sched state
+      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+      // save ub
+      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
+      // compute static chunk
+      ST stride;
+      int lastiter = 0;
+      // round up to make sure the chunk is enough to cover all iterations
+      T span = (tripCount + tnum - 1) / tnum;
+      // perform chunk adjustment
+      chunk = (span + chunk - 1) & ~(chunk - 1);
+
+      T oldUb = ub;
+      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
+      ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
+      if (ub > oldUb)
+        ub = oldUb;
+      // save computed params
+      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
+      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
+      omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
+      PRINT(LD_LOOP,
+            "dispatch init (static balanced chunk) : num threads = %d, "
+            "ub =  %" PRId64 ", next lower bound = %llu, stride = %llu\n",
+            (int)tnum,
+            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
+            (unsigned long long)
+                omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
+            (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
+                tid));
+    } else if (schedule == kmp_sched_static_nochunk) {
+      ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
+      // save sched state
+      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+      // save ub
+      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
+      // compute static chunk
+      ST stride;
+      int lastiter = 0;
+      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
+      // save computed params
+      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
+      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
+      omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
+      PRINT(LD_LOOP,
+            "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64
+            ", next lower bound = %llu, stride = %llu\n",
+            (int)tnum,
+            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
+            (unsigned long long)
+                omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
+            (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
+                tid));
+    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
+      // save data
+      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
+      if (chunk < 1)
+        chunk = 1;
+      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
+      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
+      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
+      // The shared chunk counter is reset by thread 0 between two barriers, so
+      // the reset happens after every thread reached this point and before
+      // any thread proceeds past it.
+      __kmpc_barrier(loc, threadId);
+      if (tid == 0) {
+        omptarget_nvptx_threadPrivateContext->Cnt() = 0;
+        __threadfence_block();
+      }
+      __kmpc_barrier(loc, threadId);
+      PRINT(LD_LOOP,
+            "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
+            ", chunk %" PRIu64 "\n",
+            (int)tnum,
+            (unsigned long long)
+                omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
+            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
+            omptarget_nvptx_threadPrivateContext->Chunk(tid));
+    }
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Support for dispatch next
+
+  // Returns the 64-bit `val` held by lane `leader`. The value is split into
+  // two 32-bit halves, each half is exchanged with __SHFL_SYNC over the lanes
+  // in `active`, and the halves are packed back together.
+  // NOTE(review): __SHFL_SYNC is defined elsewhere; presumably it wraps the
+  // CUDA warp shuffle intrinsic - confirm.
+  INLINE static int64_t Shuffle(unsigned active, int64_t val, int leader) {
+    int lo, hi;
+    // Unpack val into its low and high 32-bit words.
+    asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
+    hi = __SHFL_SYNC(active, hi, leader);
+    lo = __SHFL_SYNC(active, lo, leader);
+    // Re-pack the two received words into a 64-bit value.
+    asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
+    return val;
+  }
+
+  // Reserves one value of the shared counter Cnt() for every currently active
+  // lane with a single atomic per warp, and returns the value reserved for
+  // the calling lane.
+  INLINE static uint64_t NextIter() {
+    unsigned int active = __ACTIVEMASK();
+    // Lowest-numbered active lane performs the atomic update.
+    int leader = __ffs(active) - 1;
+    // Number of active lanes = number of counter values to reserve.
+    int change = __popc(active);
+    unsigned lane_mask_lt;
+    asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt));
+    // Position of this lane among the active lanes (0 for the leader).
+    unsigned int rank = __popc(active & lane_mask_lt);
+    // Initialized so that lanes other than the leader do not pass an
+    // indeterminate value into Shuffle; the shuffle overwrites it with the
+    // leader's result.
+    uint64_t warp_res = 0;
+    if (rank == 0) {
+      warp_res = atomicAdd(
+          (unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(),
+          change);
+    }
+    warp_res = Shuffle(active, warp_res, leader);
+    return warp_res + rank;
+  }
+
+  // Claims the next dynamically scheduled chunk and stores its inclusive
+  // bounds in lb/ub. Returns NOT_FINISHED for a full chunk inside the loop,
+  // LAST_CHUNK when the chunk was clipped to (or ends at) loopUpperBound, and
+  // FINISHED when the chunk starts beyond the loop, in which case lb/ub
+  // describe an empty range.
+  INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize,
+                                     T loopLowerBound, T loopUpperBound) {
+    // Index of the chunk claimed by this thread.
+    T chunkIdx = NextIter();
+    lb = loopLowerBound + chunkIdx * chunkSize;
+    ub = lb + chunkSize - 1; // Clang uses i <= ub
+
+    // Chunk starts past the end of the loop: nothing left to execute.
+    if (lb > loopUpperBound) {
+      lb = loopUpperBound + 2;
+      ub = loopUpperBound + 1;
+      PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n",
+            (long long)lb, (long long)ub, (long long)loopUpperBound);
+      return FINISHED;
+    }
+
+    // Chunk starts inside the loop but reaches its end: this is the last
+    // chunk, clipped to the loop's upper bound.
+    if (ub >= loopUpperBound) {
+      PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n",
+            (long long)lb, (long long)ub, (long long)loopUpperBound);
+      ub = loopUpperBound;
+      return LAST_CHUNK;
+    }
+
+    // Chunk lies strictly inside the loop.
+    PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n",
+          (long long)lb, (long long)ub, (long long)loopUpperBound);
+    return NOT_FINISHED;
+  }
+
+  // Hands the calling thread its next chunk of a loop that was set up by
+  // dispatch_init.
+  //
+  //  loc      source location descriptor; queried for runtime and SPMD flags
+  //  gtid     OpenMP thread id of the caller
+  //  plast    [out] set to non-zero when the returned chunk is the last one
+  //  plower   [out] inclusive lower bound of the chunk
+  //  pupper   [out] inclusive upper bound of the chunk
+  //  pstride  [out] written (as 1) only on the dynamic path
+  //
+  // Returns DISPATCH_NOTFINISHED when a chunk was produced and
+  // DISPATCH_FINISHED when no iterations remain.
+  INLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid, int32_t *plast,
+                                  T *plower, T *pupper, ST *pstride) {
+    if (checkRuntimeUninitialized(loc)) {
+      // In SPMD mode no need to check parallelism level - dynamic scheduling
+      // may appear only in L2 parallel regions with lightweight runtime.
+      // The condition requires SPMD mode, so the message must say so.
+      ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected SPMD mode.");
+      // Without runtime state the caller's own bounds are left untouched:
+      // the first call reports "not finished", the second one "finished".
+      if (*plast)
+        return DISPATCH_FINISHED;
+      *plast = 1;
+      return DISPATCH_NOTFINISHED;
+    }
+    // ID of a thread in its own warp
+
+    // automatically selects thread or warp ID based on selected implementation
+    int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
+    ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(checkSPMDMode(loc)),
+            "current thread is not needed here; error");
+    // retrieve schedule
+    kmp_sched_t schedule =
+        omptarget_nvptx_threadPrivateContext->ScheduleType(tid);
+
+    // xxx reduce to one
+    // All static flavours stored by dispatch_init share the same bookkeeping
+    // (next lower bound, chunk, stride). The balanced-chunk schedule is
+    // included so that it is not mistaken for a dynamic schedule below.
+    if (schedule == kmp_sched_static_chunk ||
+        schedule == kmp_sched_static_balanced_chunk ||
+        schedule == kmp_sched_static_nochunk) {
+      T myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(tid);
+      T ub = omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid);
+      // finished?
+      if (myLb > ub) {
+        PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n",
+              (long long)myLb, (long long)ub);
+        return DISPATCH_FINISHED;
+      }
+      // not finished, save current bounds
+      ST chunk = omptarget_nvptx_threadPrivateContext->Chunk(tid);
+      *plower = myLb;
+      T myUb = myLb + chunk - 1; // Clang uses i <= ub
+      if (myUb > ub)
+        myUb = ub;
+      *pupper = myUb;
+      *plast = (int32_t)(myUb == ub);
+
+      // increment next lower bound by the stride
+      ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid);
+      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride;
+      PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n",
+            (long long)*plower, (long long)*pupper);
+      return DISPATCH_NOTFINISHED;
+    }
+    ASSERT0(LT_FUSSY,
+            schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
+            "bad sched");
+    T myLb, myUb;
+    int finished = DynamicNextChunk(
+        myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(tid),
+        omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
+        omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));
+
+    if (finished == FINISHED)
+      return DISPATCH_FINISHED;
+
+    // not finished (either not finished or last chunk)
+    *plast = (int32_t)(finished == LAST_CHUNK);
+    *plower = myLb;
+    *pupper = myUb;
+    *pstride = 1;
+
+    // Use the same mode query as the rest of this function.
+    PRINT(LD_LOOP,
+          "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
+          "last %d\n",
+          (int)GetNumberOfOmpThreads(checkSPMDMode(loc)),
+          (int)GetNumberOfWorkersInTeam(), (long long)*plower,
+          (long long)*pupper, (long long)*pstride, (int)*plast);
+    return DISPATCH_NOTFINISHED;
+  }
+
+  // End-of-chunk hook of the dispatch interface; this implementation has no
+  // work to do here.
+  INLINE static void dispatch_fini() {
+    // nothing
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // end of template class that encapsulates all the helper functions
+  ////////////////////////////////////////////////////////////////////////////////
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP interface implementation (dyn loops)
+////////////////////////////////////////////////////////////////////////////////
+
+// init
+// Signed 32-bit entry point: forwards to LoopSupport<int32_t, int32_t>.
+EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t tid,
+                                   int32_t schedule, int32_t lb, int32_t ub,
+                                   int32_t st, int32_t chunk) {
+  typedef omptarget_nvptx_LoopSupport<int32_t, int32_t> LoopSupport;
+  PRINT0(LD_IO, "call kmpc_dispatch_init_4\n");
+  const kmp_sched_t sched = (kmp_sched_t)schedule;
+  LoopSupport::dispatch_init(loc, tid, sched, lb, ub, st, chunk);
+}
+
+// Unsigned 32-bit entry point: forwards to LoopSupport<uint32_t, int32_t>.
+EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t tid,
+                                    int32_t schedule, uint32_t lb, uint32_t ub,
+                                    int32_t st, int32_t chunk) {
+  typedef omptarget_nvptx_LoopSupport<uint32_t, int32_t> LoopSupport;
+  PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n");
+  const kmp_sched_t sched = (kmp_sched_t)schedule;
+  LoopSupport::dispatch_init(loc, tid, sched, lb, ub, st, chunk);
+}
+
+// Signed 64-bit entry point: forwards to LoopSupport<int64_t, int64_t>.
+EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t tid,
+                                   int32_t schedule, int64_t lb, int64_t ub,
+                                   int64_t st, int64_t chunk) {
+  typedef omptarget_nvptx_LoopSupport<int64_t, int64_t> LoopSupport;
+  PRINT0(LD_IO, "call kmpc_dispatch_init_8\n");
+  const kmp_sched_t sched = (kmp_sched_t)schedule;
+  LoopSupport::dispatch_init(loc, tid, sched, lb, ub, st, chunk);
+}
+
+// Unsigned 64-bit entry point: forwards to LoopSupport<uint64_t, int64_t>.
+EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t tid,
+                                    int32_t schedule, uint64_t lb, uint64_t ub,
+                                    int64_t st, int64_t chunk) {
+  typedef omptarget_nvptx_LoopSupport<uint64_t, int64_t> LoopSupport;
+  PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n");
+  const kmp_sched_t sched = (kmp_sched_t)schedule;
+  LoopSupport::dispatch_init(loc, tid, sched, lb, ub, st, chunk);
+}
+
+// next
+// Signed 32-bit entry point: forwards to LoopSupport<int32_t, int32_t> and
+// returns its status unchanged.
+EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t tid, int32_t *p_last,
+                                  int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
+  typedef omptarget_nvptx_LoopSupport<int32_t, int32_t> LoopSupport;
+  PRINT0(LD_IO, "call kmpc_dispatch_next_4\n");
+  const int status =
+      LoopSupport::dispatch_next(loc, tid, p_last, p_lb, p_ub, p_st);
+  return status;
+}
+
+// Unsigned 32-bit entry point: forwards to LoopSupport<uint32_t, int32_t> and
+// returns its status unchanged.
+EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid,
+                                   int32_t *p_last, uint32_t *p_lb,
+                                   uint32_t *p_ub, int32_t *p_st) {
+  typedef omptarget_nvptx_LoopSupport<uint32_t, int32_t> LoopSupport;
+  PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n");
+  const int status =
+      LoopSupport::dispatch_next(loc, tid, p_last, p_lb, p_ub, p_st);
+  return status;
+}
+
+// Signed 64-bit entry point: forwards to LoopSupport<int64_t, int64_t> and
+// returns its status unchanged.
+EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last,
+                                  int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
+  typedef omptarget_nvptx_LoopSupport<int64_t, int64_t> LoopSupport;
+  PRINT0(LD_IO, "call kmpc_dispatch_next_8\n");
+  const int status =
+      LoopSupport::dispatch_next(loc, tid, p_last, p_lb, p_ub, p_st);
+  return status;
+}
+
+// Unsigned 64-bit entry point: forwards to LoopSupport<uint64_t, int64_t> and
+// returns its status unchanged.
+EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid,
+                                   int32_t *p_last, uint64_t *p_lb,
+                                   uint64_t *p_ub, int64_t *p_st) {
+  typedef omptarget_nvptx_LoopSupport<uint64_t, int64_t> LoopSupport;
+  PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n");
+  const int status =
+      LoopSupport::dispatch_next(loc, tid, p_last, p_lb, p_ub, p_st);
+  return status;
+}
+
+// fini
+// Signed 32-bit entry point: traces the call and forwards to dispatch_fini.
+EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t tid) {
+  typedef omptarget_nvptx_LoopSupport<int32_t, int32_t> LoopSupport;
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n");
+  LoopSupport::dispatch_fini();
+}
+
+// Unsigned 32-bit entry point: traces the call and forwards to dispatch_fini.
+EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t tid) {
+  typedef omptarget_nvptx_LoopSupport<uint32_t, int32_t> LoopSupport;
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n");
+  LoopSupport::dispatch_fini();
+}
+
+// Signed 64-bit entry point: traces the call and forwards to dispatch_fini.
+EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t tid) {
+  typedef omptarget_nvptx_LoopSupport<int64_t, int64_t> LoopSupport;
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n");
+  LoopSupport::dispatch_fini();
+}
+
+// Unsigned 64-bit entry point: traces the call and forwards to dispatch_fini.
+EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t tid) {
+  typedef omptarget_nvptx_LoopSupport<uint64_t, int64_t> LoopSupport;
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n");
+  LoopSupport::dispatch_fini();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP interface implementation (static loops)
+////////////////////////////////////////////////////////////////////////////////
+
+// Signed 32-bit static-init entry point. The execution mode is taken from
+// `loc`; `incr` is not forwarded.
+EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid,
+                                     int32_t schedtype, int32_t *plastiter,
+                                     int32_t *plower, int32_t *pupper,
+                                     int32_t *pstride, int32_t incr,
+                                     int32_t chunk) {
+  typedef omptarget_nvptx_LoopSupport<int32_t, int32_t> LoopSupport;
+  PRINT0(LD_IO, "call kmpc_for_static_init_4\n");
+  const bool isSPMD = checkSPMDMode(loc);
+  LoopSupport::for_static_init(global_tid, schedtype, plastiter, plower,
+                               pupper, pstride, chunk, isSPMD);
+}
+
+// Unsigned 32-bit static-init entry point. The execution mode is taken from
+// `loc`; `incr` is not forwarded.
+EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid,
+                                      int32_t schedtype, int32_t *plastiter,
+                                      uint32_t *plower, uint32_t *pupper,
+                                      int32_t *pstride, int32_t incr,
+                                      int32_t chunk) {
+  typedef omptarget_nvptx_LoopSupport<uint32_t, int32_t> LoopSupport;
+  PRINT0(LD_IO, "call kmpc_for_static_init_4u\n");
+  const bool isSPMD = checkSPMDMode(loc);
+  LoopSupport::for_static_init(global_tid, schedtype, plastiter, plower,
+                               pupper, pstride, chunk, isSPMD);
+}
+
+// Signed 64-bit static-init entry point. The execution mode is taken from
+// `loc`; `incr` is not forwarded.
+EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid,
+                                     int32_t schedtype, int32_t *plastiter,
+                                     int64_t *plower, int64_t *pupper,
+                                     int64_t *pstride, int64_t incr,
+                                     int64_t chunk) {
+  typedef omptarget_nvptx_LoopSupport<int64_t, int64_t> LoopSupport;
+  PRINT0(LD_IO, "call kmpc_for_static_init_8\n");
+  const bool isSPMD = checkSPMDMode(loc);
+  LoopSupport::for_static_init(global_tid, schedtype, plastiter, plower,
+                               pupper, pstride, chunk, isSPMD);
+}
+
+// Unsigned 64-bit static-init entry point. The execution mode is taken from
+// `loc`; `incr` is not forwarded.
+EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid,
+                                      int32_t schedtype, int32_t *plastiter,
+                                      uint64_t *plower, uint64_t *pupper,
+                                      int64_t *pstride, int64_t incr,
+                                      int64_t chunk) {
+  typedef omptarget_nvptx_LoopSupport<uint64_t, int64_t> LoopSupport;
+  PRINT0(LD_IO, "call kmpc_for_static_init_8u\n");
+  const bool isSPMD = checkSPMDMode(loc);
+  LoopSupport::for_static_init(global_tid, schedtype, plastiter, plower,
+                               pupper, pstride, chunk, isSPMD);
+}
+
+// Signed 32-bit static-init entry point with the SPMD execution mode fixed to
+// true; `loc` is not consulted and `incr` is not forwarded.
+EXTERN
+void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid,
+                                          int32_t schedtype, int32_t *plastiter,
+                                          int32_t *plower, int32_t *pupper,
+                                          int32_t *pstride, int32_t incr,
+                                          int32_t chunk) {
+  typedef omptarget_nvptx_LoopSupport<int32_t, int32_t> LoopSupport;
+  PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n");
+  LoopSupport::for_static_init(global_tid, schedtype, plastiter, plower,
+                               pupper, pstride, chunk,
+                               /*IsSPMDExecutionMode=*/true);
+}
+
+// Unsigned 32-bit static-init entry point with the SPMD execution mode fixed
+// to true; `loc` is not consulted and `incr` is not forwarded.
+EXTERN
+void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
+                                           int32_t schedtype,
+                                           int32_t *plastiter, uint32_t *plower,
+                                           uint32_t *pupper, int32_t *pstride,
+                                           int32_t incr, int32_t chunk) {
+  typedef omptarget_nvptx_LoopSupport<uint32_t, int32_t> LoopSupport;
+  PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n");
+  LoopSupport::for_static_init(global_tid, schedtype, plastiter, plower,
+                               pupper, pstride, chunk,
+                               /*IsSPMDExecutionMode=*/true);
+}
+
+// Signed 64-bit static-init entry point with the SPMD execution mode fixed to
+// true; `loc` is not consulted and `incr` is not forwarded.
+EXTERN
+void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid,
+                                          int32_t schedtype, int32_t *plastiter,
+                                          int64_t *plower, int64_t *pupper,
+                                          int64_t *pstride, int64_t incr,
+                                          int64_t chunk) {
+  typedef omptarget_nvptx_LoopSupport<int64_t, int64_t> LoopSupport;
+  PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n");
+  LoopSupport::for_static_init(global_tid, schedtype, plastiter, plower,
+                               pupper, pstride, chunk,
+                               /*IsSPMDExecutionMode=*/true);
+}
+
+// Unsigned 64-bit static-init entry point with the SPMD execution mode fixed
+// to true; `loc` is not consulted and `incr` is not forwarded.
+EXTERN
+void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
+                                           int32_t schedtype,
+                                           int32_t *plastiter, uint64_t *plower,
+                                           uint64_t *pupper, int64_t *pstride,
+                                           int64_t incr, int64_t chunk) {
+  typedef omptarget_nvptx_LoopSupport<uint64_t, int64_t> LoopSupport;
+  PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n");
+  LoopSupport::for_static_init(global_tid, schedtype, plastiter, plower,
+                               pupper, pstride, chunk,
+                               /*IsSPMDExecutionMode=*/true);
+}
+
+// Signed 32-bit static-init entry point with the SPMD execution mode fixed to
+// false; `loc` is not consulted and `incr` is not forwarded.
+EXTERN
+void __kmpc_for_static_init_4_simple_generic(
+    kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
+    int32_t *plower, int32_t *pupper, int32_t *pstride, int32_t incr,
+    int32_t chunk) {
+  typedef omptarget_nvptx_LoopSupport<int32_t, int32_t> LoopSupport;
+  PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n");
+  LoopSupport::for_static_init(global_tid, schedtype, plastiter, plower,
+                               pupper, pstride, chunk,
+                               /*IsSPMDExecutionMode=*/false);
+}
+
+// Unsigned 32-bit static-init entry point with the SPMD execution mode fixed
+// to false; `loc` is not consulted and `incr` is not forwarded.
+EXTERN
+void __kmpc_for_static_init_4u_simple_generic(
+    kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
+    uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr,
+    int32_t chunk) {
+  typedef omptarget_nvptx_LoopSupport<uint32_t, int32_t> LoopSupport;
+  PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n");
+  LoopSupport::for_static_init(global_tid, schedtype, plastiter, plower,
+                               pupper, pstride, chunk,
+                               /*IsSPMDExecutionMode=*/false);
+}
+
+// Signed 64-bit static-init entry point with the SPMD execution mode fixed to
+// false; `loc` is not consulted and `incr` is not forwarded.
+EXTERN
+void __kmpc_for_static_init_8_simple_generic(
+    kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
+    int64_t *plower, int64_t *pupper, int64_t *pstride, int64_t incr,
+    int64_t chunk) {
+  typedef omptarget_nvptx_LoopSupport<int64_t, int64_t> LoopSupport;
+  PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n");
+  LoopSupport::for_static_init(global_tid, schedtype, plastiter, plower,
+                               pupper, pstride, chunk,
+                               /*IsSPMDExecutionMode=*/false);
+}
+
+// Unsigned 64-bit static-init entry point with the SPMD execution mode fixed
+// to false; `loc` is not consulted and `incr` is not forwarded.
+EXTERN
+void __kmpc_for_static_init_8u_simple_generic(
+    kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
+    uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr,
+    int64_t chunk) {
+  typedef omptarget_nvptx_LoopSupport<uint64_t, int64_t> LoopSupport;
+  PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n");
+  LoopSupport::for_static_init(global_tid, schedtype, plastiter, plower,
+                               pupper, pstride, chunk,
+                               /*IsSPMDExecutionMode=*/false);
+}
+
+// End-of-loop hook for statically scheduled loops. It only emits a trace
+// message; neither `loc` nor `global_tid` is used.
+EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) {
+  PRINT0(LD_IO, "call kmpc_for_static_fini\n");
+}
+
+namespace {
+// Barrier across the workers of a parallel region: synchronizes on
+// L1_BARRIER with NumThreads rounded up to a whole number of warps.
+INLINE void syncWorkersInGenericMode(uint32_t NumThreads) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  // On Volta and newer architectures we require that all lanes in
+  // a warp (at least, all present for the kernel launch) participate in the
+  // barrier.  This is enforced when launching the parallel region.  An
+  // exception is when there are < WARPSIZE workers.  In this case only 1 worker
+  // is started, so we don't need a barrier.
+  if (NumThreads <= 1)
+    return;
+#endif
+  const int NumWarps = ((NumThreads + WARPSIZE - 1) / WARPSIZE);
+  named_sync(L1_BARRIER, WARPSIZE * NumWarps);
+}
+} // namespace
+
+// For each of the `varNum` 64-bit entries of `array`, computes the maximum of
+// that entry over all participating threads and stores it back into every
+// thread's own array.
+//
+//  loc     source location descriptor; queried for runtime and SPMD flags
+//  gtid    OpenMP thread id of the caller (thread 0 resets the shared buffer)
+//  varNum  number of entries in `array`
+//  array   per-thread array of uint64_t values (read and overwritten)
+//
+// Every participating thread must call this function: each loop iteration
+// contains three worker barriers.
+EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid,
+                                                  int32_t varNum, void *array) {
+  PRINT0(LD_IO, "call to __kmpc_reduce_conditional_lastprivate(...)\n");
+  ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
+          "Expected non-SPMD mode + initialized runtime.");
+
+  omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
+  uint32_t NumThreads = GetNumberOfOmpThreads(checkSPMDMode(loc));
+  uint64_t *Buffer = teamDescr.getLastprivateIterBuffer();
+  // The typed view of `array` does not change between iterations.
+  uint64_t *varArray = (uint64_t *)array;
+  // Signed index of the same type as varNum: comparing an unsigned index
+  // against a negative count would turn the count into a huge bound.
+  for (int32_t i = 0; i < varNum; i++) {
+    // Reset buffer.
+    if (gtid == 0)
+      *Buffer = 0; // Reset to minimum loop iteration value.
+
+    // Barrier.
+    syncWorkersInGenericMode(NumThreads);
+
+    // Atomic max of iterations.
+    uint64_t elem = varArray[i];
+    (void)atomicMax((unsigned long long int *)Buffer,
+                    (unsigned long long int)elem);
+
+    // Barrier.
+    syncWorkersInGenericMode(NumThreads);
+
+    // Read max value and update thread private array.
+    varArray[i] = *Buffer;
+
+    // Barrier.
+    syncWorkersInGenericMode(NumThreads);
+  }
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/final/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
new file mode 100644
index 0000000..d369da1
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
@@ -0,0 +1,67 @@
+//===------------ omp_data.cu - NVPTX OpenMP GPU objects --------- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the data objects used on the GPU device.
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget-nvptx.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// global device environment
+////////////////////////////////////////////////////////////////////////////////
+
+__device__ omptarget_device_environmentTy omptarget_device_environment;
+
+////////////////////////////////////////////////////////////////////////////////
+// global data holding OpenMP state information
+////////////////////////////////////////////////////////////////////////////////
+
+__device__
+    omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
+        omptarget_nvptx_device_State[MAX_SM];
+
+__device__ omptarget_nvptx_SimpleMemoryManager
+    omptarget_nvptx_simpleMemoryManager;
+__device__ __shared__ uint32_t usedMemIdx;
+__device__ __shared__ uint32_t usedSlotIdx;
+
+__device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
+__device__ __shared__ uint16_t threadLimit;
+__device__ __shared__ uint16_t threadsInTeam;
+__device__ __shared__ uint16_t nThreads;
+// Pointer to this team's OpenMP state object
+__device__ __shared__
+    omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
+
+////////////////////////////////////////////////////////////////////////////////
+// The team master sets the outlined parallel function in this variable to
+// communicate with the workers.  Since it is in shared memory, there is one
+// copy of these variables for each kernel, instance, and team.
+////////////////////////////////////////////////////////////////////////////////
+volatile __device__ __shared__ omptarget_nvptx_WorkFn omptarget_nvptx_workFn;
+
+////////////////////////////////////////////////////////////////////////////////
+// OpenMP kernel execution parameters
+////////////////////////////////////////////////////////////////////////////////
+__device__ __shared__ uint32_t execution_param;
+
+////////////////////////////////////////////////////////////////////////////////
+// Data sharing state
+////////////////////////////////////////////////////////////////////////////////
+__device__ __shared__ DataSharingStateTy DataSharingState;
+
+////////////////////////////////////////////////////////////////////////////////
+// Scratchpad for teams reduction.
+////////////////////////////////////////////////////////////////////////////////
+__device__ __shared__ void *ReductionScratchpadPtr;
+
+////////////////////////////////////////////////////////////////////////////////
+// Data sharing related variables.
+////////////////////////////////////////////////////////////////////////////////
+__device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
new file mode 100644
index 0000000..706776a
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -0,0 +1,186 @@
+//===--- omptarget-nvptx.cu - NVPTX OpenMP GPU initialization ---- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the initialization code for the GPU
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget-nvptx.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// global data tables
+////////////////////////////////////////////////////////////////////////////////
+
+extern __device__
+    omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
+        omptarget_nvptx_device_State[MAX_SM];
+
+////////////////////////////////////////////////////////////////////////////////
+// init entry points
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE static unsigned smid() { // Returns the value of PTX register %smid.
+  unsigned id;
+  asm("mov.u32 %0, %%smid;" : "=r"(id));
+  return id;
+}
+
+EXTERN void __kmpc_kernel_init_params(void *Ptr) {
+  PRINT(LD_IO, "call to __kmpc_kernel_init_params with version %f\n",
+        OMPTARGET_NVPTX_VERSION);
+
+  SetTeamsReductionScratchpadPtr(Ptr);
+}
+
+EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
+  PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n",
+        OMPTARGET_NVPTX_VERSION);
+  ASSERT0(LT_FUSSY, RequiresOMPRuntime,
+          "Generic always requires initialized runtime.");
+  setExecutionParameters(Generic, RuntimeInitialized);
+  for (int I = 0; I < MAX_THREADS_PER_TEAM / WARPSIZE; ++I)
+    parallelLevel[I] = 0; // clear every per-warp entry
+
+  int threadIdInBlock = GetThreadIdInBlock();
+  ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(),
+          "__kmpc_kernel_init() must be called by team master warp only!");
+  PRINT0(LD_IO, "call to __kmpc_kernel_init for master\n");
+
+  // Get a state object from the queue chosen by SM id; the slot is saved in
+  int slot = smid() % MAX_SM; // usedSlotIdx so that deinit can return it.
+  usedSlotIdx = slot;
+  omptarget_nvptx_threadPrivateContext =
+      omptarget_nvptx_device_State[slot].Dequeue();
+
+  // Initialize the thread-private context entry of the calling thread.
+  int threadId = GetLogicalThreadIdInBlock(/*isSPMDExecutionMode=*/false);
+  omptarget_nvptx_threadPrivateContext->InitThreadPrivateContext(threadId);
+
+  // Initialize the team context.
+  omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
+  currTeamDescr.InitTeamDescr();
+  // This thread will start execution, so its task ICV has to point to the
+  // level zero task ICV. That ICV was initialized in
+  // InitTeamDescr().
+  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
+      threadId, currTeamDescr.LevelZeroTaskDescr());
+
+  // Record thread count and limit. NOTE(review): currTaskDescr is never used.
+  omptarget_nvptx_TaskDescr *currTaskDescr =
+      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
+  nThreads = GetNumberOfWorkersInTeam();
+  threadLimit = ThreadLimit;
+}
+
+EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) {
+  PRINT0(LD_IO, "call to __kmpc_kernel_deinit\n");
+  ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized,
+          "Generic always requires initialized runtime.");
+  // Hand the team's state object back to the queue slot recorded in
+  // usedSlotIdx so that another team can use it.
+  omptarget_nvptx_device_State[usedSlotIdx].Enqueue(
+      omptarget_nvptx_threadPrivateContext);
+  // Done with work: clearing the work function kills the workers.
+  omptarget_nvptx_workFn = 0;
+}
+
+EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
+                                    int16_t RequiresDataSharing) {
+  PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n");
+
+  setExecutionParameters(Spmd, RequiresOMPRuntime ? RuntimeInitialized
+                                                  : RuntimeUninitialized);
+  int threadId = GetThreadIdInBlock();
+  if (threadId == 0) {
+    usedSlotIdx = smid() % MAX_SM; // state-queue slot used by Dequeue/Enqueue
+    parallelLevel[0] =
+        1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0);
+  } else if (GetLaneId() == 0) { // lane 0 of each other warp sets its entry
+    parallelLevel[GetWarpId()] =
+        1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0);
+  }
+  if (!RequiresOMPRuntime) {
+    // Runtime is not required - synchronize the block and exit.
+    __SYNCTHREADS();
+    return;
+  }
+
+  //
+  // Team Context Initialization.
+  //
+  // In SPMD mode there is no master thread so use any cuda thread for team
+  // context initialization.
+  if (threadId == 0) {
+    // Get a state object from the queue.
+    omptarget_nvptx_threadPrivateContext =
+        omptarget_nvptx_device_State[usedSlotIdx].Dequeue();
+
+    omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
+    omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); // NOTE(review): unused
+    // init team context
+    currTeamDescr.InitTeamDescr();
+  }
+  // FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
+  __SYNCTHREADS();
+
+  omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
+  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); // NOTE(review): unused
+
+  //
+  // Initialize task descr for each thread.
+  //
+  omptarget_nvptx_TaskDescr *newTaskDescr =
+      omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
+  ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
+  newTaskDescr->InitLevelOneTaskDescr(currTeamDescr.LevelZeroTaskDescr());
+  // install new top descriptor
+  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
+                                                             newTaskDescr);
+
+  // Trace the thread id and the thread limit for this parallel region.
+  PRINT(LD_PAR,
+        "thread will execute parallel region with id %d in a team of "
+        "%d threads\n",
+        (int)newTaskDescr->ThreadId(), (int)ThreadLimit);
+
+  if (RequiresDataSharing && GetLaneId() == 0) {
+    // Warp master initializes data sharing environment.
+    unsigned WID = threadId / WARPSIZE;
+    __kmpc_data_sharing_slot *RootS = currTeamDescr.RootS(
+        WID, WID == WARPSIZE - 1); // second argument is RootS's IsMasterThread
+    DataSharingState.SlotPtr[WID] = RootS; // NOTE(review): RootS() may return 0
+    DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
+  }
+}
+
+EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit() {
+  __kmpc_spmd_kernel_deinit_v2(isRuntimeInitialized());
+}
+
+EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime) {
+  // The per-thread task descr stacks are not popped: there are no more
+  // parallel regions in SPMD mode. Without the OpenMP runtime no state object
+  // was dequeued by __kmpc_spmd_kernel_init, so nothing has to be returned.
+  if (!RequiresOMPRuntime)
+    return;
+
+  // FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
+  __SYNCTHREADS();
+  // After the barrier, only thread 0 of the block gives the state object back.
+  if (GetThreadIdInBlock() != 0)
+    return;
+  // Enqueue omp state object for use by another team.
+  omptarget_nvptx_device_State[usedSlotIdx].Enqueue(
+      omptarget_nvptx_threadPrivateContext);
+}
+
+// Return true if the current target region is executed in SPMD mode.
+EXTERN int8_t __kmpc_is_spmd_exec_mode() {
+  PRINT0(LD_IO | LD_PAR, "call to __kmpc_is_spmd_exec_mode\n");
+  return isSPMDMode();
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
new file mode 100644
index 0000000..f28284d
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -0,0 +1,445 @@
+//===---- omptarget-nvptx.h - NVPTX OpenMP GPU initialization ---- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of all library macros, types,
+// and functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __OMPTARGET_NVPTX_H
+#define __OMPTARGET_NVPTX_H
+
+// std includes
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <inttypes.h>
+
+// cuda includes
+#include <cuda.h>
+#include <math.h>
+
+// local includes
+#include "debug.h"     // debug
+#include "interface.h" // interfaces with omp, compiler, and user
+#include "option.h"    // choices we have
+#include "state-queue.h"
+#include "support.h"
+
+#define OMPTARGET_NVPTX_VERSION 1.1
+
+// used by the library for the interface with the app
+#define DISPATCH_FINISHED 0
+#define DISPATCH_NOTFINISHED 1
+
+// used by dynamic scheduling
+#define FINISHED 0
+#define NOT_FINISHED 1
+#define LAST_CHUNK 2
+
+#define BARRIER_COUNTER 0
+#define ORDERED_COUNTER 1
+
+// Macros for Cuda intrinsics
+// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
+// Also, __ballot(1) in Cuda 8.0 is replaced with __activemask().
+#ifndef CUDA_VERSION
+#error CUDA_VERSION macro is undefined, something wrong with cuda.
+#elif CUDA_VERSION >= 9000
+#define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane))
+#define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
+  __shfl_down_sync((mask), (var), (delta), (width))
+#define __ACTIVEMASK() __activemask()
+#else
+#define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane))
+#define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
+  __shfl_down((var), (delta), (width))
+#define __ACTIVEMASK() __ballot(1)
+#endif // CUDA_VERSION
+
+#define __SYNCTHREADS_N(n) asm volatile("bar.sync %0;" : : "r"(n) : "memory");
+// Use original __syncthreads if compiled by nvcc or clang >= 9.0.
+#if !defined(__clang__) || __clang_major__ >= 9
+#define __SYNCTHREADS() __syncthreads()
+#else
+#define __SYNCTHREADS() __SYNCTHREADS_N(0)
+#endif
+
+// arguments needed for L0 parallelism only.
+class omptarget_nvptx_SharedArgs {
+public:
+  // All these methods must be called by the master thread only.
+  INLINE void Init() { // Point args at the built-in buffer.
+    args  = buffer;
+    nArgs = MAX_SHARED_ARGS;
+  }
+  INLINE void DeInit() {
+    // Free any memory allocated for an outlined parallel function with a large
+    // number of arguments, then fall back to the built-in buffer.
+    if (nArgs > MAX_SHARED_ARGS) {
+      SafeFree(args, (char *)"new extended args");
+      Init();
+    }
+  }
+  INLINE void EnsureSize(size_t size) { // Make room for `size` pointers.
+    if (size > nArgs) { // Growing: old contents are not copied over.
+      if (nArgs > MAX_SHARED_ARGS) { // args was allocated by a previous call.
+        SafeFree(args, (char *)"new extended args");
+      }
+      args = (void **) SafeMalloc(size * sizeof(void *),
+                                  (char *)"new extended args");
+      nArgs = size;
+    }
+  }
+  // Called by all threads.
+  INLINE void **GetArgs() const { return args; };
+private:
+  // buffer of pre-allocated arguments.
+  void *buffer[MAX_SHARED_ARGS];
+  // pointer to arguments buffer.
+  // starts off as a pointer to 'buffer' but can be dynamically allocated.
+  void **args;
+  // starts off as MAX_SHARED_ARGS but can increase in size.
+  uint32_t nArgs;
+};
+
+extern __device__ __shared__ omptarget_nvptx_SharedArgs
+    omptarget_nvptx_globalArgs;
+
+// Data sharing related quantities, need to match what is used in the compiler.
+enum DATA_SHARING_SIZES {
+  // The maximum number of workers in a kernel.
+  DS_Max_Worker_Threads = 992,
+  // The size reserved for data in a shared memory slot.
+  DS_Slot_Size = 256,
+  // The slot size that should be reserved for a working warp.
+  DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
+  // The maximum number of warps in use
+  DS_Max_Warp_Number = 32,
+  // The size of the preallocated shared memory buffer per team
+  DS_Shared_Memory_Size = 128,
+};
+
+// Data structure to keep in shared memory that traces the current slot, stack,
+// and frame pointer as well as the active threads that didn't exit the current
+// environment.
+struct DataSharingStateTy {
+  __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
+  void *StackPtr[DS_Max_Warp_Number];
+  void * volatile FramePtr[DS_Max_Warp_Number];
+  int32_t ActiveThreads[DS_Max_Warp_Number];
+};
+// Additional worker slot type whose data section has the default worker slot
+// size of DS_Worker_Warp_Slot_Size (WARPSIZE * DS_Slot_Size) bytes.
+struct __kmpc_data_sharing_worker_slot_static {
+  __kmpc_data_sharing_slot *Next;
+  __kmpc_data_sharing_slot *Prev;
+  void *PrevSlotStackPtr;
+  void *DataEnd;
+  char Data[DS_Worker_Warp_Slot_Size];
+};
+// Additional master slot type whose data section has the default master slot
+// size of DS_Slot_Size bytes.
+struct __kmpc_data_sharing_master_slot_static {
+  __kmpc_data_sharing_slot *Next;
+  __kmpc_data_sharing_slot *Prev;
+  void *PrevSlotStackPtr;
+  void *DataEnd;
+  char Data[DS_Slot_Size];
+};
+extern __device__ __shared__ DataSharingStateTy DataSharingState;
+
+////////////////////////////////////////////////////////////////////////////////
+// task ICV and (implicit & explicit) task state
+
+class omptarget_nvptx_TaskDescr {
+public:
+  // methods for flags
+  INLINE omp_sched_t GetRuntimeSched() const;
+  INLINE void SetRuntimeSched(omp_sched_t sched);
+  INLINE int InParallelRegion() const { return items.flags & TaskDescr_InPar; }
+  INLINE int InL2OrHigherParallelRegion() const {
+    return items.flags & TaskDescr_InParL2P;
+  }
+  INLINE int IsParallelConstruct() const {
+    return items.flags & TaskDescr_IsParConstr;
+  }
+  INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); }
+  // methods for other fields
+  INLINE uint16_t &ThreadId() { return items.threadId; }
+  INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
+  INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; }
+  INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) {
+    prev = taskDescr;
+  }
+  // init & copy
+  INLINE void InitLevelZeroTaskDescr();
+  INLINE void InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr);
+  INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr);
+  INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr);
+  INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr);
+  INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr);
+  INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr);
+  INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr);
+  INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr,
+                                   uint16_t tid, uint16_t tnum);
+  INLINE void SaveLoopData();
+  INLINE void RestoreLoopData() const;
+
+private:
+  // bits for flags: (6 used, 2 free)
+  //   3 bits (SchedMask) for runtime schedule
+  //   1 bit (InPar) if this thread has encountered one or more parallel region
+  //   1 bit (IsParConstr) if ICV for a parallel region (false = explicit task)
+  //   1 bit (InParL2+) if this thread has encountered L2 or higher parallel
+  //   region
+  static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4);
+  static const uint8_t TaskDescr_InPar = 0x10;
+  static const uint8_t TaskDescr_IsParConstr = 0x20;
+  static const uint8_t TaskDescr_InParL2P = 0x40;
+
+  struct SavedLoopDescr_items {
+    int64_t loopUpperBound;
+    int64_t nextLowerBound;
+    int64_t chunk;
+    int64_t stride;
+    kmp_sched_t schedule;
+  } loopData;
+
+  struct TaskDescr_items {
+    uint8_t flags; // 6 bit used (see flag above)
+    uint8_t unused;
+    uint16_t threadId;         // thread id
+    uint64_t runtimeChunkSize; // runtime chunk size
+  } items;
+  omptarget_nvptx_TaskDescr *prev;
+};
+
+// build on kmp
+typedef struct omptarget_nvptx_ExplicitTaskDescr {
+  omptarget_nvptx_TaskDescr
+      taskDescr; // omptarget_nvptx task description (must be first)
+  kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last)
+} omptarget_nvptx_ExplicitTaskDescr;
+
+////////////////////////////////////////////////////////////////////////////////
+// Descriptor of a parallel region (worksharing in general)
+
+class omptarget_nvptx_WorkDescr {
+
+public:
+  // access to data
+  INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; }
+
+private:
+  omptarget_nvptx_TaskDescr masterTaskICV;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+class omptarget_nvptx_TeamDescr {
+public:
+  // access to data
+  INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() {
+    return &levelZeroTaskDescr;
+  }
+  INLINE omptarget_nvptx_WorkDescr &WorkDescr() {
+    return workDescrForActiveParallel;
+  }
+  INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; }
+
+  // init
+  INLINE void InitTeamDescr();
+
+  INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) {
+    // If this is invoked by the master thread of the master warp then initialize
+    // it with a smaller slot. Returns 0 if the slot was already initialized.
+    if (IsMasterThread) {
+      // Do not initialize this slot again if it has already been initialized.
+      if (master_rootS[0].DataEnd == &master_rootS[0].Data[0] + DS_Slot_Size)
+        return 0;
+      // Initialize the pointer to the end of the slot given the size of the
+      // data section. DataEnd is non-inclusive.
+      master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size;
+      // We currently do not have a next slot.
+      master_rootS[0].Next = 0;
+      master_rootS[0].Prev = 0;
+      master_rootS[0].PrevSlotStackPtr = 0;
+      return (__kmpc_data_sharing_slot *)&master_rootS[0];
+    }
+    // Do not initialize this slot again if it has already been initialized.
+    if (worker_rootS[wid].DataEnd ==
+        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size)
+      return 0;
+    // Initialize the pointer to the end of the slot given the size of the data
+    // section. DataEnd is non-inclusive.
+    worker_rootS[wid].DataEnd =
+        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
+    // We currently do not have a next slot.
+    worker_rootS[wid].Next = 0;
+    worker_rootS[wid].Prev = 0;
+    worker_rootS[wid].PrevSlotStackPtr = 0;
+    return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
+  }
+
+  INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) {
+    worker_rootS[wid].DataEnd = // set unconditionally, unlike in RootS()
+        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
+    // We currently do not have a next slot.
+    worker_rootS[wid].Next = 0;
+    worker_rootS[wid].Prev = 0;
+    worker_rootS[wid].PrevSlotStackPtr = 0;
+    return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
+  }
+
+private:
+  omptarget_nvptx_TaskDescr
+      levelZeroTaskDescr; // icv for team master initial thread
+  omptarget_nvptx_WorkDescr
+      workDescrForActiveParallel; // one, ONLY for the active par
+  uint64_t lastprivateIterBuffer; // exposed via getLastprivateIterBuffer()
+
+  __align__(16)
+      __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE]; // by wid
+  __align__(16) __kmpc_data_sharing_master_slot_static master_rootS[1];
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// thread private data (struct of arrays for better coalescing)
+// tid refers here to the global thread id
+// does not support multiple concurrent kernels at this time
+class omptarget_nvptx_ThreadPrivateContext {
+public:
+  // task
+  INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) {
+    return &levelOneTaskDescr[tid];
+  }
+  INLINE void SetTopLevelTaskDescr(int tid,
+                                   omptarget_nvptx_TaskDescr *taskICV) {
+    topTaskDescr[tid] = taskICV;
+  }
+  INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const;
+  // parallel
+  INLINE uint16_t &NumThreadsForNextParallel(int tid) {
+    return nextRegion.tnum[tid];
+  }
+  // simd
+  INLINE uint16_t &SimdLimitForNextSimd(int tid) {
+    return nextRegion.slim[tid];
+  }
+  // schedule (for dispatch)
+  INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
+  INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
+  INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; }
+  INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; }
+  INLINE int64_t &Stride(int tid) { return stride[tid]; }
+
+  INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }
+
+  INLINE void InitThreadPrivateContext(int tid);
+  INLINE uint64_t &Cnt() { return cnt; }
+
+private:
+  // team context for this team
+  omptarget_nvptx_TeamDescr teamContext;
+  // task ICV for implicit threads in the only parallel region
+  omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM];
+  // pointer where to find the current task ICV (top of the stack)
+  omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM];
+  union {
+    // Only one of the two is live at the same time.
+    // parallel
+    uint16_t tnum[MAX_THREADS_PER_TEAM];
+    // simd limit
+    uint16_t slim[MAX_THREADS_PER_TEAM];
+  } nextRegion;
+  // schedule (for dispatch)
+  kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for
+  int64_t chunk[MAX_THREADS_PER_TEAM];
+  int64_t loopUpperBound[MAX_THREADS_PER_TEAM];
+  // state for dispatch with dyn/guided OR static (never use both at a time)
+  int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
+  int64_t stride[MAX_THREADS_PER_TEAM];
+  uint64_t cnt;
+};
+
+/// Device environment data
+struct omptarget_device_environmentTy {
+  int32_t debug_level;
+};
+
+/// Memory manager for statically allocated memory.
+class omptarget_nvptx_SimpleMemoryManager {
+private:
+  __align__(128) struct MemDataTy {
+    volatile unsigned keys[OMP_STATE_COUNT];
+  } MemData[MAX_SM];
+
+  INLINE static uint32_t hash(unsigned key) {
+    return key & (OMP_STATE_COUNT - 1);
+  }
+
+public:
+  INLINE void Release();
+  INLINE const void *Acquire(const void *buf, size_t size);
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// global device environment
+////////////////////////////////////////////////////////////////////////////////
+
+extern __device__ omptarget_device_environmentTy omptarget_device_environment;
+
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+// global data tables
+////////////////////////////////////////////////////////////////////////////////
+
+extern __device__ omptarget_nvptx_SimpleMemoryManager
+    omptarget_nvptx_simpleMemoryManager;
+extern __device__ __shared__ uint32_t usedMemIdx;
+extern __device__ __shared__ uint32_t usedSlotIdx;
+extern __device__ __shared__ uint8_t
+    parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
+extern __device__ __shared__ uint16_t threadLimit;
+extern __device__ __shared__ uint16_t threadsInTeam;
+extern __device__ __shared__ uint16_t nThreads;
+extern __device__ __shared__
+    omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
+
+extern __device__ __shared__ uint32_t execution_param;
+extern __device__ __shared__ void *ReductionScratchpadPtr;
+
+////////////////////////////////////////////////////////////////////////////////
+// work function (outlined parallel/simd functions) and arguments.
+// needed for L1 parallelism only.
+////////////////////////////////////////////////////////////////////////////////
+
+typedef void *omptarget_nvptx_WorkFn;
+extern volatile __device__ __shared__ omptarget_nvptx_WorkFn
+    omptarget_nvptx_workFn;
+
+////////////////////////////////////////////////////////////////////////////////
+// get private data structures
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor();
+INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor();
+INLINE omptarget_nvptx_TaskDescr *
+getMyTopTaskDescriptor(bool isSPMDExecutionMode);
+INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId);
+
+////////////////////////////////////////////////////////////////////////////////
+// inlined implementation
+////////////////////////////////////////////////////////////////////////////////
+
+#include "omptarget-nvptxi.h"
+#include "supporti.h"
+
+#endif
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
new file mode 100644
index 0000000..e4efa18
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
@@ -0,0 +1,226 @@
+//===---- omptarget-nvptxi.h - NVPTX OpenMP GPU initialization --- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the inline implementations of the task descriptor,
+// thread-private context, team descriptor, and memory manager methods.
+//
+//===----------------------------------------------------------------------===//
+
+////////////////////////////////////////////////////////////////////////////////
+// Task Descriptor
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() const {
+  // The flag bits hold the schedule as 0..3 while omp_sched_t counts from 1,
+  // so decode by adding 1 to the masked bits.
+  return (omp_sched_t)(uint8_t)((items.flags & TaskDescr_SchedMask) + 1);
+}
+
+INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) {
+  // omp_sched_t counts from 1 but the flag bits store 0..3, so encode the
+  // schedule by subtracting 1.
+  const uint8_t encodedSched = ((uint8_t)sched) - 1;
+  // Drop the previously stored schedule bits and merge in the new encoding
+  // in a single assignment.
+  items.flags = (items.flags & ~TaskDescr_SchedMask) | encodedSched;
+}
+
+INLINE void
+omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() {
+  // slow method
+  // flag:
+  //   default sched is static,
+  //   dyn is off (unused now anyway, but may need to sample from host ?)
+  //   not in parallel
+
+  items.flags = 0;
+  items.threadId = 0;         // is master
+  items.runtimeChunkSize = 1; // preferred chunking: static with chunk 1
+}
+
+// This is called when all threads are started together in SPMD mode.
+// OMP directives include target parallel, target distribute parallel for, etc.
+INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr(
+    omptarget_nvptx_TaskDescr *parentTaskDescr) {
+  // slow method
+  // flag:
+  //   default sched is static,
+  //   dyn is off (unused now anyway, but may need to sample from host ?)
+  //   in L1 parallel
+
+  items.flags =
+      TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
+  items.threadId =
+      GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
+  items.runtimeChunkSize = 1; // preferred chunking: static with chunk 1
+  prev = parentTaskDescr;
+}
+
+INLINE void omptarget_nvptx_TaskDescr::CopyData(
+    omptarget_nvptx_TaskDescr *sourceTaskDescr) {
+  items = sourceTaskDescr->items; // copies items only; prev is not touched
+}
+
+INLINE void
+omptarget_nvptx_TaskDescr::Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr) {
+  CopyData(sourceTaskDescr);
+  prev = sourceTaskDescr->prev; // share the source's predecessor link
+}
+
+INLINE void omptarget_nvptx_TaskDescr::CopyParent(
+    omptarget_nvptx_TaskDescr *parentTaskDescr) {
+  CopyData(parentTaskDescr);
+  prev = parentTaskDescr; // the parent itself becomes the predecessor
+}
+
+INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask(
+    omptarget_nvptx_TaskDescr *parentTaskDescr) {
+  CopyParent(parentTaskDescr);
+  items.flags = items.flags & ~TaskDescr_IsParConstr; // clear par-construct bit
+  ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task");
+}
+
+INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr(
+    omptarget_nvptx_TaskDescr *masterTaskDescr) {
+  CopyParent(masterTaskDescr);
+  // overwrite specific items;
+  items.flags |=
+      TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
+}
+
+INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr(
+    omptarget_nvptx_TaskDescr *workTaskDescr) {
+  Copy(workTaskDescr);
+  //
+  // overwrite specific items;
+  //
+  // The threadID should be GetThreadIdInBlock() % GetMasterThreadID().
+  // This is so that the serial master (first lane in the master warp)
+  // gets a threadId of 0.
+  // However, we know that this function is always called in a parallel
+  // region where only workers are active.  The serial master thread
+  // never enters this region.  When a parallel region is executed serially,
+  // the threadId is set to 0 elsewhere and the kmpc_serialized_* functions
+  // are called, which never activate this region.
+  items.threadId =
+      GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
+}
+
+INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent(
+    omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) {
+  CopyParent(parentTaskDescr);
+  items.flags |= TaskDescr_InParL2P; // In L2+ parallelism
+  items.threadId = tid; // note: tnum is accepted but not used here
+}
+
+INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() {
+  loopData.loopUpperBound =
+      omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId);
+  loopData.nextLowerBound =
+      omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId);
+  loopData.schedule =
+      omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId);
+  loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(items.threadId);
+  loopData.stride =
+      omptarget_nvptx_threadPrivateContext->Stride(items.threadId);
+}
+
+INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const {
+  omptarget_nvptx_threadPrivateContext->Chunk(items.threadId) = loopData.chunk;
+  omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId) =
+      loopData.loopUpperBound;
+  omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId) =
+      loopData.nextLowerBound;
+  omptarget_nvptx_threadPrivateContext->Stride(items.threadId) =
+      loopData.stride;
+  omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId) =
+      loopData.schedule;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Thread Private Context
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE omptarget_nvptx_TaskDescr *
+omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) const {
+  ASSERT0(
+      LT_FUSSY, tid < MAX_THREADS_PER_TEAM,
+      "Getting top level, tid is larger than allocated data structure size");
+  return topTaskDescr[tid]; // NOTE(review): a negative tid is not checked
+}
+
+INLINE void
+omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) {
+  // levelOneTaskDescr is init when starting the parallel region
+  // top task descr is NULL (team master version will be fixed separately)
+  topTaskDescr[tid] = NULL;
+  // no num threads value has been pushed
+  nextRegion.tnum[tid] = 0;
+  // the following don't need to be init here; they are init when using dyn
+  // sched
+  // current_Event, events_Number, chunk, num_Iterations, schedule
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Team Descriptor
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() {
+  levelZeroTaskDescr.InitLevelZeroTaskDescr();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Get private data structure for thread
+////////////////////////////////////////////////////////////////////////////////
+
+// Utility routines for CUDA threads
+INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor() {
+  return omptarget_nvptx_threadPrivateContext->TeamContext();
+}
+
+INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor() {
+  omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
+  return currTeamDescr.WorkDescr();
+}
+
+INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) {
+  return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
+}
+
+INLINE omptarget_nvptx_TaskDescr *
+getMyTopTaskDescriptor(bool isSPMDExecutionMode) {
+  return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock(isSPMDExecutionMode));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Memory management runtime functions.
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE void omptarget_nvptx_SimpleMemoryManager::Release() {
+  ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
+          "SlotIdx is too big or uninitialized.");
+  ASSERT0(LT_FUSSY, usedMemIdx < OMP_STATE_COUNT,
+          "MemIdx is too big or uninitialized.");
+  MemDataTy &MD = MemData[usedSlotIdx];
+  atomicExch((unsigned *)&MD.keys[usedMemIdx], 0); // reset the key to 0
+}
+
+INLINE const void *omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf,
+                                                                size_t size) {
+  ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
+          "SlotIdx is too big or uninitialized.");
+  const unsigned sm = usedSlotIdx;
+  MemDataTy &MD = MemData[sm];
+  unsigned i = hash(GetBlockIdInKernel());
+  while (atomicCAS((unsigned *)&MD.keys[i], 0, 1) != 0) { // spin until 0 -> 1
+    i = hash(i + 1);
+  }
+  usedSlotIdx = sm;
+  usedMemIdx = i; // remembered so Release() can reset this key
+  return static_cast<const char *>(buf) + (sm * OMP_STATE_COUNT + i) * size;
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/option.h b/final/libomptarget/deviceRTLs/nvptx/src/option.h
new file mode 100644
index 0000000..b3661d5
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/option.h
@@ -0,0 +1,67 @@
+//===------------ option.h - NVPTX OpenMP GPU options ------------ CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// GPU default options
+//
+//===----------------------------------------------------------------------===//
+#ifndef _OPTION_H_
+#define _OPTION_H_
+
+////////////////////////////////////////////////////////////////////////////////
+// Kernel options
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+// The following def must match the absolute limit hardwired in the host RTL
+// max number of threads per team
+#define MAX_THREADS_PER_TEAM 1024
+
+#define WARPSIZE 32
+
+// The named barrier for active parallel threads of a team in an L1 parallel
+// region to synchronize with each other.
+#define L1_BARRIER (1)
+
+// Maximum number of preallocated arguments to an outlined parallel/simd function.
+// Anything more requires dynamic memory allocation.
+#define MAX_SHARED_ARGS 20
+
+// Maximum number of omp state objects per SM allocated statically in global
+// memory.
+#if __CUDA_ARCH__ >= 700
+#define OMP_STATE_COUNT 32
+#define MAX_SM 84
+#elif __CUDA_ARCH__ >= 600
+#define OMP_STATE_COUNT 32
+#define MAX_SM 56
+#else
+#define OMP_STATE_COUNT 16
+#define MAX_SM 16
+#endif
+
+#define OMP_ACTIVE_PARALLEL_LEVEL 128
+
+////////////////////////////////////////////////////////////////////////////////
+// algo options
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+// misc options (by def everything here is device)
+////////////////////////////////////////////////////////////////////////////////
+
+#define EXTERN extern "C" __device__
+#define INLINE __inline__ __device__
+#define NOINLINE __noinline__ __device__
+#ifndef TRUE
+#define TRUE 1
+#endif
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+#endif
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/final/libomptarget/deviceRTLs/nvptx/src/parallel.cu
new file mode 100644
index 0000000..6747235
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -0,0 +1,450 @@
+//===---- parallel.cu - NVPTX OpenMP parallel implementation ----- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Parallel implementation in the GPU. Here is the pattern:
+//
+//    while (not finished) {
+//
+//    if (master) {
+//      sequential code, decide which par loop to do, or if finished
+//     __kmpc_kernel_prepare_parallel() // exec by master only
+//    }
+//    syncthreads // A
+//    __kmpc_kernel_parallel() // exec by all
+//    if (this thread is included in the parallel) {
+//      switch () for all parallel loops
+//      __kmpc_kernel_end_parallel() // exec only by threads in parallel
+//    }
+//
+//
+//    The reason we don't exec end_parallel for the threads not included
+//    in the parallel loop is that for each barrier in the parallel
+//    region, these non-included threads will cycle through the
+//    syncthread A. Thus they must preserve their current threadId that
+//    is larger than thread in team.
+//
+//    To make a long story short...
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget-nvptx.h"
+
+typedef struct ConvergentSimdJob {
+  omptarget_nvptx_TaskDescr taskDescr;
+  omptarget_nvptx_TaskDescr *convHeadTaskDescr;
+  uint16_t slimForNextSimd;
+} ConvergentSimdJob;
+
+////////////////////////////////////////////////////////////////////////////////
+// support for convergent simd (team of threads in a warp only)
+////////////////////////////////////////////////////////////////////////////////
+EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
+                                          bool *IsFinal, int32_t *LaneSource,
+                                          int32_t *LaneId, int32_t *NumLanes) {
+  PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n");
+  uint32_t ConvergentMask = Mask;
+  int32_t ConvergentSize = __popc(ConvergentMask);
+  uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
+  *LaneSource += __ffs(WorkRemaining);
+  *IsFinal = __popc(WorkRemaining) == 1;
+  uint32_t lanemask_lt;
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
+  *LaneId = __popc(ConvergentMask & lanemask_lt);
+
+  int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
+  int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
+
+  ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
+  int32_t SimdLimit =
+      omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId);
+  job->slimForNextSimd = SimdLimit;
+
+  int32_t SimdLimitSource = __SHFL_SYNC(Mask, SimdLimit, *LaneSource);
+  // reset simdlimit to avoid propagating to successive #simd
+  if (SimdLimitSource > 0 && threadId == sourceThreadId)
+    omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0;
+
+  // We cannot have more than the # of convergent threads.
+  if (SimdLimitSource > 0)
+    *NumLanes = min(ConvergentSize, SimdLimitSource);
+  else
+    *NumLanes = ConvergentSize;
+  ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads",
+         (int)*NumLanes);
+
+  // Set to true for lanes participating in the simd region.
+  bool isActive = false;
+  // Initialize state for active threads.
+  if (*LaneId < *NumLanes) {
+    omptarget_nvptx_TaskDescr *currTaskDescr =
+        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
+    omptarget_nvptx_TaskDescr *sourceTaskDescr =
+        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
+            sourceThreadId);
+    job->convHeadTaskDescr = currTaskDescr;
+    // install top descriptor from the thread for which the lanes are working.
+    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
+                                                               sourceTaskDescr);
+    isActive = true;
+  }
+
+  // requires a memory fence between threads of a warp
+  return isActive;
+}
+
+EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) {
+  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_simd\n");
+  // pop stack: restore the simd limit and top task descriptor saved in buffer
+  int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
+  ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
+  omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) =
+      job->slimForNextSimd;
+  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
+      threadId, job->convHeadTaskDescr);
+}
+
+typedef struct ConvergentParallelJob {
+  omptarget_nvptx_TaskDescr taskDescr;
+  omptarget_nvptx_TaskDescr *convHeadTaskDescr;
+  uint16_t tnumForNextPar;
+} ConvergentParallelJob;
+
+////////////////////////////////////////////////////////////////////////////////
+// support for convergent parallelism (team of threads in a warp only)
+////////////////////////////////////////////////////////////////////////////////
+EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
+                                              bool *IsFinal,
+                                              int32_t *LaneSource) {
+  PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n");
+  uint32_t ConvergentMask = Mask;
+  int32_t ConvergentSize = __popc(ConvergentMask);
+  uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
+  *LaneSource += __ffs(WorkRemaining);
+  *IsFinal = __popc(WorkRemaining) == 1;
+  uint32_t lanemask_lt;
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
+  uint32_t OmpId = __popc(ConvergentMask & lanemask_lt);
+
+  int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
+  int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
+
+  ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
+  int32_t NumThreadsClause =
+      omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
+  job->tnumForNextPar = NumThreadsClause;
+
+  int32_t NumThreadsSource = __SHFL_SYNC(Mask, NumThreadsClause, *LaneSource);
+  // reset numthreads to avoid propagating to successive #parallel
+  if (NumThreadsSource > 0 && threadId == sourceThreadId)
+    omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
+        0;
+
+  // We cannot have more than the # of convergent threads.
+  uint16_t NumThreads;
+  if (NumThreadsSource > 0)
+    NumThreads = min(ConvergentSize, NumThreadsSource);
+  else
+    NumThreads = ConvergentSize;
+  ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
+         (int)NumThreads);
+
+  // Set to true for workers participating in the parallel region.
+  bool isActive = false;
+  // Initialize state for active threads.
+  if (OmpId < NumThreads) {
+    // init L2 task descriptor and storage for the L1 parallel task descriptor.
+    omptarget_nvptx_TaskDescr *newTaskDescr = &job->taskDescr;
+    ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
+    omptarget_nvptx_TaskDescr *currTaskDescr =
+        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
+    omptarget_nvptx_TaskDescr *sourceTaskDescr =
+        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
+            sourceThreadId);
+    job->convHeadTaskDescr = currTaskDescr;
+    newTaskDescr->CopyConvergentParent(sourceTaskDescr, OmpId, NumThreads);
+    // install new top descriptor
+    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
+                                                               newTaskDescr);
+    isActive = true;
+  }
+
+  // requires a memory fence between threads of a warp
+  return isActive;
+}
+
+EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) {
+  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n");
+  // pop stack
+  int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
+  ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
+  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
+      threadId, job->convHeadTaskDescr);
+  omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
+      job->tnumForNextPar;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// support for parallel that goes parallel (1 static level only)
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
+                                                uint16_t NThreadsICV,
+                                                uint16_t ThreadLimit) {
+  // A non-zero num_threads clause value takes precedence over the ICV
+  // value when deciding how many threads are being asked for.
+  const uint16_t Requested =
+      (NumThreadsClause != 0) ? NumThreadsClause : NThreadsICV;
+
+  // Start from the number of workers in the team and cap it with the
+  // thread limit when that limit is set (non-zero) and smaller.
+  uint16_t Available = GetNumberOfWorkersInTeam();
+  if (ThreadLimit != 0 && ThreadLimit < Available)
+    Available = ThreadLimit;
+
+  // A non-zero request may only lower the thread count; it can never
+  // raise it above what is available.
+  uint16_t NumThreads =
+      (Requested != 0 && Requested < Available) ? Requested : Available;
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  // On Volta and newer architectures we require that all lanes in
+  // a warp participate in the parallel region.  Round down to a
+  // multiple of WARPSIZE since it is legal to do so in OpenMP.
+  // Anything smaller than one full warp collapses to a single thread.
+  NumThreads = (NumThreads < WARPSIZE)
+                   ? 1
+                   : (NumThreads & ~((uint16_t)WARPSIZE - 1));
+#endif
+
+  return NumThreads;
+}
+
+// This routine is always called by the team master.
+EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
+                                           int16_t IsOMPRuntimeInitialized) {
+  PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");
+  ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, "Expected initialized runtime.");
+
+  omptarget_nvptx_workFn = WorkFn;
+
+  // This routine is only called by the team master.  The team master is
+  // the first thread of the last warp.  It always has the logical thread
+  // id of 0 (since it is a shadow for the first worker thread).
+  const int threadId = 0;
+  omptarget_nvptx_TaskDescr *currTaskDescr =
+      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
+  ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
+  ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(),
+          "cannot be called in a parallel region.");
+  if (currTaskDescr->InParallelRegion()) {
+    PRINT0(LD_PAR, "already in parallel: go seq\n");
+    return;
+  }
+
+  uint16_t &NumThreadsClause =
+      omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
+
+  uint16_t NumThreads =
+      determineNumberOfThreads(NumThreadsClause, nThreads, threadLimit);
+
+  if (NumThreadsClause != 0) {
+    // Reset request to avoid propagating to successive #parallel
+    NumThreadsClause = 0;
+  }
+
+  ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
+         (int)NumThreads);
+  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
+          "only team master can create parallel");
+
+  // Set number of threads on work descriptor.
+  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
+  workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr);
+  threadsInTeam = NumThreads;
+}
+
+// All workers call this function.  Deactivate those not needed.
+// WorkFn - receives the outlined work function to execute.
+// returns True if this thread is active, else False.
+//
+// Only the worker threads call this routine.
+EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
+                                   int16_t IsOMPRuntimeInitialized) {
+  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");
+
+  ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, "Expected initialized runtime.");
+
+  // Work function and arguments for L1 parallel region.
+  *WorkFn = omptarget_nvptx_workFn;
+
+  // If this is the termination signal from the master, quit early.
+  if (!*WorkFn) {
+    PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel finished\n");
+    return false;
+  }
+
+  // Only the worker threads call this routine and the master warp
+  // never arrives here.  Therefore, use the nvptx thread id.
+  int threadId = GetThreadIdInBlock();
+  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
+  // Set to true for workers participating in the parallel region.
+  bool isActive = false;
+  // Initialize state for active threads.
+  if (threadId < threadsInTeam) {
+    // init work descriptor from workdescr
+    omptarget_nvptx_TaskDescr *newTaskDescr =
+        omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
+    ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
+    newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr());
+    // install new top descriptor
+    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
+                                                               newTaskDescr);
+    // log this thread's id and the size of the team (threadsInTeam)
+    PRINT(LD_PAR,
+          "thread will execute parallel region with id %d in a team of "
+          "%d threads\n",
+          (int)newTaskDescr->ThreadId(), (int)threadsInTeam);
+
+    isActive = true;
+    IncParallelLevel(threadsInTeam != 1);
+  }
+
+  return isActive;
+}
+
+EXTERN void __kmpc_kernel_end_parallel() {
+  // pop stack
+  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n");
+  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
+
+  // Only the worker threads call this routine and the master warp
+  // never arrives here.  Therefore, use the nvptx thread id.
+  int threadId = GetThreadIdInBlock();
+  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
+  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
+      threadId, currTaskDescr->GetPrevTaskDescr());
+
+  DecParallelLevel(threadsInTeam != 1);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// support for parallel that goes sequential
+////////////////////////////////////////////////////////////////////////////////
+
+EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
+  PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n");
+
+  IncParallelLevel(/*ActiveParallel=*/false);
+
+  if (checkRuntimeUninitialized(loc)) {
+    ASSERT0(LT_FUSSY, checkSPMDMode(loc),
+            "Expected SPMD mode with uninitialized runtime.");
+    return;
+  }
+
+  // assume this is only called for nested parallel
+  int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
+
+  // unlike actual parallel, threads in the same team do not share
+  // the workTaskDescr in this case and num threads is fixed to 1
+
+  // get current task
+  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
+  currTaskDescr->SaveLoopData();
+
+  // allocate new task descriptor and copy value from current one, set prev to
+  // it
+  omptarget_nvptx_TaskDescr *newTaskDescr =
+      (omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr),
+                                              "new seq parallel task");
+  newTaskDescr->CopyParent(currTaskDescr);
+
+  // tweak values for serialized parallel case:
+  // - each thread becomes ID 0 in its serialized parallel, and
+  // - there is only one thread per team
+  newTaskDescr->ThreadId() = 0;
+
+  // set new task descriptor as top
+  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
+                                                             newTaskDescr);
+}
+
+EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
+                                           uint32_t global_tid) {
+  PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n");
+
+  DecParallelLevel(/*ActiveParallel=*/false);
+
+  if (checkRuntimeUninitialized(loc)) {
+    ASSERT0(LT_FUSSY, checkSPMDMode(loc),
+            "Expected SPMD mode with uninitialized runtime.");
+    return;
+  }
+
+  // pop stack
+  int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
+  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
+  // set new top
+  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
+      threadId, currTaskDescr->GetPrevTaskDescr());
+  // free
+  SafeFree(currTaskDescr, (char *)"new seq parallel task");
+  currTaskDescr = getMyTopTaskDescriptor(threadId);
+  currTaskDescr->RestoreLoopData();
+}
+
+EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) {
+  PRINT0(LD_IO, "call to __kmpc_parallel_level\n");
+  // keep only the bits below OMP_ACTIVE_PARALLEL_LEVEL of this warp's counter
+  return parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1);
+}
+
+// This kmpc call returns the thread id across all teams. Its value is
+// cached by the compiler and used when calling the runtime. On nvptx
+// it's cheap to recalculate this value so we never use the result
+// of this call.
+EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
+  int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
+  return GetOmpThreadId(tid, checkSPMDMode(loc));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// push params
+////////////////////////////////////////////////////////////////////////////////
+
+EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid,
+                                    int32_t num_threads) {
+  PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
+  ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized.");
+  tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); // incoming tid ignored
+  omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
+      num_threads;
+}
+
+EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t tid,
+                                   int32_t simd_limit) {
+  PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", (int)simd_limit);
+  ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized.");
+  tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); // incoming tid ignored
+  omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit;
+}
+
+// Do nothing. The host guarantees we started the requested number of
+// teams and we only need inspection of gridDim.
+
+EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid,
+                                  int32_t num_teams, int32_t thread_limit) {
+  PRINT(LD_IO, "call kmpc_push_num_teams %d\n", (int)num_teams);
+  ASSERT0(LT_FUSSY, FALSE,
+          "should never have anything with new teams on device");
+}
+
+EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t tid,
+                                  int proc_bind) {
+  PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind);
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/final/libomptarget/deviceRTLs/nvptx/src/reduction.cu
new file mode 100644
index 0000000..c925638
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -0,0 +1,536 @@
+//===---- reduction.cu - NVPTX OpenMP reduction implementation --- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of reduction with KMPC interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include <complex.h>
+#include <stdio.h>
+
+#include "omptarget-nvptx.h"
+
+// End-of-reduction entry point. The body is empty, so calling it has no
+// effect; global_tid is not used.
+EXTERN
+void __kmpc_nvptx_end_reduce(int32_t global_tid) {}
+
+// End-of-reduction entry point for the nowait variant. The body is empty, so
+// calling it has no effect; global_tid is not used.
+EXTERN
+void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {}
+
+// 32-bit shuffle-down: forwards val, delta and size to __SHFL_DOWN_SYNC with
+// the full-warp mask 0xFFFFFFFF and returns its result.
+EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
+  return __SHFL_DOWN_SYNC(0xFFFFFFFF, val, delta, size);
+}
+
+// 64-bit shuffle-down built from two 32-bit shuffles: val is split into two
+// 32-bit words, each word is shuffled on its own with the full-warp mask, and
+// the two results are packed back into a 64-bit value.
+EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
+  int LowWord, HighWord;
+  // Unpack val into {LowWord, HighWord}.
+  asm volatile("mov.b64 {%0,%1}, %2;"
+               : "=r"(LowWord), "=r"(HighWord)
+               : "l"(val));
+  HighWord = __SHFL_DOWN_SYNC(0xFFFFFFFF, HighWord, delta, size);
+  LowWord = __SHFL_DOWN_SYNC(0xFFFFFFFF, LowWord, delta, size);
+  // Pack {LowWord, HighWord} back into val.
+  asm volatile("mov.b64 %0, {%1,%2};"
+               : "=l"(val)
+               : "r"(LowWord), "r"(HighWord));
+  return val;
+}
+
+// Reduction over a full warp: calls shflFct with AlgoVersion 0 once for every
+// offset WARPSIZE/2, WARPSIZE/4, ..., 1. The lane id argument is not used by
+// this algorithm version and is passed as 0.
+INLINE static void gpu_regular_warp_reduce(void *reduce_data,
+                                           kmp_ShuffleReductFctPtr shflFct) {
+  uint32_t Offset = WARPSIZE / 2;
+  while (Offset != 0) {
+    shflFct(reduce_data, /*LaneId - not used= */ 0, /*Offset = */ Offset,
+            /*AlgoVersion=*/0);
+    Offset >>= 1;
+  }
+}
+
+// Reduction over the first `size` lanes of a warp, where size need not be a
+// power of two. Each round calls shflFct with AlgoVersion 1 and an offset of
+// half the remaining lane count (rounded down), then shrinks the remaining
+// count to its half (rounded up). tid is passed through as the lane id.
+INLINE static void gpu_irregular_warp_reduce(void *reduce_data,
+                                             kmp_ShuffleReductFctPtr shflFct,
+                                             uint32_t size, uint32_t tid) {
+  uint32_t Remaining = size;
+  for (uint32_t Offset = Remaining / 2; Offset > 0; Offset = Remaining / 2) {
+    shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/Offset,
+            /*AlgoVersion=*/1);
+    Remaining = (Remaining + 1) / 2;
+  }
+}
+
+// Reduces reduce_data across the currently active lanes of a warp when those
+// lanes are not necessarily contiguous, using shflFct with AlgoVersion 2.
+// Returns 1 in the lane whose logical lane id ends up as 0 and 0 in every
+// other lane.
+INLINE static uint32_t
+gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
+  uint32_t lanemask_lt;
+  uint32_t lanemask_gt;
+  uint32_t size, remote_id, physical_lane_id;
+  physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
+  // Mask of the lanes whose id is lower than this lane's id.
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
+  uint32_t Liveness = __ACTIVEMASK();
+  // Rank of this lane among the active lanes, pre-multiplied by 2 because the
+  // loop below halves it before its first use.
+  uint32_t logical_lane_id = __popc(Liveness & lanemask_lt) * 2;
+  // Mask of the lanes whose id is higher than this lane's id.
+  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(lanemask_gt));
+  do {
+    // The active mask is re-read every round: a lane leaves the loop once its
+    // logical id becomes odd.
+    Liveness = __ACTIVEMASK();
+    // 1-based position of the next active lane above this one (0 if none).
+    remote_id = __ffs(Liveness & lanemask_gt);
+    size = __popc(Liveness);
+    logical_lane_id /= 2;
+    // The offset is the distance from this lane to that next active lane.
+    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
+            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
+  } while (logical_lane_id % 2 == 0 && size > 1);
+  return (logical_lane_id == 0);
+}
+
+// Reduction across the lanes of a simd warp. Returns nonzero in the lane that
+// holds the reduced value and 0 elsewhere. global_tid, num_vars, reduce_size
+// and cpyFct are not used here.
+EXTERN
+int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars,
+                                        size_t reduce_size, void *reduce_data,
+                                        kmp_ShuffleReductFctPtr shflFct,
+                                        kmp_InterWarpCopyFctPtr cpyFct) {
+  const uint32_t ActiveLanes = __ACTIVEMASK();
+  const bool IsFullWarp = ActiveLanes == 0xffffffff;
+  if (!IsFullWarp) {
+    // Some lanes are inactive: result on the first active lane.
+    return gpu_irregular_simd_reduce(reduce_data, shflFct);
+  }
+  // Every lane is active: result on lane 0 of the simd warp.
+  gpu_regular_warp_reduce(reduce_data, shflFct);
+  return GetThreadIdInBlock() % WARPSIZE == 0;
+}
+
+// Shared implementation of the intra-team (parallel region) reduction.
+//
+// reduce_data is combined through shflFct within each warp and, when more
+// than WARPSIZE threads take part, gathered into warp 0 through cpyFct and
+// combined once more. Returns nonzero in the thread that holds the reduced
+// value and 0 in the other threads.
+//
+// num_vars and reduce_size are not used. global_tid and
+// isRuntimeUninitialized are only consulted on the pre-sm_70 path.
+INLINE
+static int32_t nvptx_parallel_reduce_nowait(
+    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
+    bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
+  uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
+  uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode);
+  // A single thread already holds the final value.
+  if (NumThreads == 1)
+    return 1;
+  /*
+   * This reduce function handles reduction within a team. It handles
+   * parallel regions in both L1 and L2 parallelism levels. It also
+   * supports Generic, SPMD, and NoOMP modes.
+   *
+   * 1. Reduce within a warp.
+   * 2. Warp master copies value to warp 0 via shared memory.
+   * 3. Warp 0 reduces to a single value.
+   * 4. The reduced value is available in the thread that returns 1.
+   */
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  // On sm_70 and newer the warp shape is derived from NumThreads instead of
+  // from the active mask.
+  uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
+  uint32_t WarpId = BlockThreadId / WARPSIZE;
+
+  // Volta execution model:
+  // For the Generic execution mode a parallel region either has 1 thread and
+  // beyond that, always a multiple of 32. For the SPMD execution mode we may
+  // have any number of threads.
+  //
+  // Every warp but the last is full; the last one is full only when
+  // NumThreads is a multiple of WARPSIZE.
+  if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1))
+    gpu_regular_warp_reduce(reduce_data, shflFct);
+  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
+    gpu_irregular_warp_reduce(reduce_data, shflFct,
+                              /*LaneCount=*/NumThreads % WARPSIZE,
+                              /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
+
+  // When we have more than [warpsize] number of threads
+  // a block reduction is performed here.
+  //
+  // Only L1 parallel region can enter this if condition.
+  if (NumThreads > WARPSIZE) {
+    // Gather all the reduced values from each warp
+    // to the first warp.
+    cpyFct(reduce_data, WarpsNeeded);
+
+    if (WarpId == 0)
+      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
+                                BlockThreadId);
+  }
+  return BlockThreadId == 0;
+#else
+  // Before sm_70 the warp shape is taken from the active mask.
+  uint32_t Liveness = __ACTIVEMASK();
+  if (Liveness == 0xffffffff) // Full warp
+    gpu_regular_warp_reduce(reduce_data, shflFct);
+  // Liveness & (Liveness + 1) is zero exactly when the active lanes are the
+  // lowest k lanes of the warp.
+  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
+    gpu_irregular_warp_reduce(reduce_data, shflFct,
+                              /*LaneCount=*/__popc(Liveness),
+                              /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
+  else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2
+                                    // parallel region may enter here; return
+                                    // early.
+    return gpu_irregular_simd_reduce(reduce_data, shflFct);
+
+  // When we have more than [warpsize] number of threads
+  // a block reduction is performed here.
+  //
+  // Only L1 parallel region can enter this if condition.
+  if (NumThreads > WARPSIZE) {
+    uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
+    // Gather all the reduced values from each warp
+    // to the first warp.
+    cpyFct(reduce_data, WarpsNeeded);
+
+    uint32_t WarpId = BlockThreadId / WARPSIZE;
+    if (WarpId == 0)
+      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
+                                BlockThreadId);
+
+    return BlockThreadId == 0;
+  } else if (isRuntimeUninitialized /* Never an L2 parallel region without the OMP runtime */) {
+    return BlockThreadId == 0;
+  }
+
+  // Get the OMP thread Id. This is different from BlockThreadId in the case of
+  // an L2 parallel region.
+  return global_tid == 0;
+#endif // __CUDA_ARCH__ >= 700
+}
+
+// Deprecated entry point of the intra-team reduction. The execution mode and
+// the runtime state come from the global execution parameters instead of a
+// source location.
+EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait(
+    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
+  const bool InSPMDMode = isSPMDMode();
+  const bool NoRuntime = isRuntimeUninitialized();
+  return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
+                                      reduce_data, shflFct, cpyFct, InSPMDMode,
+                                      NoRuntime);
+}
+
+// Entry point of the intra-team reduction that derives the execution mode and
+// the runtime state from the source location loc.
+EXTERN
+int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
+    kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
+    void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
+    kmp_InterWarpCopyFctPtr cpyFct) {
+  const bool InSPMDMode = checkSPMDMode(loc);
+  const bool NoRuntime = checkRuntimeUninitialized(loc);
+  return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
+                                      reduce_data, shflFct, cpyFct, InSPMDMode,
+                                      NoRuntime);
+}
+
+// Entry point of the intra-team reduction with the mode fixed by the caller:
+// SPMD execution, OpenMP runtime uninitialized.
+EXTERN
+int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
+    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
+  const bool InSPMDMode = true;
+  const bool NoRuntime = true;
+  return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
+                                      reduce_data, shflFct, cpyFct, InSPMDMode,
+                                      NoRuntime);
+}
+
+// Entry point of the intra-team reduction with the mode fixed by the caller:
+// generic (non-SPMD) execution, OpenMP runtime uninitialized.
+EXTERN
+int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic(
+    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
+  const bool InSPMDMode = false;
+  const bool NoRuntime = true;
+  return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
+                                      reduce_data, shflFct, cpyFct, InSPMDMode,
+                                      NoRuntime);
+}
+
+// Shared implementation of the cross-team reduction that goes through the
+// teams reduction scratchpad.
+//
+// Thread 0 of every team copies its value into the scratchpad with scratchFct
+// and then bumps the timestamp counter. The team that reads NumTeams - 1 from
+// the counter is the last one to arrive; its threads load the scratchpad
+// entries with ldFct and combine them with shflFct / cpyFct.
+//
+// Returns 1 only in thread 0 of that last team and 0 everywhere else.
+// global_tid, num_vars and reduce_size are not used.
+INLINE
+static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
+                                         size_t reduce_size, void *reduce_data,
+                                         kmp_ShuffleReductFctPtr shflFct,
+                                         kmp_InterWarpCopyFctPtr cpyFct,
+                                         kmp_CopyToScratchpadFctPtr scratchFct,
+                                         kmp_LoadReduceFctPtr ldFct,
+                                         bool isSPMDExecutionMode) {
+  uint32_t ThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
+  // In non-generic mode all workers participate in the teams reduction.
+  // In generic mode only the team master participates in the teams
+  // reduction because the workers are waiting for parallel work.
+  uint32_t NumThreads =
+      isSPMDExecutionMode ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
+                          : /*Master thread only*/ 1;
+  uint32_t TeamId = GetBlockIdInKernel();
+  uint32_t NumTeams = GetNumberOfBlocksInKernel();
+  // Written by thread 0 below and read by every thread after the barrier.
+  __shared__ volatile bool IsLastTeam;
+
+  // Team masters of all teams write to the scratchpad.
+  if (ThreadId == 0) {
+    unsigned int *timestamp = GetTeamsReductionTimestamp();
+    char *scratchpad = GetTeamsReductionScratchpad();
+
+    scratchFct(reduce_data, scratchpad, TeamId, NumTeams);
+    // Fence between the scratchpad write and the counter update.
+    __threadfence();
+
+    // atomicInc increments 'timestamp' and has a range [0, NumTeams-1].
+    // It resets 'timestamp' back to 0 once the last team increments
+    // this counter.
+    unsigned val = atomicInc(timestamp, NumTeams - 1);
+    IsLastTeam = val == NumTeams - 1;
+  }
+
+  // We have to wait on L1 barrier because in GENERIC mode the workers
+  // are waiting on barrier 0 for work.
+  //
+  // If we guard this barrier as follows it leads to deadlock, probably
+  // because of a compiler bug: if (!IsGenericMode()) __syncthreads();
+  uint16_t SyncWarps = (NumThreads + WARPSIZE - 1) / WARPSIZE;
+  named_sync(L1_BARRIER, SyncWarps * WARPSIZE);
+
+  // If this team is not the last, quit.
+  if (/* Volatile read by all threads */ !IsLastTeam)
+    return 0;
+
+  //
+  // Last team processing.
+  //
+
+  // Threads in excess of #teams do not participate in reduction of the
+  // scratchpad values.
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  // With fewer teams than threads, use a single thread when there is less
+  // than one warp's worth of teams, otherwise NumTeams rounded down to a
+  // multiple of WARPSIZE.
+  uint32_t ActiveThreads = NumThreads;
+  if (NumTeams < NumThreads) {
+    ActiveThreads =
+        (NumTeams < WARPSIZE) ? 1 : NumTeams & ~((uint16_t)WARPSIZE - 1);
+  }
+  if (ThreadId >= ActiveThreads)
+    return 0;
+
+  // Load from scratchpad and reduce.
+  char *scratchpad = GetTeamsReductionScratchpad();
+  ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);
+  // Each thread folds in the entries that lie ActiveThreads apart from its
+  // own.
+  for (uint32_t i = ActiveThreads + ThreadId; i < NumTeams; i += ActiveThreads)
+    ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);
+
+  uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
+  uint32_t WarpId = ThreadId / WARPSIZE;
+
+  // Reduce across warps to the warp master.
+  if ((ActiveThreads % WARPSIZE == 0) ||
+      (WarpId < WarpsNeeded - 1)) // Full warp
+    gpu_regular_warp_reduce(reduce_data, shflFct);
+  else if (ActiveThreads > 1) // Partial warp but contiguous lanes
+    // Only SPMD execution mode comes thru this case.
+    gpu_irregular_warp_reduce(reduce_data, shflFct,
+                              /*LaneCount=*/ActiveThreads % WARPSIZE,
+                              /*LaneId=*/ThreadId % WARPSIZE);
+
+  // When we have more than [warpsize] number of threads
+  // a block reduction is performed here.
+  if (ActiveThreads > WARPSIZE) {
+    // Gather all the reduced values from each warp
+    // to the first warp.
+    cpyFct(reduce_data, WarpsNeeded);
+
+    if (WarpId == 0)
+      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
+  }
+#else
+  if (ThreadId >= NumTeams)
+    return 0;
+
+  // Load from scratchpad and reduce.
+  char *scratchpad = GetTeamsReductionScratchpad();
+  ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);
+  // Each thread folds in the entries that lie NumThreads apart from its own.
+  for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
+    ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);
+
+  // Reduce across warps to the warp master.
+  uint32_t Liveness = __ACTIVEMASK();
+  if (Liveness == 0xffffffff) // Full warp
+    gpu_regular_warp_reduce(reduce_data, shflFct);
+  else // Partial warp but contiguous lanes
+    gpu_irregular_warp_reduce(reduce_data, shflFct,
+                              /*LaneCount=*/__popc(Liveness),
+                              /*LaneId=*/ThreadId % WARPSIZE);
+
+  // When we have more than [warpsize] number of threads
+  // a block reduction is performed here.
+  uint32_t ActiveThreads = NumTeams < NumThreads ? NumTeams : NumThreads;
+  if (ActiveThreads > WARPSIZE) {
+    uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
+    // Gather all the reduced values from each warp
+    // to the first warp.
+    cpyFct(reduce_data, WarpsNeeded);
+
+    uint32_t WarpId = ThreadId / WARPSIZE;
+    if (WarpId == 0)
+      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
+  }
+#endif // __CUDA_ARCH__ >= 700
+
+  // The final value ends up in thread 0 of the last team.
+  return ThreadId == 0;
+}
+
+// Entry point of the scratchpad-based cross-team reduction. The execution
+// mode is taken from the global execution parameters.
+EXTERN
+int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
+                                         size_t reduce_size, void *reduce_data,
+                                         kmp_ShuffleReductFctPtr shflFct,
+                                         kmp_InterWarpCopyFctPtr cpyFct,
+                                         kmp_CopyToScratchpadFctPtr scratchFct,
+                                         kmp_LoadReduceFctPtr ldFct) {
+  const bool InSPMDMode = isSPMDMode();
+  return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
+                                   reduce_data, shflFct, cpyFct, scratchFct,
+                                   ldFct, InSPMDMode);
+}
+
+// Entry point of the scratchpad-based cross-team reduction with the execution
+// mode fixed to SPMD by the caller.
+EXTERN
+int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd(
+    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
+    kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
+  const bool InSPMDMode = true;
+  return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
+                                   reduce_data, shflFct, cpyFct, scratchFct,
+                                   ldFct, InSPMDMode);
+}
+
+// Entry point of the scratchpad-based cross-team reduction with the execution
+// mode fixed to generic (non-SPMD) by the caller.
+EXTERN
+int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic(
+    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
+    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
+    kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
+  const bool InSPMDMode = false;
+  return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
+                                   reduce_data, shflFct, cpyFct, scratchFct,
+                                   ldFct, InSPMDMode);
+}
+
+// Start of the lock-based cross-team reduction. In SPMD mode every thread
+// except thread 0 of the block returns 0 immediately. The remaining caller
+// spins until it flips *crit from 0 to 1 and then returns 1; the lock is
+// released by __kmpc_nvptx_teams_end_reduce_nowait_simple. global_tid is not
+// used.
+EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc,
+                                                       int32_t global_tid,
+                                                       kmp_CriticalName *crit) {
+  if (checkSPMDMode(loc) && GetThreadIdInBlock() != 0)
+    return 0;
+  // The master thread of the team actually does the reduction.
+  uint32_t *Lock = (uint32_t *)crit;
+  // atomicCAS returns the previous value: nonzero means the lock is taken.
+  while (atomicCAS(Lock, 0, 1)) {
+  }
+  return 1;
+}
+
+// End of the lock-based cross-team reduction: issues a system-wide memory
+// fence and then atomically resets *crit to 0, the value that
+// __kmpc_nvptx_teams_reduce_nowait_simple waits for. loc and global_tid are
+// not used.
+EXTERN void
+__kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc, int32_t global_tid,
+                                            kmp_CriticalName *crit) {
+  // Fence before the lock word is cleared.
+  __threadfence_system();
+  (void)atomicExch((uint32_t *)crit, 0);
+}
+
+// True when loc denotes generic mode, or otherwise when IsTeamMaster accepts
+// ThreadId.
+INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) {
+  if (checkGenericMode(loc))
+    return true;
+  return IsTeamMaster(ThreadId);
+}
+
+// Rounds s down to a multiple of WARPSIZE by clearing its low bits. Values
+// below WARPSIZE yield 1 rather than 0.
+INLINE static uint32_t roundToWarpsize(uint32_t s) {
+  return (s < WARPSIZE) ? 1 : (s & ~(unsigned)(WARPSIZE - 1));
+}
+
+// Device-wide state of __kmpc_nvptx_teams_reduce_nowait_v2.
+// IterCnt: lowest team id of the chunk of teams currently allowed to write to
+// the global buffer; advanced by num_of_records whenever a chunk completes.
+__device__ static volatile uint32_t IterCnt = 0;
+// Cnt: number of teams of the current chunk that have written their value;
+// atomicInc wraps it back to 0 once it has reached num_of_records - 1.
+__device__ static volatile uint32_t Cnt = 0;
+// Cross-team reduction through a caller-provided global buffer holding
+// num_of_records records.
+//
+// The master thread of every team stores its value into record
+// TeamId % num_of_records: lgcpyFct is used by the first num_of_records
+// teams, lgredFct by all later ones. Teams whose id lies beyond the current
+// chunk spin until IterCnt lets them in. The last team to arrive then loads
+// the records with glcpyFct / glredFct and reduces them with
+// shflFct / cpyFct.
+//
+// Returns 1 only in the master thread of that last team, after it has reset
+// Cnt and IterCnt to 0; every other thread returns 0.
+EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
+    kmp_Ident *loc, int32_t global_tid, void *global_buffer,
+    int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
+    kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct,
+    kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct,
+    kmp_ListGlobalFctPtr glredFct) {
+
+  // Terminate all threads in non-SPMD mode except for the master thread.
+  if (checkGenericMode(loc) && GetThreadIdInBlock() != GetMasterThreadID())
+    return 0;
+
+  uint32_t ThreadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
+
+  // In non-generic mode all workers participate in the teams reduction.
+  // In generic mode only the team master participates in the teams
+  // reduction because the workers are waiting for parallel work.
+  uint32_t NumThreads =
+      checkSPMDMode(loc) ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
+                         : /*Master thread only*/ 1;
+  uint32_t TeamId = GetBlockIdInKernel();
+  uint32_t NumTeams = GetNumberOfBlocksInKernel();
+  // Both are written by the master thread only and read by every thread of
+  // the team further down.
+  __shared__ unsigned Bound;
+  __shared__ unsigned ChunkTeamCount;
+
+  // Block progress for teams greater than the current upper
+  // limit. We always only allow a number of teams less or equal
+  // to the number of slots in the buffer.
+  bool IsMaster = isMaster(loc, ThreadId);
+  while (IsMaster) {
+    // Atomic read
+    Bound = atomicAdd((uint32_t *)&IterCnt, 0);
+    if (TeamId < Bound + num_of_records)
+      break;
+  }
+
+  if (IsMaster) {
+    // Record of the global buffer that belongs to this team.
+    int ModBockId = TeamId % num_of_records;
+    // The first num_of_records teams go through lgcpyFct, all later teams
+    // through lgredFct.
+    if (TeamId < num_of_records)
+      lgcpyFct(global_buffer, ModBockId, reduce_data);
+    else
+      lgredFct(global_buffer, ModBockId, reduce_data);
+    // Fence between the buffer update and the counter update.
+    __threadfence_system();
+
+    // Increment team counter.
+    // This counter is incremented by all teams in the current
+    // BUFFER_SIZE chunk.
+    ChunkTeamCount = atomicInc((uint32_t *)&Cnt, num_of_records - 1);
+  }
+  // Synchronize
+  if (checkSPMDMode(loc))
+    __kmpc_barrier(loc, global_tid);
+
+  // reduce_data is global or shared so before being reduced within the
+  // warp we need to bring it in local memory:
+  // local_reduce_data = reduce_data[i]
+  //
+  // Example for 3 reduction variables a, b, c (of potentially different
+  // types):
+  //
+  // buffer layout (struct of arrays):
+  // a, a, ..., a, b, b, ... b, c, c, ... c
+  // |__________|
+  //     num_of_records
+  //
+  // local_data_reduce layout (struct):
+  // a, b, c
+  //
+  // Each thread will have a local struct containing the values to be
+  // reduced:
+  //      1. do reduction within each warp.
+  //      2. do reduction across warps.
+  //      3. write the final result to the main reduction variable
+  //         by returning 1 in the thread holding the reduction result.
+
+  // Check if this is the very last team.
+  unsigned NumRecs = min(NumTeams, num_of_records);
+  if (ChunkTeamCount == NumTeams - Bound - 1) {
+    //
+    // Last team processing.
+    //
+    if (ThreadId >= NumRecs)
+      return 0;
+    // Use either a single thread or a whole number of warps.
+    NumThreads = roundToWarpsize(min(NumThreads, NumRecs));
+    if (ThreadId >= NumThreads)
+      return 0;
+
+    // Load from buffer and reduce.
+    glcpyFct(global_buffer, ThreadId, reduce_data);
+    // Each thread folds in the records that lie NumThreads apart from its
+    // own.
+    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
+      glredFct(global_buffer, i, reduce_data);
+
+    // Reduce across warps to the warp master.
+    if (NumThreads > 1) {
+      gpu_regular_warp_reduce(reduce_data, shflFct);
+
+      // When we have more than [warpsize] number of threads
+      // a block reduction is performed here.
+      uint32_t ActiveThreads = min(NumRecs, NumThreads);
+      if (ActiveThreads > WARPSIZE) {
+        uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
+        // Gather all the reduced values from each warp
+        // to the first warp.
+        cpyFct(reduce_data, WarpsNeeded);
+
+        uint32_t WarpId = ThreadId / WARPSIZE;
+        if (WarpId == 0)
+          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
+                                    ThreadId);
+      }
+    }
+
+    if (IsMaster) {
+      // Reset the device-wide counters before reporting the result.
+      Cnt = 0;
+      IterCnt = 0;
+      return 1;
+    }
+    return 0;
+  }
+  // This team was the last one of its chunk, but not the last team overall.
+  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
+    // Allow SIZE number of teams to proceed writing their
+    // intermediate results to the global buffer.
+    atomicAdd((uint32_t *)&IterCnt, num_of_records);
+  }
+
+  return 0;
+}
+
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/state-queue.h b/final/libomptarget/deviceRTLs/nvptx/src/state-queue.h
new file mode 100644
index 0000000..9d7576b
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/state-queue.h
@@ -0,0 +1,51 @@
+//===-------- state-queue.h - NVPTX OpenMP GPU State Queue ------- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a queue to hand out OpenMP state objects to teams of
+// one or more kernels.
+//
+// Reference:
+// Thomas R.W. Scogland and Wu-chun Feng. 2015.
+// Design and Evaluation of Scalable Concurrent Queues for Many-Core
+// Architectures. International Conference on Performance Engineering.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __STATE_QUEUE_H
+#define __STATE_QUEUE_H
+
+#include <stdint.h>
+
+#include "option.h" // choices we have
+
+// Queue with SIZE slots that hands out pointers to ElementType objects, using
+// tickets to pick a slot and per-slot ids to order the accesses to it (see
+// the reference in the file header). The member functions are defined in
+// state-queuei.h.
+template <typename ElementType, uint32_t SIZE> class omptarget_nvptx_Queue {
+private:
+  // Backing objects: Dequeue returns &elements[slot] when the slot holds a
+  // null pointer.
+  ElementType elements[SIZE];
+  // One element pointer per slot, written by PushElement and read by
+  // PopElement.
+  volatile ElementType *elementQueue[SIZE];
+  // Ticket counter for dequeues.
+  volatile uint32_t head;
+  // Per slot: the id that IsServing compares against, updated by DoneServing.
+  volatile uint32_t ids[SIZE];
+  // Ticket counter for enqueues.
+  volatile uint32_t tail;
+
+  // Stored ids are kept modulo MAX_ID (see DoneServing).
+  static const uint32_t MAX_ID = (1u << 31) / SIZE / 2;
+  INLINE uint32_t ENQUEUE_TICKET();
+  INLINE uint32_t DEQUEUE_TICKET();
+  INLINE static uint32_t ID(uint32_t ticket);
+  INLINE bool IsServing(uint32_t slot, uint32_t id);
+  INLINE void PushElement(uint32_t slot, ElementType *element);
+  INLINE ElementType *PopElement(uint32_t slot);
+  INLINE void DoneServing(uint32_t slot, uint32_t id);
+
+public:
+  // The constructor body is empty: no member is initialised here.
+  INLINE omptarget_nvptx_Queue() {}
+  // Stores element in the slot selected by the next enqueue ticket.
+  INLINE void Enqueue(ElementType *element);
+  // Returns the element of the slot selected by the next dequeue ticket.
+  INLINE ElementType *Dequeue();
+};
+
+#include "state-queuei.h"
+
+#endif
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/state-queuei.h b/final/libomptarget/deviceRTLs/nvptx/src/state-queuei.h
new file mode 100644
index 0000000..3c3be11
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/state-queuei.h
@@ -0,0 +1,89 @@
+//===------- state-queuei.h - NVPTX OpenMP GPU State Queue ------- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of a queue to hand out OpenMP state
+// objects to teams of one or more kernels.
+//
+// Reference:
+// Thomas R.W. Scogland and Wu-chun Feng. 2015.
+// Design and Evaluation of Scalable Concurrent Queues for Many-Core
+// Architectures. International Conference on Performance Engineering.
+//
+//===----------------------------------------------------------------------===//
+
+#include "state-queue.h"
+
+// Draws the next enqueue ticket: atomically increments tail and returns the
+// value it had before the increment.
+template <typename ElementType, uint32_t SIZE>
+INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ENQUEUE_TICKET() {
+  return atomicAdd((unsigned int *)&tail, 1);
+}
+
+// Draws the next dequeue ticket: atomically increments head and returns the
+// value it had before the increment.
+template <typename ElementType, uint32_t SIZE>
+INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::DEQUEUE_TICKET() {
+  return atomicAdd((unsigned int *)&head, 1);
+}
+
+// Maps a ticket to the even id of its round: twice the number of complete
+// passes over the SIZE slots that precede the ticket. Dequeue waits for this
+// id, Enqueue for the odd id that follows it.
+// NOTE(review): DoneServing stores ids modulo MAX_ID while this value is not
+// reduced modulo MAX_ID - confirm tickets cannot grow into that range.
+template <typename ElementType, uint32_t SIZE>
+INLINE uint32_t
+omptarget_nvptx_Queue<ElementType, SIZE>::ID(uint32_t ticket) {
+  const uint32_t Round = ticket / SIZE;
+  return Round * 2;
+}
+
+// Tells whether slot is currently serving id. ids[slot] is read atomically by
+// adding 0 to it.
+template <typename ElementType, uint32_t SIZE>
+INLINE bool omptarget_nvptx_Queue<ElementType, SIZE>::IsServing(uint32_t slot,
+                                                                uint32_t id) {
+  return atomicAdd((unsigned int *)&ids[slot], 0) == id;
+}
+
+// Atomically stores the element pointer into elementQueue[slot]. The pointer
+// is exchanged as an unsigned long long; the previous content is discarded.
+template <typename ElementType, uint32_t SIZE>
+INLINE void
+omptarget_nvptx_Queue<ElementType, SIZE>::PushElement(uint32_t slot,
+                                                      ElementType *element) {
+  atomicExch((unsigned long long *)&elementQueue[slot],
+             (unsigned long long)element);
+}
+
+// Atomically reads the element pointer stored in elementQueue[slot] by adding
+// 0 to it. The slot content is left unchanged.
+template <typename ElementType, uint32_t SIZE>
+INLINE ElementType *
+omptarget_nvptx_Queue<ElementType, SIZE>::PopElement(uint32_t slot) {
+  return (ElementType *)atomicAdd((unsigned long long *)&elementQueue[slot],
+                                  (unsigned long long)0);
+}
+
+// Hands slot over to the next id: atomically stores (id + 1) modulo MAX_ID
+// into ids[slot].
+template <typename ElementType, uint32_t SIZE>
+INLINE void omptarget_nvptx_Queue<ElementType, SIZE>::DoneServing(uint32_t slot,
+                                                                  uint32_t id) {
+  atomicExch((unsigned int *)&ids[slot], (id + 1) % MAX_ID);
+}
+
+// Puts element into the queue: draws an enqueue ticket, spins until the
+// ticket's slot serves the odd id of the ticket's round, stores the pointer
+// and hands the slot over to the next id.
+template <typename ElementType, uint32_t SIZE>
+INLINE void
+omptarget_nvptx_Queue<ElementType, SIZE>::Enqueue(ElementType *element) {
+  const uint32_t Ticket = ENQUEUE_TICKET();
+  const uint32_t Slot = Ticket % SIZE;
+  const uint32_t TurnId = ID(Ticket) + 1;
+  // Busy-wait until it is this ticket's turn on the slot.
+  while (!IsServing(Slot, TurnId)) {
+  }
+  PushElement(Slot, element);
+  DoneServing(Slot, TurnId);
+}
+
+// Takes an element out of the queue: draws a dequeue ticket, spins until the
+// ticket's slot serves the even id of the ticket's round, reads the pointer
+// and hands the slot over to the next id.
+template <typename ElementType, uint32_t SIZE>
+INLINE ElementType *omptarget_nvptx_Queue<ElementType, SIZE>::Dequeue() {
+  const uint32_t Ticket = DEQUEUE_TICKET();
+  const uint32_t Slot = Ticket % SIZE;
+  const uint32_t TurnId = ID(Ticket);
+  // Busy-wait until it is this ticket's turn on the slot.
+  while (!IsServing(Slot, TurnId)) {
+  }
+  ElementType *Result = PopElement(Slot);
+  // A null pointer means nothing was stored in the slot. Because of the lack
+  // of GPU constructors the queue is populated here, from the backing array.
+  if (Result == 0)
+    Result = &elements[Slot];
+  DoneServing(Slot, TurnId);
+  return Result;
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/support.h b/final/libomptarget/deviceRTLs/nvptx/src/support.h
new file mode 100644
index 0000000..4df75ed
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/support.h
@@ -0,0 +1,95 @@
+//===--------- support.h - NVPTX OpenMP support functions -------- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Wrapper to some functions natively supported by the GPU.
+//
+//===----------------------------------------------------------------------===//
+
+////////////////////////////////////////////////////////////////////////////////
+// Execution Parameters
+////////////////////////////////////////////////////////////////////////////////
+// Execution mode stored in bit 0 of the execution parameters; ModeMask
+// selects that bit.
+enum ExecutionMode {
+  Generic = 0x00u,
+  Spmd = 0x01u,
+  ModeMask = 0x01u,
+};
+
+// Runtime state stored in bit 1 of the execution parameters; RuntimeMask
+// selects that bit.
+enum RuntimeMode {
+  RuntimeInitialized = 0x00u,
+  RuntimeUninitialized = 0x02u,
+  RuntimeMask = 0x02u,
+};
+
+// Setter and predicates for the execution parameters (defined in supporti.h).
+// Stores the combination of EMode and RMode.
+INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode);
+// Test the execution mode bit.
+INLINE bool isGenericMode();
+INLINE bool isSPMDMode();
+// Test the runtime state bit.
+INLINE bool isRuntimeUninitialized();
+INLINE bool isRuntimeInitialized();
+
+////////////////////////////////////////////////////////////////////////////////
+// get info from machine
+////////////////////////////////////////////////////////////////////////////////
+
+// get low level ids of resources
+INLINE int GetThreadIdInBlock();
+INLINE int GetBlockIdInKernel();
+INLINE int GetNumberOfBlocksInKernel();
+INLINE int GetNumberOfThreadsInBlock();
+INLINE unsigned GetWarpId();
+INLINE unsigned GetLaneId();
+
+// get global ids to locate thread/team info (constant regardless of OMP)
+INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode);
+INLINE int GetMasterThreadID();
+INLINE int GetNumberOfWorkersInTeam();
+
+// get OpenMP thread and team ids
+INLINE int GetOmpThreadId(int threadId,
+                          bool isSPMDExecutionMode);    // omp_thread_num
+INLINE int GetOmpTeamId();                              // omp_team_num
+
+// get OpenMP number of threads and teams
+INLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads
+INLINE int GetNumberOfOmpTeams();                           // omp_num_teams
+
+// get OpenMP number of procs
+INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode);
+INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);
+
+// masters
+INLINE int IsTeamMaster(int ompThreadId);
+
+// Parallel level
+INLINE void IncParallelLevel(bool ActiveParallel);
+INLINE void DecParallelLevel(bool ActiveParallel);
+
+////////////////////////////////////////////////////////////////////////////////
+// Memory
+////////////////////////////////////////////////////////////////////////////////
+
+// safe alloc and free; msg is presumably used for diagnostics - TODO confirm
+// against the definitions, which are not part of this header
+INLINE void *SafeMalloc(size_t size, const char *msg); // check if success
+INLINE void *SafeFree(void *ptr, const char *msg);
+// pad to an alignment (power of 2 only)
+INLINE unsigned long PadBytes(unsigned long size, unsigned long alignment);
+// _addr advanced by _bytes bytes, as a void pointer
+#define ADD_BYTES(_addr, _bytes)                                               \
+  ((void *)((char *)((void *)(_addr)) + (_bytes)))
+// _addr moved back by _bytes bytes, as a void pointer
+#define SUB_BYTES(_addr, _bytes)                                               \
+  ((void *)((char *)((void *)(_addr)) - (_bytes)))
+
+////////////////////////////////////////////////////////////////////////////////
+// Named Barrier Routines
+////////////////////////////////////////////////////////////////////////////////
+// Synchronizes on the barrier identified by `barrier`. num_threads is
+// presumably the number of threads expected at the barrier - TODO confirm
+// against the definition.
+INLINE void named_sync(const int barrier, const int num_threads);
+
+////////////////////////////////////////////////////////////////////////////////
+// Teams Reduction Scratchpad Helpers
+////////////////////////////////////////////////////////////////////////////////
+// Accessors for the counter and the data area used by the scratchpad-based
+// teams reduction, plus the setter for the scratchpad pointer.
+INLINE unsigned int *GetTeamsReductionTimestamp();
+INLINE char *GetTeamsReductionScratchpad();
+INLINE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr);
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/supporti.h b/final/libomptarget/deviceRTLs/nvptx/src/supporti.h
new file mode 100644
index 0000000..ceb3951
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -0,0 +1,292 @@
+//===--------- supporti.h - NVPTX OpenMP support functions ------- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Wrapper implementation to some functions natively supported by the GPU.
+//
+//===----------------------------------------------------------------------===//
+
+////////////////////////////////////////////////////////////////////////////////
+// Execution Parameters
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) {
+  execution_param = EMode;
+  execution_param |= RMode;
+}
+
+INLINE bool isGenericMode() { return (execution_param & ModeMask) == Generic; }
+
+INLINE bool isSPMDMode() { return (execution_param & ModeMask) == Spmd; }
+
+INLINE bool isRuntimeUninitialized() {
+  return (execution_param & RuntimeMask) == RuntimeUninitialized;
+}
+
+INLINE bool isRuntimeInitialized() {
+  return (execution_param & RuntimeMask) == RuntimeInitialized;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Execution Modes based on location parameter fields
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE bool checkSPMDMode(kmp_Ident *loc) {
+  if (!loc)
+    return isSPMDMode();
+
+  // If the SPMD flag is set we are not in the UNDEFINED state, so
+  // we can return immediately.
+  if (loc->reserved_2 & KMP_IDENT_SPMD_MODE)
+    return true;
+
+  // Not in SPMD mode with the runtime required is a valid
+  // combination of flags, so we can return immediately.
+  if (!(loc->reserved_2 & KMP_IDENT_SIMPLE_RT_MODE))
+    return false;
+
+  // We are in the undefined state: fall back to the global execution mode.
+  return isSPMDMode();
+}
+
+INLINE bool checkGenericMode(kmp_Ident *loc) {
+  return !checkSPMDMode(loc);
+}
+
+INLINE bool checkRuntimeUninitialized(kmp_Ident *loc) {
+  if (!loc)
+    return isRuntimeUninitialized();
+
+  // If runtime is required then we know we can't be
+  // in the undefined mode. We can return immediately.
+  if (!(loc->reserved_2 & KMP_IDENT_SIMPLE_RT_MODE))
+    return false;
+
+  // From here on the runtime is not required, so we need to
+  // check whether we are in SPMD mode. If not in SPMD mode we
+  // end up in the UNDEFINED state that marks the orphaned
+  // functions.
+  if (loc->reserved_2 & KMP_IDENT_SPMD_MODE)
+    return true;
+
+  // We are in the UNDEFINED state. Undefined is denoted by
+  // non-SPMD + noRuntimeRequired which is a combination that
+  // cannot actually happen. The undefined state is used to mark orphaned
+  // functions; defer to the global runtime mode.
+  return isRuntimeUninitialized();
+}
+
+INLINE bool checkRuntimeInitialized(kmp_Ident *loc) {
+  return !checkRuntimeUninitialized(loc);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// support: get info from machine
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Calls to the NVPTX layer  (assuming 1D layout)
+//
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE int GetThreadIdInBlock() { return threadIdx.x; }
+
+INLINE int GetBlockIdInKernel() { return blockIdx.x; }
+
+INLINE int GetNumberOfBlocksInKernel() { return gridDim.x; }
+
+INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; }
+
+INLINE unsigned GetWarpId() { return threadIdx.x / WARPSIZE; }
+
+INLINE unsigned GetLaneId() { return threadIdx.x & (WARPSIZE - 1); }
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Calls to the Generic Scheme Implementation Layer (assuming 1D layout)
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// The master thread id is the first thread (lane) of the last warp.
+// Thread id is 0 indexed.
+// E.g: If NumThreads is 33, master id is 32.
+//      If NumThreads is 64, master id is 32.
+//      If NumThreads is 97, master id is 96.
+//      If NumThreads is 1024, master id is 992.
+//
+// Called in Generic Execution Mode only.
+INLINE int GetMasterThreadID() { return (blockDim.x - 1) & ~(WARPSIZE - 1); }
+
+// The last warp is reserved for the master; other warps are workers.
+// Called in Generic Execution Mode only.
+INLINE int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); }
+
+////////////////////////////////////////////////////////////////////////////////
+// get thread id in team
+
+// This function may be called in a parallel region by the workers
+// or a serial region by the master.  If the master (whose CUDA thread
+// id is GetMasterThreadID()) calls this routine, we return 0 because
+// it is a shadow for the first worker.
+INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) {
+  // Outside SPMD mode, threads at or above the master thread id report
+  // logical id 0; every other thread keeps its CUDA thread id.
+  const int cudaTid = GetThreadIdInBlock();
+  const bool aliasesThreadZero =
+      !isSPMDExecutionMode && cudaTid >= GetMasterThreadID();
+  // Select with a conditional rather than a modulo operation.
+  return aliasesThreadZero ? 0 : cudaTid;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// OpenMP Thread Support Layer
+//
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) {
+  // omp_thread_num
+  // A nesting depth above one always yields thread id 0. The mask presumably
+  // strips the active-level part of the counter - TODO confirm power of two.
+  if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1)
+    return 0;
+  if (isSPMDExecutionMode)
+    return GetThreadIdInBlock();
+
+  // Otherwise the id is read from the thread's top-level task descriptor.
+  omptarget_nvptx_TaskDescr *topTaskDescr =
+      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
+  ASSERT0(LT_FUSSY, topTaskDescr, "expected a top task descr");
+  return topTaskDescr->ThreadId();
+}
+
+INLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode) {
+  // omp_num_threads
+  //
+  // Only the level value OMP_ACTIVE_PARALLEL_LEVEL + 1 (what one active
+  // IncParallelLevel call adds) reports more than a single thread.
+  if (parallelLevel[GetWarpId()] != OMP_ACTIVE_PARALLEL_LEVEL + 1)
+    return 1;
+
+  // SPMD mode: all threads of the CUDA block.
+  if (isSPMDExecutionMode)
+    return GetNumberOfThreadsInBlock();
+  // Otherwise: the recorded team size.
+  return threadsInTeam;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Team id linked to OpenMP
+
+INLINE int GetOmpTeamId() {
+  // omp_team_num
+  return GetBlockIdInKernel(); // assume 1 block per team
+}
+
+INLINE int GetNumberOfOmpTeams() {
+  // omp_num_teams
+  return GetNumberOfBlocksInKernel(); // assume 1 block per team
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Masters
+
+INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
+
+////////////////////////////////////////////////////////////////////////////////
+// Parallel level
+
+INLINE void IncParallelLevel(bool ActiveParallel) {
+  unsigned tnum = __ACTIVEMASK(); // lanes active here (presumably __activemask)
+  int leader = __ffs(tnum) - 1; // index of the lowest set bit in the mask
+  __SHFL_SYNC(tnum, leader, leader); // result unused; presumably a warp sync
+  if (GetLaneId() == leader) { // only the leader lane updates the warp's slot
+    parallelLevel[GetWarpId()] +=
+        (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
+  }
+  __SHFL_SYNC(tnum, leader, leader); // result unused; presumably a warp sync
+}
+
+INLINE void DecParallelLevel(bool ActiveParallel) {
+  unsigned tnum = __ACTIVEMASK(); // lanes active here (presumably __activemask)
+  int leader = __ffs(tnum) - 1; // index of the lowest set bit in the mask
+  __SHFL_SYNC(tnum, leader, leader); // result unused; presumably a warp sync
+  if (GetLaneId() == leader) { // only the leader lane updates the warp's slot
+    parallelLevel[GetWarpId()] -=
+        (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
+  }
+  __SHFL_SYNC(tnum, leader, leader); // result unused; presumably a warp sync
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// get OpenMP number of procs
+
+// Get the number of processors in the device.
+INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode) {
+  // SPMD: all threads of the block; otherwise: the worker threads only.
+  return isSPMDExecutionMode ? GetNumberOfThreadsInBlock()
+                             : GetNumberOfWorkersInTeam();
+}
+
+INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) {
+  return GetNumberOfProcsInDevice(isSPMDExecutionMode);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Memory
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE unsigned long PadBytes(unsigned long size,
+                              unsigned long alignment) { // power of 2 only
+  // Returns the padding that rounds size up to a multiple of alignment.
+  // Zero passes the (a & (a - 1)) test but is not a power of 2: reject it.
+  ASSERT(LT_FUSSY, alignment != 0 && (alignment & (alignment - 1)) == 0,
+         "alignment %lu is not a power of 2\n", alignment);
+  return (~(unsigned long)size + 1) & (alignment - 1);
+}
+
+INLINE void *SafeMalloc(size_t size, const char *msg) // may return NULL
+{
+  void *ptr = malloc(size);
+  PRINT(LD_MEM, "malloc data of size %llu for %s: 0x%llx\n",
+        (unsigned long long)size, msg, (unsigned long long)ptr);
+  return ptr;
+}
+
+INLINE void *SafeFree(void *ptr, const char *msg) {
+  PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", (unsigned long long)ptr, msg);
+  free(ptr);
+  return NULL;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Named Barrier Routines
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE void named_sync(const int barrier, const int num_threads) {
+  asm volatile("bar.sync %0, %1;" // PTX: %0 = barrier id, %1 = thread count
+               :
+               : "r"(barrier), "r"(num_threads)
+               : "memory"); // tells the compiler that memory may be modified
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Teams Reduction Scratchpad Helpers
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE unsigned int *GetTeamsReductionTimestamp() {
+  return static_cast<unsigned int *>(ReductionScratchpadPtr);
+}
+
+INLINE char *GetTeamsReductionScratchpad() {
+  return static_cast<char *>(ReductionScratchpadPtr) + 256;
+}
+
+INLINE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr) {
+  ReductionScratchpadPtr = ScratchpadPtr;
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/sync.cu b/final/libomptarget/deviceRTLs/nvptx/src/sync.cu
new file mode 100644
index 0000000..688420e
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/sync.cu
@@ -0,0 +1,143 @@
+//===----------- sync.cu - NVPTX OpenMP synchronizations --------- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Include all synchronization.
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget-nvptx.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP Ordered calls
+////////////////////////////////////////////////////////////////////////////////
+
+EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t tid) {
+  PRINT0(LD_IO, "call kmpc_ordered\n");
+}
+
+EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t tid) {
+  PRINT0(LD_IO, "call kmpc_end_ordered\n");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP Barriers
+////////////////////////////////////////////////////////////////////////////////
+
+// a team is a block: we can use CUDA native synchronization mechanism
+// FIXME: what if not all threads (warps) participate to the barrier?
+// We may need to implement it differently
+
+EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc_ref, int32_t tid) {
+  PRINT0(LD_IO, "call kmpc_cancel_barrier\n");
+  __kmpc_barrier(loc_ref, tid);
+  PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n");
+  return 0;
+}
+
+EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) {
+  // checkSPMDMode only reads state, so evaluate it once for the whole call.
+  const bool IsSPMD = checkSPMDMode(loc_ref);
+  if (checkRuntimeUninitialized(loc_ref)) {
+    ASSERT0(LT_FUSSY, IsSPMD, "Expected SPMD mode with uninitialized runtime.");
+    __kmpc_barrier_simple_spmd(loc_ref, tid);
+    return;
+  }
+
+  tid = GetLogicalThreadIdInBlock(IsSPMD);
+  const int NumOmpThreads = GetNumberOfOmpThreads(IsSPMD);
+  if (NumOmpThreads > 1 && IsSPMD) {
+    // SPMD mode: use the simple block-wide barrier.
+    __kmpc_barrier_simple_spmd(loc_ref, tid);
+  } else if (NumOmpThreads > 1) {
+    // Non-SPMD mode: the #threads parameter of the named barrier must be
+    // rounded up to the WARPSIZE.
+    const int RoundedThreads =
+        WARPSIZE * ((NumOmpThreads + WARPSIZE - 1) / WARPSIZE);
+    PRINT(LD_SYNC,
+          "call kmpc_barrier with %d omp threads, sync parameter %d\n",
+          (int)NumOmpThreads, (int)RoundedThreads);
+    // Barrier #1 is for synchronization among active threads.
+    named_sync(L1_BARRIER, RoundedThreads);
+  }
+  PRINT0(LD_SYNC, "completed kmpc_barrier\n");
+}
+
+// Emit a simple barrier call in SPMD mode.  Assumes the caller is in an L0
+// parallel region and that all worker threads participate.
+EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid) {
+  PRINT0(LD_SYNC, "call kmpc_barrier_simple_spmd\n");
+  // FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
+  __SYNCTHREADS();
+  PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n");
+}
+
+// Emit a simple barrier call in Generic mode.  Assumes the caller is in an L0
+// parallel region and that all worker threads participate.
+EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid) {
+  // All threads of the block except one warp's worth take part.
+  const int ActiveThreads = GetNumberOfThreadsInBlock() - WARPSIZE;
+  // The #threads parameter must be rounded up to a whole number of warps.
+  const int WarpCount = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
+  const int SyncThreads = WARPSIZE * WarpCount;
+  PRINT(LD_SYNC,
+        "call kmpc_barrier_simple_generic with %d omp threads, sync parameter "
+        "%d\n",
+        (int)ActiveThreads, (int)SyncThreads);
+  // Barrier #1 is for synchronization among active threads.
+  named_sync(L1_BARRIER, SyncThreads);
+  PRINT0(LD_SYNC, "completed kmpc_barrier_simple_generic\n");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP MASTER
+////////////////////////////////////////////////////////////////////////////////
+
+EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid) {
+  PRINT0(LD_IO, "call kmpc_master\n");
+  return IsTeamMaster(global_tid);
+}
+
+EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid) {
+  PRINT0(LD_IO, "call kmpc_end_master\n");
+  ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP SINGLE
+////////////////////////////////////////////////////////////////////////////////
+
+EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid) {
+  PRINT0(LD_IO, "call kmpc_single\n");
+  // decide to implement single with master; master get the single
+  return IsTeamMaster(global_tid);
+}
+
+EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid) {
+  PRINT0(LD_IO, "call kmpc_end_single\n");
+  // decide to implement single with master: master get the single
+  ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here");
+  // sync barrier is explicitly called... so that is not a problem
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Flush
+////////////////////////////////////////////////////////////////////////////////
+
+EXTERN void __kmpc_flush(kmp_Ident *loc) {
+  PRINT0(LD_IO, "call kmpc_flush\n");
+  __threadfence();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Vote
+////////////////////////////////////////////////////////////////////////////////
+
+EXTERN int32_t __kmpc_warp_active_thread_mask() {
+  PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n");
+  return __ACTIVEMASK();
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/src/task.cu b/final/libomptarget/deviceRTLs/nvptx/src/task.cu
new file mode 100644
index 0000000..d618ff1
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/src/task.cu
@@ -0,0 +1,216 @@
+//===------------ task.cu - NVPTX OpenMP tasks support ----------- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Task implementation support.
+//
+//  explicit task structure uses
+//  omptarget_nvptx task
+//  kmp_task
+//
+//  where kmp_task is
+//    - klegacy_TaskDescr    <- task pointer
+//        shared -> X
+//        routine
+//        part_id
+//        descr
+//    -  private (of size given by task_alloc call). Accessed by
+//       task+sizeof(klegacy_TaskDescr)
+//        * private data *
+//    - shared: X. Accessed by shared ptr in klegacy_TaskDescr
+//        * pointer table to shared variables *
+//    - end
+//
+//===----------------------------------------------------------------------===//
+
+#include "omptarget-nvptx.h"
+
+EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(
+    kmp_Ident *loc,     // unused
+    uint32_t global_tid, // unused
+    int32_t flag, // unused (in our impl all tasks are executed immediately)
+    size_t sizeOfTaskInclPrivate, size_t sizeOfSharedTable,
+    kmp_TaskFctPtr taskSub) {
+  PRINT(LD_IO,
+        "call __kmpc_omp_task_alloc(size priv&struct %lld, shared %lld, "
+        "fct 0x%llx)\n",
+        (long long)sizeOfTaskInclPrivate, (long long)sizeOfSharedTable,
+        (unsigned long long)taskSub);
+  // pad task+priv up to a multiple of sizeof(void *)
+  size_t padForTaskInclPriv = PadBytes(sizeOfTaskInclPrivate, sizeof(void *));
+  sizeOfTaskInclPrivate += padForTaskInclPriv;
+  size_t kmpSize = sizeOfTaskInclPrivate + sizeOfSharedTable;
+  ASSERT(LT_FUSSY, sizeof(omptarget_nvptx_TaskDescr) % sizeof(void *) == 0,
+         "need task descr of size %d to be a multiple of %d\n",
+         (int)sizeof(omptarget_nvptx_TaskDescr), (int)sizeof(void *));
+  size_t totSize = sizeof(omptarget_nvptx_TaskDescr) + kmpSize;
+  omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
+      (omptarget_nvptx_ExplicitTaskDescr *)SafeMalloc(
+          totSize, "explicit task descriptor");
+  kmp_TaskDescr *newKmpTaskDescr = &newExplicitTaskDescr->kmpTaskDescr;
+  ASSERT0(LT_FUSSY,
+          (uint64_t)newKmpTaskDescr ==
+              (uint64_t)ADD_BYTES(newExplicitTaskDescr,
+                                  sizeof(omptarget_nvptx_TaskDescr)),
+          "bad size assumptions");
+  // init kmp_TaskDescr; the shared table starts right after padded task+priv
+  newKmpTaskDescr->sharedPointerTable =
+      (void *)((char *)newKmpTaskDescr + sizeOfTaskInclPrivate);
+  newKmpTaskDescr->sub = taskSub;
+  newKmpTaskDescr->destructors = NULL;
+  PRINT(LD_TASK, "return with task descr kmp: 0x%llx, omptarget-nvptx 0x%llx\n",
+        (unsigned long long)newKmpTaskDescr,
+        (unsigned long long)newExplicitTaskDescr);
+
+  return newKmpTaskDescr;
+}
+
+EXTERN int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid,
+                               kmp_TaskDescr *newKmpTaskDescr) {
+  return __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0,
+                                   0);
+}
+
+EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid,
+                                         kmp_TaskDescr *newKmpTaskDescr,
+                                         int32_t depNum, void *depList,
+                                         int32_t noAliasDepNum,
+                                         void *noAliasDepList) {
+  PRINT(LD_IO, "call to __kmpc_omp_task_with_deps(task 0x%llx)\n",
+        P64(newKmpTaskDescr));
+  ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
+          "Runtime must be initialized.");
+  // 1. get explicit task descr from kmp task descr
+  omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
+      (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
+          newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr));
+  ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr,
+          "bad assumptions");
+  omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr;
+  ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr,
+          "bad assumptions");
+
+  // 2. push new context: update new task descriptor
+  int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
+  omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
+  newTaskDescr->CopyForExplicitTask(parentTaskDescr);
+  // set new task descriptor as top
+  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr);
+
+  // 3. call sub
+  PRINT(LD_TASK, "call task sub 0x%llx(task descr 0x%llx)\n",
+        (unsigned long long)newKmpTaskDescr->sub,
+        (unsigned long long)newKmpTaskDescr);
+  newKmpTaskDescr->sub(0, newKmpTaskDescr);
+  PRINT(LD_TASK, "return from call task sub 0x%llx()\n",
+        (unsigned long long)newKmpTaskDescr->sub);
+
+  // 4. pop context
+  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid,
+                                                             parentTaskDescr);
+  // 5. free
+  SafeFree(newExplicitTaskDescr, "explicit task descriptor");
+  return 0;
+}
+
+EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid,
+                                      kmp_TaskDescr *newKmpTaskDescr) {
+  PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n",
+        (unsigned long long)newKmpTaskDescr);
+  ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
+          "Runtime must be initialized.");
+  // 1. get explicit task descr from kmp task descr
+  omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
+      (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
+          newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr));
+  ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr,
+          "bad assumptions");
+  omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr;
+  ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr,
+          "bad assumptions");
+
+  // 2. push new context: update new task descriptor
+  int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
+  omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
+  newTaskDescr->CopyForExplicitTask(parentTaskDescr);
+  // set new task descriptor as top
+  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr);
+  // 3... nothing to call... is inline
+  // 4 & 5 ... done in complete
+}
+
+EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid,
+                                         kmp_TaskDescr *newKmpTaskDescr) {
+  PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n",
+        (unsigned long long)newKmpTaskDescr);
+  ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
+          "Runtime must be initialized.");
+  // 1. get explicit task descr from kmp task descr
+  omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
+      (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
+          newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr));
+  ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr,
+          "bad assumptions");
+  omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr;
+  ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr,
+          "bad assumptions");
+  // 2. get parent
+  omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr();
+  // 3... nothing to call... is inline
+  // 4. pop context
+  int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
+  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid,
+                                                             parentTaskDescr);
+  // 5. free
+  SafeFree(newExplicitTaskDescr, "explicit task descriptor");
+}
+
+EXTERN void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid,
+                                 int32_t depNum, void *depList,
+                                 int32_t noAliasDepNum, void *noAliasDepList) {
+  PRINT0(LD_IO, "call to __kmpc_omp_wait_deps(..)\n");
+  // nothing to do as all our tasks are executed as final
+}
+
+EXTERN void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid) {
+  PRINT0(LD_IO, "call to __kmpc_taskgroup(..)\n");
+  // nothing to do as all our tasks are executed as final
+}
+
+EXTERN void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid) {
+  PRINT0(LD_IO, "call to __kmpc_end_taskgroup(..)\n");
+  // nothing to do as all our tasks are executed as final
+}
+
+EXTERN int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid,
+                                    int end_part) {
+  PRINT0(LD_IO, "call to __kmpc_taskyield()\n");
+  // do nothing: tasks are executed immediately, no yielding allowed
+  return 0;
+}
+
+EXTERN int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid) {
+  PRINT0(LD_IO, "call to __kmpc_taskwait()\n");
+  // nothing to do as all our tasks are executed as final
+  return 0;
+}
+
+EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid,
+                            kmp_TaskDescr *newKmpTaskDescr, int if_val,
+                            uint64_t *lb, uint64_t *ub, int64_t st, int nogroup,
+                            int32_t sched, uint64_t grainsize, void *task_dup) {
+  // Empty iteration space: the task never runs, so release its descriptor
+  // here (the normal path frees it in __kmpc_omp_task_with_deps).
+  if (*lb > *ub) {
+    SafeFree(SUB_BYTES(newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)),
+             "explicit task descriptor");
+    return;
+  }
+  // The compiler already stored lb and ub in the kmp_TaskDescr, and a single
+  // task executes the whole loop, so the initial task_t is left untouched.
+  __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, 0);
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/test/CMakeLists.txt b/final/libomptarget/deviceRTLs/nvptx/test/CMakeLists.txt
new file mode 100644
index 0000000..33945d1
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/test/CMakeLists.txt
@@ -0,0 +1,26 @@
+if(NOT OPENMP_TEST_COMPILER_ID STREQUAL "Clang")
+  # Only a Clang test compiler is supported; skip quietly for anything else.
+  return()
+endif()
+
+set(deps omptarget-nvptx omptarget omp)
+if(LIBOMPTARGET_NVPTX_ENABLE_BCLIB)
+  list(APPEND deps omptarget-nvptx-bc)
+endif()
+
+# Keep the check target out of the default build.
+set(EXCLUDE_FROM_ALL True)
+# Pass -j1 so that only one application is launched on the GPU at a time.
+add_openmp_testsuite(check-libomptarget-nvptx
+    "Running libomptarget-nvptx tests" ${CMAKE_CURRENT_BINARY_DIR}
+    DEPENDS ${deps} ARGS -j1)
+
+set(LIBOMPTARGET_NVPTX_TEST_FLAGS "" CACHE STRING
+    "Extra compiler flags to send to the test compiler.")
+set(LIBOMPTARGET_NVPTX_TEST_OPENMP_FLAGS
+    "-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda" CACHE STRING
+    "OpenMP compiler flags to use for testing libomptarget-nvptx.")
+
+# Generate lit.site.cfg in the build directory from lit.site.cfg.in.
+set(AUTO_GEN_COMMENT "## Autogenerated by libomptarget-nvptx configuration.\n# Do not edit!")
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.in ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg @ONLY)
diff --git a/final/libomptarget/deviceRTLs/nvptx/test/api/ignored.c b/final/libomptarget/deviceRTLs/nvptx/test/api/ignored.c
new file mode 100644
index 0000000..1fa9ae0
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/test/api/ignored.c
@@ -0,0 +1,38 @@
+// RUN: %compile-run-and-check
+
+#include <omp.h>
+#include <stdio.h>
+
+const int MaxThreads = 1024;
+
+int main(int argc, char *argv[]) {
+  int cancellation = -1, dynamic = -1, nested = -1, maxActiveLevels = -1;
+
+  #pragma omp target map(cancellation, dynamic, nested, maxActiveLevels)
+  {
+    // libomptarget-nvptx doesn't support cancellation.
+    cancellation = omp_get_cancellation();
+
+    // No support for dynamic adjustment of the number of threads.
+    omp_set_dynamic(1);
+    dynamic = omp_get_dynamic();
+
+    // libomptarget-nvptx doesn't support nested parallelism.
+    omp_set_nested(1);
+    nested = omp_get_nested();
+
+    omp_set_max_active_levels(42);
+    maxActiveLevels = omp_get_max_active_levels();
+  }
+
+  // CHECK: cancellation = 0
+  printf("cancellation = %d\n", cancellation);
+  // CHECK: dynamic = 0
+  printf("dynamic = %d\n", dynamic);
+  // CHECK: nested = 0
+  printf("nested = %d\n", nested);
+  // CHECK: maxActiveLevels = 1
+  printf("maxActiveLevels = %d\n", maxActiveLevels);
+
+  return 0;
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/test/api/max_threads.c b/final/libomptarget/deviceRTLs/nvptx/test/api/max_threads.c
new file mode 100644
index 0000000..d0d9f31
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/test/api/max_threads.c
@@ -0,0 +1,46 @@
+// RUN: %compile-run-and-check
+
+#include <omp.h>
+#include <stdio.h>
+
+int main(int argc, char *argv[]) {
+  int MaxThreadsL1 = -1, MaxThreadsL2 = -1;
+
+#pragma omp declare reduction(unique:int                                       \
+                              : omp_out = (omp_in == 1 ? omp_in : omp_out))    \
+    initializer(omp_priv = -1)
+
+  // Non-SPMD mode.
+#pragma omp target teams map(MaxThreadsL1, MaxThreadsL2) thread_limit(32)      \
+    num_teams(1)
+  {
+    MaxThreadsL1 = omp_get_max_threads();
+#pragma omp parallel reduction(unique : MaxThreadsL2)
+    { MaxThreadsL2 = omp_get_max_threads(); }
+  }
+
+  // CHECK: Non-SPMD MaxThreadsL1 = 32
+  printf("Non-SPMD MaxThreadsL1 = %d\n", MaxThreadsL1);
+  // CHECK: Non-SPMD MaxThreadsL2 = 1
+  printf("Non-SPMD MaxThreadsL2 = %d\n", MaxThreadsL2);
+
+  // SPMD mode with full runtime
+  MaxThreadsL2 = -1;
+#pragma omp target parallel reduction(unique : MaxThreadsL2)
+  { MaxThreadsL2 = omp_get_max_threads(); }
+
+  // CHECK: SPMD with full runtime MaxThreadsL2 = 1
+  printf("SPMD with full runtime MaxThreadsL2 = %d\n", MaxThreadsL2);
+
+  // SPMD mode without runtime
+  MaxThreadsL2 = -1;
+#pragma omp target parallel for reduction(unique : MaxThreadsL2)
+  for (int I = 0; I < 2; ++I) {
+    MaxThreadsL2 = omp_get_max_threads();
+  }
+
+  // CHECK: SPMD without runtime MaxThreadsL2 = 1
+  printf("SPMD without runtime MaxThreadsL2 = %d\n", MaxThreadsL2);
+
+  return 0;
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/test/api/thread_limit.c b/final/libomptarget/deviceRTLs/nvptx/test/api/thread_limit.c
new file mode 100644
index 0000000..626d620
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/test/api/thread_limit.c
@@ -0,0 +1,72 @@
+// RUN: %compile-run-and-check
+
+#include <omp.h>
+#include <stdio.h>
+
+int main(int argc, char *argv[]) {
+  int ThreadLimitL0 = -1, ThreadLimitL1 = -1, ThreadLimitL2 = -1;
+  // 'uniqueN' reductions: the result becomes N once any combined value is N.
+#pragma omp declare reduction(unique64:int                                     \
+                              : omp_out = (omp_in == 64 ? omp_in : omp_out))   \
+    initializer(omp_priv = -1)
+#pragma omp declare reduction(unique32:int                                     \
+                              : omp_out = (omp_in == 32 ? omp_in : omp_out))   \
+    initializer(omp_priv = -1)
+
+  // Non-SPMD mode: parallel regions nested inside a target teams region.
+#pragma omp target teams map(ThreadLimitL0, ThreadLimitL1, ThreadLimitL2)      \
+    thread_limit(64) num_teams(1)
+  {
+    ThreadLimitL0 = omp_get_thread_limit();
+#pragma omp parallel reduction(unique64                                        \
+                               : ThreadLimitL1, ThreadLimitL2) num_threads(32)
+    {
+      ThreadLimitL1 = omp_get_thread_limit();
+#pragma omp parallel reduction(unique64 : ThreadLimitL2)
+      { ThreadLimitL2 = omp_get_thread_limit(); }
+    }
+  }
+
+  // CHECK: Non-SPMD ThreadLimitL0 = 64
+  printf("Non-SPMD ThreadLimitL0 = %d\n", ThreadLimitL0);
+  // CHECK: Non-SPMD ThreadLimitL1 = 64
+  printf("Non-SPMD ThreadLimitL1 = %d\n", ThreadLimitL1);
+  // CHECK: Non-SPMD ThreadLimitL2 = 64
+  printf("Non-SPMD ThreadLimitL2 = %d\n", ThreadLimitL2);
+
+  // SPMD mode with full runtime (combined 'target parallel' construct).
+  ThreadLimitL1 = -1;
+  ThreadLimitL2 = -1;
+#pragma omp target parallel reduction(unique32                                 \
+                                      : ThreadLimitL1, ThreadLimitL2)          \
+    num_threads(32)
+  {
+    ThreadLimitL1 = omp_get_thread_limit();
+#pragma omp parallel reduction(unique32 : ThreadLimitL2)
+    { ThreadLimitL2 = omp_get_thread_limit(); }
+  }
+
+  // CHECK: SPMD with full runtime ThreadLimitL1 = 32
+  printf("SPMD with full runtime ThreadLimitL1 = %d\n", ThreadLimitL1);
+  // CHECK: SPMD with full runtime ThreadLimitL2 = 32
+  printf("SPMD with full runtime ThreadLimitL2 = %d\n", ThreadLimitL2);
+
+  // SPMD mode without runtime (combined 'target parallel for' construct).
+  ThreadLimitL1 = -1;
+  ThreadLimitL2 = -1;
+#pragma omp target parallel for reduction(unique32                             \
+                                          : ThreadLimitL1, ThreadLimitL2)      \
+    num_threads(32)
+  for (int I = 0; I < 2; ++I) {
+    ThreadLimitL1 = omp_get_thread_limit();
+#pragma omp parallel reduction(unique32 : ThreadLimitL2)
+    { ThreadLimitL2 = omp_get_thread_limit(); }
+  }
+
+  // CHECK: SPMD without runtime ThreadLimitL1 = 32
+  printf("SPMD without runtime ThreadLimitL1 = %d\n", ThreadLimitL1);
+  // CHECK: SPMD without runtime ThreadLimitL2 = 32
+  printf("SPMD without runtime ThreadLimitL2 = %d\n", ThreadLimitL2);
+
+  return 0;
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/test/data_sharing/alignment.c b/final/libomptarget/deviceRTLs/nvptx/test/data_sharing/alignment.c
new file mode 100644
index 0000000..dd17ae7
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/test/data_sharing/alignment.c
@@ -0,0 +1,55 @@
+// RUN: %compile-run-and-check
+
+#include <omp.h>
+#include <stdio.h>
+
+#pragma omp declare target
+static void putValueInParallel(int *ptr, int value) {
+  #pragma omp parallel
+  {
+    *ptr = value; // Every thread of the region stores the same value to *ptr.
+  }
+}
+
+static int getId() {
+  int id; // Local whose address is handed to a parallel region for writing.
+  putValueInParallel(&id, omp_get_thread_num());
+  return id; // The caller's thread number, read back through the local.
+}
+#pragma omp end declare target
+
+const int MaxThreads = 1024;
+const int Threads = 64;
+
+int main(int argc, char *argv[]) {
+  int master;
+  int check[MaxThreads];
+  for (int i = 0; i < MaxThreads; i++) {
+    check[i] = 0;
+  }
+  // Call getId() both outside and inside a parallel region on the target.
+  #pragma omp target map(master, check[:])
+  {
+    master = getId();
+
+    #pragma omp parallel num_threads(Threads)
+    {
+      check[omp_get_thread_num()] = getId();
+    }
+  }
+
+  // CHECK: master = 0.
+  printf("master = %d.\n", master);
+  // CHECK-NOT: invalid
+  for (int i = 0; i < MaxThreads; i++) {
+    if (i < Threads) {
+      if (check[i] != i) { // Each participating thread must report its own id.
+        printf("invalid: check[%d] should be %d, is %d\n", i, i, check[i]);
+      }
+    } else if (check[i] != 0) { // Slots beyond Threads must stay untouched.
+      printf("invalid: check[%d] should be 0, is %d\n", i, check[i]);
+    }
+  }
+
+  return 0;
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/test/lit.cfg b/final/libomptarget/deviceRTLs/nvptx/test/lit.cfg
new file mode 100644
index 0000000..0774c25
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/test/lit.cfg
@@ -0,0 +1,69 @@
+# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79:
+# Configuration file for the 'lit' test runner.
+
+import os
+import lit.formats
+
+# Tell pylint that we know config and lit_config exist somewhere.
+if 'PYLINT_IMPORT' in os.environ:
+    config = object()
+    lit_config = object()
+
+def prepend_library_path(name, value, sep):
+    """Prepend value to config.environment[name]; empty values are ignored."""
+    if value:
+        old = config.environment.get(name)
+        config.environment[name] = value if old is None else value + sep + old
+
+# name: The name of this test suite.
+config.name = 'libomptarget-nvptx'
+
+# suffixes: A list of file extensions to treat as test files.
+config.suffixes = ['.c', '.cpp', '.cc']
+
+# test_source_root: The root path where tests are located.
+config.test_source_root = os.path.dirname(__file__)
+
+# test_exec_root: The root object directory where output is placed.
+config.test_exec_root = config.binary_dir
+
+# test_format: Run every test as a shell script (lit's ShTest format).
+config.test_format = lit.formats.ShTest()
+
+# test_flags: Compiler flags locating the OpenMP header and the built libraries.
+config.test_flags = " -I " + config.omp_header_directory + \
+    " -L " + config.library_dir + \
+    " --libomptarget-nvptx-path=" + config.library_dir
+# The host OpenMP runtime directory is optional; add it only when it is set.
+if config.omp_host_rtl_directory:
+    config.test_flags = config.test_flags + \
+        " -L " + config.omp_host_rtl_directory
+
+config.test_flags = config.test_flags + " " + config.test_extra_flags
+
+# Set up the environment so the dynamic libraries are found at runtime.
+prepend_library_path('LD_LIBRARY_PATH', config.library_dir, ":")
+prepend_library_path('LD_LIBRARY_PATH', config.omp_host_rtl_directory, ":")
+
+# Forbid fallback to host.
+config.environment["OMP_TARGET_OFFLOAD"] = "MANDATORY"
+
+# substitutions: composite patterns come first and expand to the later ones.
+config.substitutions.append(("%compilexx-run-and-check",
+    "%compilexx-and-run | " + config.libomptarget_filecheck + " %s"))
+config.substitutions.append(("%compile-run-and-check",
+    "%compile-and-run | " + config.libomptarget_filecheck + " %s"))
+config.substitutions.append(("%compilexx-and-run", "%compilexx && %run"))
+config.substitutions.append(("%compile-and-run", "%compile && %run"))
+
+config.substitutions.append(("%compilexx",
+    "%clangxx %openmp_flags %flags %s -o %t"))
+config.substitutions.append(("%compile",
+    "%clang %openmp_flags %flags %s -o %t"))
+
+config.substitutions.append(("%clangxx", config.test_cxx_compiler))
+config.substitutions.append(("%clang", config.test_c_compiler))
+config.substitutions.append(("%openmp_flags", config.test_openmp_flags))
+config.substitutions.append(("%flags", config.test_flags))
+# %run executes the binary that %compile / %compilexx wrote to %t.
+config.substitutions.append(("%run", "%t"))
diff --git a/final/libomptarget/deviceRTLs/nvptx/test/lit.site.cfg.in b/final/libomptarget/deviceRTLs/nvptx/test/lit.site.cfg.in
new file mode 100644
index 0000000..d9c14cb
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/test/lit.site.cfg.in
@@ -0,0 +1,14 @@
+@AUTO_GEN_COMMENT@
+# NOTE(review): the @VAR@ placeholders are presumably filled in by CMake.
+config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@"
+config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@"
+config.test_openmp_flags = "@LIBOMPTARGET_NVPTX_TEST_OPENMP_FLAGS@"
+config.test_extra_flags = "@LIBOMPTARGET_NVPTX_TEST_FLAGS@"
+config.binary_dir = "@CMAKE_CURRENT_BINARY_DIR@"
+config.library_dir = "@LIBOMPTARGET_LIBRARY_DIR@"
+config.omp_header_directory = "@LIBOMPTARGET_OPENMP_HEADER_FOLDER@"
+config.omp_host_rtl_directory = "@LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER@"
+config.libomptarget_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@"
+
+# Let the main config (lit.cfg in the source directory) do the real work.
+lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg")
diff --git a/final/libomptarget/deviceRTLs/nvptx/test/parallel/flush.c b/final/libomptarget/deviceRTLs/nvptx/test/parallel/flush.c
new file mode 100644
index 0000000..412538b
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/test/parallel/flush.c
@@ -0,0 +1,35 @@
+// RUN: %compile-run-and-check
+
+#include <omp.h>
+#include <stdio.h>
+
+int main(int argc, char *argv[]) {
+  int data, out, flag = 0;
+#pragma omp target parallel num_threads(64) map(tofrom                         \
+                                                : out, flag) map(to            \
+                                                                 : data)
+  {
+    if (omp_get_thread_num() == 0) {
+      /* Write to the data buffer that will be read by thread 32 */
+      data = 42;
+/* Flush data to thread 32 */
+#pragma omp flush(data)
+      /* Set flag to release thread 32 */
+#pragma omp atomic write
+      flag = 1;
+    } else if (omp_get_thread_num() == 32) {
+      /* Loop until we see the update to the flag */
+      int val;
+      do {
+#pragma omp atomic read
+        val = flag;
+      } while (val < 1);
+      out = data; /* Publish the value observed by thread 32 */
+#pragma omp flush(out)
+    }
+  }
+  // CHECK: out=42.
+  /* Value of out will be 42 */
+  printf("out=%d.\n", out);
+  return !(out == 42); /* Exit status 0 only when 42 was observed */
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/test/parallel/level.c b/final/libomptarget/deviceRTLs/nvptx/test/parallel/level.c
new file mode 100644
index 0000000..edb00e0
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/test/parallel/level.c
@@ -0,0 +1,139 @@
+// RUN: %compile-run-and-check
+
+#include <omp.h>
+#include <stdio.h>
+
+const int MaxThreads = 1024;
+const int NumThreads = 64;
+
+int main(int argc, char *argv[]) {
+  int level = -1, activeLevel = -1;
+  // The expected value is -1, initialize to different value.
+  int ancestorTNumNeg = 1, teamSizeNeg = 1;
+  int ancestorTNum0 = -1, teamSize0 = -1;
+  // The expected value is -1, initialize to different value.
+  int ancestorTNum1 = 1, teamSize1 = 1;
+  int check1[MaxThreads];
+  int check2[MaxThreads];
+  int check3[MaxThreads];
+  int check4[MaxThreads];
+  for (int i = 0; i < MaxThreads; i++) {
+    check1[i] = check2[i] = check3[i] = check4[i] = 0;
+  }
+
+  #pragma omp target map(level, activeLevel, ancestorTNumNeg, teamSizeNeg) \
+                     map(ancestorTNum0, teamSize0, ancestorTNum1, teamSize1) \
+                     map(check1[:], check2[:], check3[:], check4[:])
+  {
+    level = omp_get_level();
+    activeLevel = omp_get_active_level();
+
+    // Expected to return -1.
+    ancestorTNumNeg = omp_get_ancestor_thread_num(-1);
+    teamSizeNeg = omp_get_team_size(-1);
+
+    // Expected to return 0 and 1.
+    ancestorTNum0 = omp_get_ancestor_thread_num(0);
+    teamSize0 = omp_get_team_size(0);
+
+    // Expected to return -1 because the requested level is larger than
+    // the nest level.
+    ancestorTNum1 = omp_get_ancestor_thread_num(1);
+    teamSize1 = omp_get_team_size(1);
+
+    // Expecting active parallel region.
+    #pragma omp parallel num_threads(NumThreads)
+    {
+      int id = omp_get_thread_num();
+      // Multiply return value of omp_get_level by 5 to avoid that this test
+      // passes if both API calls return wrong values.
+      check1[id] += omp_get_level() * 5 + omp_get_active_level();
+
+      // Expected to return 0 and 1.
+      check2[id] += omp_get_ancestor_thread_num(0) + 5 * omp_get_team_size(0);
+      // Expected to return the current thread num.
+      check2[id] += (omp_get_ancestor_thread_num(1) - id);
+      // Expected to return the current number of threads.
+      check2[id] += 3 * omp_get_team_size(1);
+      // Expected to return -1, see above.
+      check2[id] += omp_get_ancestor_thread_num(2) + omp_get_team_size(2);
+
+      // Expecting serialized parallel region.
+      #pragma omp parallel
+      {
+        #pragma omp atomic
+        check3[id] += omp_get_level() * 5 + omp_get_active_level();
+
+        // Expected to return 0 and 1.
+        int check4Inc = omp_get_ancestor_thread_num(0) + 5 * omp_get_team_size(0);
+        // Expected to return the parent thread num.
+        check4Inc += (omp_get_ancestor_thread_num(1) - id);
+        // Expected to return the number of threads in the active parallel region.
+        check4Inc += 3 * omp_get_team_size(1);
+        // Expected to return 0 and 1.
+        check4Inc += omp_get_ancestor_thread_num(2) + 3 * omp_get_team_size(2);
+        // Expected to return -1, see above.
+        check4Inc += omp_get_ancestor_thread_num(3) + omp_get_team_size(3);
+
+        #pragma omp atomic
+        check4[id] += check4Inc;
+      }
+    }
+  }
+
+  // CHECK: target: level = 0, activeLevel = 0
+  printf("target: level = %d, activeLevel = %d\n", level, activeLevel);
+  // CHECK: level = -1: ancestorTNum = -1, teamSize = -1
+  printf("level = -1: ancestorTNum = %d, teamSize = %d\n", ancestorTNumNeg, teamSizeNeg);
+  // CHECK: level = 0: ancestorTNum = 0, teamSize = 1
+  printf("level = 0: ancestorTNum = %d, teamSize = %d\n", ancestorTNum0, teamSize0);
+  // CHECK: level = 1: ancestorTNum = -1, teamSize = -1
+  printf("level = 1: ancestorTNum = %d, teamSize = %d\n", ancestorTNum1, teamSize1);
+
+  // CHECK-NOT: invalid
+  for (int i = 0; i < MaxThreads; i++) {
+    // Check active parallel region:
+    // omp_get_level() = 1, omp_get_active_level() = 1
+    const int Expected1 = 6;
+    if (i < NumThreads) {
+      if (check1[i] != Expected1) {
+        printf("invalid: check1[%d] should be %d, is %d\n", i, Expected1, check1[i]);
+      }
+    } else if (check1[i] != 0) {
+      printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
+    }
+
+    // 5 * 1 + 3 * 64 - 1 - 1 (see above)
+    const int Expected2 = 195;
+    if (i < NumThreads) {
+      if (check2[i] != Expected2) {
+        printf("invalid: check2[%d] should be %d, is %d\n", i, Expected2, check2[i]);
+      }
+    } else if (check2[i] != 0) {
+      printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
+    }
+
+    // Check serialized parallel region:
+    // omp_get_level() = 2, omp_get_active_level() = 1
+    const int Expected3 = 11;
+    if (i < NumThreads) {
+      if (check3[i] != Expected3) {
+        printf("invalid: check3[%d] should be %d, is %d\n", i, Expected3, check3[i]);
+      }
+    } else if (check3[i] != 0) {
+      printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]);
+    }
+
+    // 5 * 1 + 3 * 64 + 3 * 1 - 1 - 1 (see above)
+    const int Expected4 = 198;
+    if (i < NumThreads) {
+      if (check4[i] != Expected4) {
+        printf("invalid: check4[%d] should be %d, is %d\n", i, Expected4, check4[i]);
+      }
+    } else if (check4[i] != 0) {
+      printf("invalid: check4[%d] should be 0, is %d\n", i, check4[i]);
+    }
+  }
+
+  return 0;
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c b/final/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c
new file mode 100644
index 0000000..70ebb1d
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c
@@ -0,0 +1,136 @@
+// RUN: %compile-run-and-check
+
+#include <omp.h>
+#include <stdio.h>
+
+const int MaxThreads = 1024;
+const int NumThreads = 64;
+const int NumThreads1 = 1;
+
+int main(int argc, char *argv[]) {
+  int inParallel = -1, numThreads = -1, threadNum = -1;
+  int check1[MaxThreads];
+  int check2[MaxThreads];
+  for (int i = 0; i < MaxThreads; i++) {
+    check1[i] = check2[i] = 0;
+  }
+
+#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:])
+  {
+    inParallel = omp_in_parallel();
+    numThreads = omp_get_num_threads();
+    threadNum = omp_get_thread_num();
+
+// Expecting active parallel region.
+#pragma omp parallel num_threads(NumThreads)
+    {
+      int id = omp_get_thread_num();
+      check1[id] += omp_get_num_threads() + omp_in_parallel();
+
+// Expecting serialized parallel region.
+#pragma omp parallel
+      {
+        // Expected to be 1.
+        int nestedInParallel = omp_in_parallel();
+        // Expected to be 1.
+        int nestedNumThreads = omp_get_num_threads();
+        // Expected to be 0.
+        int nestedThreadNum = omp_get_thread_num();
+#pragma omp atomic
+        check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum;
+      }
+    }
+  }
+
+  // CHECK: target: inParallel = 0, numThreads = 1, threadNum = 0
+  printf("target: inParallel = %d, numThreads = %d, threadNum = %d\n",
+         inParallel, numThreads, threadNum);
+
+  // CHECK-NOT: invalid
+  for (int i = 0; i < MaxThreads; i++) {
+    // Check that all threads reported
+    // omp_get_num_threads() = 64, omp_in_parallel() = 1.
+    int Expected = NumThreads + 1;
+    if (i < NumThreads) {
+      if (check1[i] != Expected) {
+        printf("invalid: check1[%d] should be %d, is %d\n", i, Expected,
+               check1[i]);
+      }
+    } else if (check1[i] != 0) {
+      printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
+    }
+
+    // Check serialized parallel region.
+    if (i < NumThreads) {
+      if (check2[i] != 2) {
+        printf("invalid: check2[%d] should be 2, is %d\n", i, check2[i]);
+      }
+    } else if (check2[i] != 0) {
+      printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
+    }
+  }
+
+  inParallel = -1;
+  numThreads = -1;
+  threadNum = -1;
+  for (int i = 0; i < MaxThreads; i++) {
+    check1[i] = check2[i] = 0;
+  }
+
+#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:])
+  {
+    inParallel = omp_in_parallel();
+    numThreads = omp_get_num_threads();
+    threadNum = omp_get_thread_num();
+
+// Expecting inactive parallel region (team of a single thread).
+#pragma omp parallel num_threads(NumThreads1)
+    {
+      int id = omp_get_thread_num();
+      check1[id] += omp_get_num_threads() + omp_in_parallel();
+
+// Expecting serialized parallel region.
+#pragma omp parallel
+      {
+        // Expected to be 0.
+        int nestedInParallel = omp_in_parallel();
+        // Expected to be 1.
+        int nestedNumThreads = omp_get_num_threads();
+        // Expected to be 0.
+        int nestedThreadNum = omp_get_thread_num();
+#pragma omp atomic
+        check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum;
+      }
+    }
+  }
+
+  // CHECK: target: inParallel = 0, numThreads = 1, threadNum = 0
+  printf("target: inParallel = %d, numThreads = %d, threadNum = %d\n",
+         inParallel, numThreads, threadNum);
+
+  // CHECK-NOT: invalid
+  for (int i = 0; i < MaxThreads; i++) {
+    // Check that all threads reported
+    // omp_get_num_threads() = 1, omp_in_parallel() = 0.
+    int Expected = 1;
+    if (i < NumThreads1) {
+      if (check1[i] != Expected) {
+        printf("invalid: check1[%d] should be %d, is %d\n", i, Expected,
+               check1[i]);
+      }
+    } else if (check1[i] != 0) {
+      printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
+    }
+
+    // Check serialized parallel region.
+    if (i < NumThreads1) {
+      if (check2[i] != 1) {
+        printf("invalid: check2[%d] should be 1, is %d\n", i, check2[i]);
+      }
+    } else if (check2[i] != 0) {
+      printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
+    }
+  }
+
+  return 0;
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c b/final/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c
new file mode 100644
index 0000000..4a2f73f
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c
@@ -0,0 +1,102 @@
+// RUN: %compile-run-and-check
+
+#include <stdio.h>
+#include <omp.h>
+
+const int WarpSize = 32;
+const int NumThreads1 = 1 * WarpSize;
+const int NumThreads2 = 2 * WarpSize;
+const int NumThreads3 = 3 * WarpSize;
+const int MaxThreads = 1024;
+
+int main(int argc, char *argv[]) {
+  int check1[MaxThreads];
+  int check2[MaxThreads];
+  int check3[MaxThreads];
+  int check4[MaxThreads];
+  for (int i = 0; i < MaxThreads; i++) {
+    check1[i] = check2[i] = check3[i] = check4[i] = 0;
+  }
+
+  int maxThreads1 = -1;
+  int maxThreads2 = -1;
+  int maxThreads3 = -1;
+
+  #pragma omp target map(check1[:], check2[:], check3[:], check4[:]) \
+                     map(maxThreads1, maxThreads2, maxThreads3)
+  {
+    #pragma omp parallel num_threads(NumThreads1)
+    {
+      check1[omp_get_thread_num()] += omp_get_num_threads();
+    }
+
+    // API method to set number of threads in parallel regions without
+    // num_threads() clause.
+    omp_set_num_threads(NumThreads2);
+    maxThreads1 = omp_get_max_threads();
+    #pragma omp parallel
+    {
+      check2[omp_get_thread_num()] += omp_get_num_threads();
+    }
+
+    maxThreads2 = omp_get_max_threads(); // Expected unchanged after the region.
+
+    // num_threads() clause should override nthreads-var ICV.
+    #pragma omp parallel num_threads(NumThreads3)
+    {
+      check3[omp_get_thread_num()] += omp_get_num_threads();
+    }
+
+    maxThreads3 = omp_get_max_threads(); // Expected unaffected by num_threads().
+
+    // Effect from omp_set_num_threads() should still be visible.
+    #pragma omp parallel
+    {
+      check4[omp_get_thread_num()] += omp_get_num_threads();
+    }
+  }
+
+  // CHECK: maxThreads1 = 64
+  printf("maxThreads1 = %d\n", maxThreads1);
+  // CHECK: maxThreads2 = 64
+  printf("maxThreads2 = %d\n", maxThreads2);
+  // CHECK: maxThreads3 = 64
+  printf("maxThreads3 = %d\n", maxThreads3);
+
+  // CHECK-NOT: invalid
+  for (int i = 0; i < MaxThreads; i++) {
+    if (i < NumThreads1) {
+      if (check1[i] != NumThreads1) {
+        printf("invalid: check1[%d] should be %d, is %d\n", i, NumThreads1, check1[i]);
+      }
+    } else if (check1[i] != 0) {
+      printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
+    }
+
+    if (i < NumThreads2) {
+      if (check2[i] != NumThreads2) {
+        printf("invalid: check2[%d] should be %d, is %d\n", i, NumThreads2, check2[i]);
+      }
+    } else if (check2[i] != 0) {
+      printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
+    }
+
+    if (i < NumThreads3) {
+      if (check3[i] != NumThreads3) {
+        printf("invalid: check3[%d] should be %d, is %d\n", i, NumThreads3, check3[i]);
+      }
+    } else if (check3[i] != 0) {
+      printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]);
+    }
+
+    if (i < NumThreads2) {
+      if (check4[i] != NumThreads2) {
+        printf("invalid: check4[%d] should be %d, is %d\n", i, NumThreads2, check4[i]);
+      }
+    } else if (check4[i] != 0) {
+      printf("invalid: check4[%d] should be 0, is %d\n", i, check4[i]);
+    }
+  }
+
+  return 0;
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp b/final/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp
new file mode 100644
index 0000000..517db59
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp
@@ -0,0 +1,51 @@
+// RUN: %compilexx-run-and-check
+
+#include <stdio.h>
+#include <omp.h>
+
+int main(void) {
+  int isHost = -1;
+  int ParallelLevel1 = -1, ParallelLevel2 = -1;
+  int Count = 0;
+  // NOTE(review): expected values presumably assume >= 10 device threads.
+#pragma omp target parallel for map(tofrom                                     \
+                                    : isHost, ParallelLevel1, ParallelLevel2), reduction(+: Count) schedule(static, 1)
+  for (int J = 0; J < 10; ++J) {
+#pragma omp critical
+    {
+      isHost = (isHost < 0 || isHost == 0) ? omp_is_initial_device() : isHost;
+      ParallelLevel1 = (ParallelLevel1 < 0 || ParallelLevel1 == 1)
+                           ? omp_get_level()
+                           : ParallelLevel1;
+    }
+    if (omp_get_thread_num() > 5) {
+      int L2;
+#pragma omp parallel for schedule(dynamic) lastprivate(L2) reduction(+: Count)
+      for (int I = 0; I < 10; ++I) {
+        L2 = omp_get_level();
+        Count += omp_get_level(); // (10-6)*10*2 = 80
+      }
+#pragma omp critical
+      ParallelLevel2 =
+          (ParallelLevel2 < 0 || ParallelLevel2 == 2) ? L2 : ParallelLevel2;
+    } else {
+      Count += omp_get_level(); // 6 * 1 = 6
+    }
+  }
+
+  if (isHost < 0) {
+    printf("Runtime error, isHost=%d\n", isHost);
+  }
+
+  // CHECK: Target region executed on the device
+  printf("Target region executed on the %s\n", isHost ? "host" : "device");
+  // CHECK: Parallel level in SPMD mode: L1 is 1, L2 is 2
+  printf("Parallel level in SPMD mode: L1 is %d, L2 is %d\n", ParallelLevel1,
+         ParallelLevel2);
+  // Final result of Count is (10-6)(num of loops)*10(num of iterations)*2(par
+  // level) + 6(num of iterations) * 1(par level)
+  // CHECK: Expected count = 86
+  printf("Expected count = %d\n", Count);
+
+  return isHost; // Non-zero exit status unless the region ran on the device.
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/test/parallel/thread_limit.c b/final/libomptarget/deviceRTLs/nvptx/test/parallel/thread_limit.c
new file mode 100644
index 0000000..5e40bb5
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/test/parallel/thread_limit.c
@@ -0,0 +1,77 @@
+// RUN: %compile-run-and-check
+
+#include <stdio.h>
+#include <omp.h>
+
+const int WarpSize = 32;
+const int ThreadLimit = 1 * WarpSize;
+const int NumThreads2 = 2 * WarpSize;
+const int NumThreads3 = 3 * WarpSize;
+const int MaxThreads = 1024;
+
+int main(int argc, char *argv[]) {
+  int check1[MaxThreads];
+  int check2[MaxThreads];
+  int check3[MaxThreads];
+  for (int i = 0; i < MaxThreads; i++) {
+    check1[i] = check2[i] = check3[i] = 0;
+  }
+
+  int threadLimit = -1;
+
+  #pragma omp target teams num_teams(1) thread_limit(ThreadLimit) \
+                           map(check1[:], check2[:], check3[:], threadLimit)
+  {
+    threadLimit = omp_get_thread_limit();
+
+    // All parallel regions should get as many threads as specified by the
+    // thread_limit() clause.
+    #pragma omp parallel
+    {
+      check1[omp_get_thread_num()] += omp_get_num_threads();
+    }
+
+    omp_set_num_threads(NumThreads2); // Above the limit; expected to be capped.
+    #pragma omp parallel
+    {
+      check2[omp_get_thread_num()] += omp_get_num_threads();
+    }
+    // An explicit num_threads() above the limit is expected to be capped too.
+    #pragma omp parallel num_threads(NumThreads3)
+    {
+      check3[omp_get_thread_num()] += omp_get_num_threads();
+    }
+  }
+
+  // CHECK: threadLimit = 32
+  printf("threadLimit = %d\n", threadLimit);
+
+  // CHECK-NOT: invalid
+  for (int i = 0; i < MaxThreads; i++) {
+    if (i < ThreadLimit) {
+      if (check1[i] != ThreadLimit) {
+        printf("invalid: check1[%d] should be %d, is %d\n", i, ThreadLimit, check1[i]);
+      }
+    } else if (check1[i] != 0) {
+      printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
+    }
+
+    if (i < ThreadLimit) {
+      if (check2[i] != ThreadLimit) {
+        printf("invalid: check2[%d] should be %d, is %d\n", i, ThreadLimit, check2[i]);
+      }
+    } else if (check2[i] != 0) {
+      printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
+    }
+
+    if (i < ThreadLimit) {
+      if (check3[i] != ThreadLimit) {
+        printf("invalid: check3[%d] should be %d, is %d\n", i, ThreadLimit, check3[i]);
+      }
+    } else if (check3[i] != 0) {
+      printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]);
+    }
+  }
+
+  return 0;
+}
diff --git a/final/libomptarget/deviceRTLs/nvptx/test/parallel/tripcount.c b/final/libomptarget/deviceRTLs/nvptx/test/parallel/tripcount.c
new file mode 100644
index 0000000..b3f8768
--- /dev/null
+++ b/final/libomptarget/deviceRTLs/nvptx/test/parallel/tripcount.c
@@ -0,0 +1,22 @@
+// RUN: %compile-run-and-check
+
+#include <omp.h>
+#include <stdio.h>
+
+int main() {
+  int res = 0;
+
+#pragma omp parallel num_threads(2) reduction(+:res)
+  {
+    int tid = omp_get_thread_num();
+#pragma omp target teams distribute reduction(+:res)
+    for (int i = tid; i < 2; i++) // Trip count depends on the host thread id.
+      ++res;
+  }
+  // Host thread 0 runs 2 iterations (i = 0, 1), host thread 1 runs 1 (i = 1).
+  // The expected result of the reduction res is therefore 3.
+
+  // CHECK: res = 3.
+  printf("res = %d.\n", res);
+  return 0;
+}
diff --git a/final/libomptarget/include/omptarget.h b/final/libomptarget/include/omptarget.h
new file mode 100644
index 0000000..ff6e85c
--- /dev/null
+++ b/final/libomptarget/include/omptarget.h
@@ -0,0 +1,250 @@
+//===-------- omptarget.h - Target independent OpenMP target RTL -- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Interface to be used by Clang during the codegen of a
+// target region.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OMPTARGET_H_
+#define _OMPTARGET_H_
+
+#include <stdint.h>
+#include <stddef.h>
+
+#define OFFLOAD_SUCCESS (0)
+#define OFFLOAD_FAIL (~0)
+
+#define OFFLOAD_DEVICE_DEFAULT     -1
+#define HOST_DEVICE                -10
+
+/// Data attributes for each data reference used in an OpenMP target region.
+enum tgt_map_type {
+  // No flags
+  OMP_TGT_MAPTYPE_NONE            = 0x000,
+  // copy data from host to device
+  OMP_TGT_MAPTYPE_TO              = 0x001,
+  // copy data from device to host
+  OMP_TGT_MAPTYPE_FROM            = 0x002,
+  // copy regardless of the reference count
+  OMP_TGT_MAPTYPE_ALWAYS          = 0x004,
+  // force unmapping of data
+  OMP_TGT_MAPTYPE_DELETE          = 0x008,
+  // map the pointer as well as the pointee
+  OMP_TGT_MAPTYPE_PTR_AND_OBJ     = 0x010,
+  // pass device base address to kernel
+  OMP_TGT_MAPTYPE_TARGET_PARAM    = 0x020,
+  // return base device address of mapped data
+  OMP_TGT_MAPTYPE_RETURN_PARAM    = 0x040,
+  // private variable - not mapped
+  OMP_TGT_MAPTYPE_PRIVATE         = 0x080,
+  // copy by value - not mapped
+  OMP_TGT_MAPTYPE_LITERAL         = 0x100,
+  // mapping is implicit
+  OMP_TGT_MAPTYPE_IMPLICIT        = 0x200,
+  // member of struct, member given by [16 MSBs] - 1
+  OMP_TGT_MAPTYPE_MEMBER_OF       = 0xffff000000000000
+};
+
+enum OpenMPOffloadingDeclareTargetFlags {
+  /// Mark the entry as having a 'link' attribute.
+  OMP_DECLARE_TARGET_LINK = 0x01,
+  /// Mark the entry as being a global constructor.
+  OMP_DECLARE_TARGET_CTOR = 0x02,
+  /// Mark the entry as being a global destructor.
+  OMP_DECLARE_TARGET_DTOR = 0x04
+};
+
+enum OpenMPOffloadingRequiresDirFlags {
+  /// flag undefined.
+  OMP_REQ_UNDEFINED               = 0x000,
+  /// no requires directive present.
+  OMP_REQ_NONE                    = 0x001,
+  /// reverse_offload clause.
+  OMP_REQ_REVERSE_OFFLOAD         = 0x002,
+  /// unified_address clause.
+  OMP_REQ_UNIFIED_ADDRESS         = 0x004,
+  /// unified_shared_memory clause.
+  OMP_REQ_UNIFIED_SHARED_MEMORY   = 0x008,
+  /// dynamic_allocators clause.
+  OMP_REQ_DYNAMIC_ALLOCATORS      = 0x010
+};
+
+/// This struct is a record of an entry point or global. For a function
+/// entry point the size is expected to be zero
+struct __tgt_offload_entry {
+  void *addr;   // Pointer to the offload entry info (function or global)
+  char *name;   // Name of the function or global
+  size_t size;  // Size of the entry info (0 if it is a function)
+  int32_t flags; // Flags associated with the entry, e.g. 'link'.
+  int32_t reserved; // Reserved, to be used by the runtime library.
+};
+
+/// This struct is a record of the device image information
+struct __tgt_device_image {
+  void *ImageStart;                  // Pointer to the target code start
+  void *ImageEnd;                    // Pointer to the target code end
+  __tgt_offload_entry *EntriesBegin; // Begin of table with all target entries
+  __tgt_offload_entry *EntriesEnd;   // End of table (non inclusive)
+};
+
+/// This struct is a record of all the host code that may be offloaded to a
+/// target.
+struct __tgt_bin_desc {
+  int32_t NumDeviceImages;           // Number of device types supported
+  __tgt_device_image *DeviceImages;  // Array of device images (1 per dev. type)
+  __tgt_offload_entry *HostEntriesBegin; // Begin of table with all host entries
+  __tgt_offload_entry *HostEntriesEnd;   // End of table (non inclusive)
+};
+
+/// This struct contains the offload entries identified by the target runtime
+struct __tgt_target_table {
+  __tgt_offload_entry *EntriesBegin; // Begin of the table with all the entries
+  __tgt_offload_entry
+      *EntriesEnd; // End of the table with all the entries (non inclusive)
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int omp_get_num_devices(void);
+int omp_get_initial_device(void);
+void *omp_target_alloc(size_t size, int device_num);
+void omp_target_free(void *device_ptr, int device_num);
+int omp_target_is_present(void *ptr, int device_num);
+int omp_target_memcpy(void *dst, void *src, size_t length, size_t dst_offset,
+    size_t src_offset, int dst_device, int src_device);
+int omp_target_memcpy_rect(void *dst, void *src, size_t element_size,
+    int num_dims, const size_t *volume, const size_t *dst_offsets,
+    const size_t *src_offsets, const size_t *dst_dimensions,
+    const size_t *src_dimensions, int dst_device, int src_device);
+int omp_target_associate_ptr(void *host_ptr, void *device_ptr, size_t size,
+    size_t device_offset, int device_num);
+int omp_target_disassociate_ptr(void *host_ptr, int device_num);
+
+/// add the clauses of the requires directives in a given file
+void __tgt_register_requires(int64_t flags);
+
+/// adds a target shared library to the target execution image
+void __tgt_register_lib(__tgt_bin_desc *desc);
+
+/// removes a target shared library from the target execution image
+void __tgt_unregister_lib(__tgt_bin_desc *desc);
+
+// creates the host to target data mapping, stores it in the
+// libomptarget.so internal structure (an entry in a stack of data maps) and
+// passes the data to the device;
+void __tgt_target_data_begin(int64_t device_id, int32_t arg_num,
+                             void **args_base, void **args, int64_t *arg_sizes,
+                             int64_t *arg_types);
+void __tgt_target_data_begin_nowait(int64_t device_id, int32_t arg_num,
+                                    void **args_base, void **args,
+                                    int64_t *arg_sizes, int64_t *arg_types,
+                                    int32_t depNum, void *depList,
+                                    int32_t noAliasDepNum,
+                                    void *noAliasDepList);
+
+// passes data from the target, release target memory and destroys the
+// host-target mapping (top entry from the stack of data maps) created by
+// the last __tgt_target_data_begin
+void __tgt_target_data_end(int64_t device_id, int32_t arg_num, void **args_base,
+                           void **args, int64_t *arg_sizes, int64_t *arg_types);
+void __tgt_target_data_end_nowait(int64_t device_id, int32_t arg_num,
+                                  void **args_base, void **args,
+                                  int64_t *arg_sizes, int64_t *arg_types,
+                                  int32_t depNum, void *depList,
+                                  int32_t noAliasDepNum, void *noAliasDepList);
+
+/// passes data to/from the target
+void __tgt_target_data_update(int64_t device_id, int32_t arg_num,
+                              void **args_base, void **args, int64_t *arg_sizes,
+                              int64_t *arg_types);
+void __tgt_target_data_update_nowait(int64_t device_id, int32_t arg_num,
+                                     void **args_base, void **args,
+                                     int64_t *arg_sizes, int64_t *arg_types,
+                                     int32_t depNum, void *depList,
+                                     int32_t noAliasDepNum,
+                                     void *noAliasDepList);
+
+// Performs the same actions as data_begin in case arg_num is non-zero
+// and initiates run of offloaded region on target platform; if arg_num
+// is non-zero after the region execution is done it also performs the
+// same action as data_end above. The following types are used; this
+// function returns 0 if it was able to transfer the execution to a
+// target and an int different from zero otherwise.
+int __tgt_target(int64_t device_id, void *host_ptr, int32_t arg_num,
+                 void **args_base, void **args, int64_t *arg_sizes,
+                 int64_t *arg_types);
+int __tgt_target_nowait(int64_t device_id, void *host_ptr, int32_t arg_num,
+                        void **args_base, void **args, int64_t *arg_sizes,
+                        int64_t *arg_types, int32_t depNum, void *depList,
+                        int32_t noAliasDepNum, void *noAliasDepList);
+
+int __tgt_target_teams(int64_t device_id, void *host_ptr, int32_t arg_num,
+                       void **args_base, void **args, int64_t *arg_sizes,
+                       int64_t *arg_types, int32_t num_teams,
+                       int32_t thread_limit);
+int __tgt_target_teams_nowait(int64_t device_id, void *host_ptr,
+                              int32_t arg_num, void **args_base, void **args,
+                              int64_t *arg_sizes, int64_t *arg_types,
+                              int32_t num_teams, int32_t thread_limit,
+                              int32_t depNum, void *depList,
+                              int32_t noAliasDepNum, void *noAliasDepList);
+void __kmpc_push_target_tripcount(int64_t device_id, uint64_t loop_tripcount);
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef OMPTARGET_DEBUG
+#include <stdio.h>
+#define DEBUGP(prefix, ...)                                                    \
+  {                                                                            \
+    fprintf(stderr, "%s --> ", prefix);                                        \
+    fprintf(stderr, __VA_ARGS__);                                              \
+  }
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#define DPxMOD "0x%0*" PRIxPTR
+#define DPxPTR(ptr) ((int)(2*sizeof(uintptr_t))), ((uintptr_t) (ptr))
+
+/*
+ * To printf a pointer in hex with a fixed width of 16 digits and a leading 0x,
+ * use printf("ptr=" DPxMOD "...\n", DPxPTR(ptr));
+ *
+ * DPxMOD expands to:
+ *   "0x%0*" PRIxPTR
+ * where PRIxPTR expands to an appropriate modifier for the type uintptr_t on a
+ * specific platform, e.g. "lx" if uintptr_t is typedef'd as unsigned long:
+ *   "0x%0*lx"
+ *
+ * Ultimately, the whole statement expands to:
+ *   printf("ptr=0x%0*lx...\n",  // the 0* modifier expects an extra argument
+ *                               // specifying the width of the output
+ *   (int)(2*sizeof(uintptr_t)), // the extra argument specifying the width
+ *                               // 8 digits for 32bit systems
+ *                               // 16 digits for 64bit
+ *   (uintptr_t) ptr);
+ */
+#else
+#define DEBUGP(prefix, ...)                                                    \
+  {}
+#endif
+
+#ifdef __cplusplus
+#define EXTERN extern "C"
+#else
+#define EXTERN extern
+#endif
+
+#endif // _OMPTARGET_H_
diff --git a/final/libomptarget/include/omptargetplugin.h b/final/libomptarget/include/omptargetplugin.h
new file mode 100644
index 0000000..e03416c
--- /dev/null
+++ b/final/libomptarget/include/omptargetplugin.h
@@ -0,0 +1,94 @@
+//===-- omptargetplugin.h - Target dependent OpenMP Plugin API --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an interface between target independent OpenMP offload
+// runtime library libomptarget and target dependent plugin.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OMPTARGETPLUGIN_H_
+#define _OMPTARGETPLUGIN_H_
+
+#include <omptarget.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Return the number of available devices of the type supported by the
+// target RTL.
+int32_t __tgt_rtl_number_of_devices(void);
+
+// Return an integer different from zero if the provided device image can be
+// supported by the runtime. The functionality is similar to comparing the
+// result of __tgt_rtl_load_binary to NULL. However, this is meant to be a
+// lightweight query to determine if the RTL is suitable for an image without
+// having to load the library, which can be expensive.
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image);
+
+// Initialize the requires flags for the device.
+int64_t __tgt_rtl_init_requires(int64_t RequiresFlags);
+
+// Initialize the specified device. In case of success return 0; otherwise
+// return an error code.
+int32_t __tgt_rtl_init_device(int32_t ID);
+
+// Pass an executable image section described by image to the specified
+// device and prepare an address table of target entities. In case of error,
+// return NULL. Otherwise, return a pointer to the built address table.
+// Individual entries in the table may also be NULL, when the corresponding
+// offload region is not supported on the target device.
+__tgt_target_table *__tgt_rtl_load_binary(int32_t ID,
+                                          __tgt_device_image *Image);
+
+// Allocate data on the particular target device, of the specified size.
+// HostPtr is the address of the host data the allocated target data
+// will be associated with (HostPtr may be NULL if it is not known at
+// allocation time, like for example it would be for target data that
+// is allocated by omp_target_alloc() API). Return address of the
+// allocated data on the target that will be used by libomptarget.so to
+// initialize the target data mapping structures. These addresses are
+// used to generate a table of target variables to pass to
+// __tgt_rtl_run_target_region(). The __tgt_rtl_data_alloc() returns NULL in
+// case an error occurred on the target device.
+void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr);
+
+// Pass the data content to the target device using the target address.
+// In case of success, return zero. Otherwise, return an error code.
+int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr,
+                              int64_t Size);
+
+// Retrieve the data content from the target device using its address.
+// In case of success, return zero. Otherwise, return an error code.
+int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr,
+                                int64_t Size);
+
+// De-allocate the data referenced by target ptr on the device. In case of
+// success, return zero. Otherwise, return an error code.
+int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr);
+
+// Transfer control to the offloaded entry Entry on the target device.
+// Args and Offsets are arrays of NumArgs size of target addresses and
+// offsets. An offset should be added to the target address before passing it
+// to the outlined function on device side. In case of success, return zero.
+// Otherwise, return an error code.
+int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args,
+                                    ptrdiff_t *Offsets, int32_t NumArgs);
+
+// Similar to __tgt_rtl_run_target_region, but additionally specify the
+// number of teams to be created and a number of threads in each team.
+int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args,
+                                         ptrdiff_t *Offsets, int32_t NumArgs,
+                                         int32_t NumTeams, int32_t ThreadLimit,
+                                         uint64_t loop_tripcount);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _OMPTARGETPLUGIN_H_
diff --git a/final/libomptarget/plugins/CMakeLists.txt b/final/libomptarget/plugins/CMakeLists.txt
new file mode 100644
index 0000000..f8048ba
--- /dev/null
+++ b/final/libomptarget/plugins/CMakeLists.txt
@@ -0,0 +1,71 @@
+##===----------------------------------------------------------------------===##
+# 
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# 
+##===----------------------------------------------------------------------===##
+#
+# Build plugins for the user system if available.
+#
+##===----------------------------------------------------------------------===##
+
+# void build_generic_elf64(string tmachine, string tmachine_name, string tmachine_libname, string tmachine_triple, string elf_machine_id);
+# - build a plugin for an ELF based generic 64-bit target based on libffi.
+# - tmachine: name of the machine processor as used in the cmake build system.
+# - tmachine_name: name of the machine to be printed with the debug messages.
+# - tmachine_libname: machine name to be appended to the plugin library name.
+# - tmachine_triple: target triple appended to LIBOMPTARGET_SYSTEM_TARGETS.
+# - elf_machine_id: ELF machine id passed to the sources as TARGET_ELF_ID.
+macro(build_generic_elf64 tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id)
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$")
+  if(LIBOMPTARGET_DEP_LIBELF_FOUND)
+    if(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+      libomptarget_say("Building ${tmachine_name} offloading plugin.")
+
+      include_directories(${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR})
+      include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR})
+
+      # Define macro to be used as prefix of the runtime messages for this target.
+      add_definitions("-DTARGET_NAME=${tmachine_name}")
+
+      # Define macro with the ELF ID for this target.
+      add_definitions("-DTARGET_ELF_ID=${elf_machine_id}")
+
+      add_library("omptarget.rtl.${tmachine_libname}" SHARED
+        ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp)
+
+      # Install plugin under the lib destination folder.
+      install(TARGETS "omptarget.rtl.${tmachine_libname}"
+        LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+
+      target_link_libraries(
+        "omptarget.rtl.${tmachine_libname}"
+        ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES}
+        ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
+        dl
+        "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports")
+
+      # Report to the parent scope that we are building a plugin.
+      set(LIBOMPTARGET_SYSTEM_TARGETS
+        "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE)
+    else()
+      libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.")
+    endif()
+  else()
+    libomptarget_say("Not building ${tmachine_name} offloading plugin: libelf dependency not found.")
+  endif()
+else()
+  libomptarget_say("Not building ${tmachine_name} offloading plugin: machine not found in the system.")
+endif()
+endmacro()
+
+add_subdirectory(aarch64)
+add_subdirectory(cuda)
+add_subdirectory(ppc64)
+add_subdirectory(ppc64le)
+add_subdirectory(x86_64)
+
+# Make sure the parent scope can see the plugins that will be created.
+set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
+
diff --git a/final/libomptarget/plugins/aarch64/CMakeLists.txt b/final/libomptarget/plugins/aarch64/CMakeLists.txt
new file mode 100644
index 0000000..350a56c
--- /dev/null
+++ b/final/libomptarget/plugins/aarch64/CMakeLists.txt
@@ -0,0 +1,17 @@
+##===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for an aarch64 machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+if(NOT CMAKE_SYSTEM_NAME MATCHES "Linux")
+  libomptarget_say("Not building aarch64 offloading plugin: machine not found in the system.")
+else()
+  build_generic_elf64("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183")
+endif()
diff --git a/final/libomptarget/plugins/common/elf_common.c b/final/libomptarget/plugins/common/elf_common.c
new file mode 100644
index 0000000..60e1e4f
--- /dev/null
+++ b/final/libomptarget/plugins/common/elf_common.c
@@ -0,0 +1,72 @@
+//===-- elf_common.c - Common ELF functionality -------------------*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Common ELF functionality for target plugins.
+// Must be included in the plugin source file AFTER omptarget.h has been
+// included and macro DP(...) has been defined.
+// .
+//
+//===----------------------------------------------------------------------===//
+
+#if !(defined(_OMPTARGET_H_) && defined(DP))
+#error Include elf_common.c in the plugin source AFTER omptarget.h has been\
+ included and macro DP(...) has been defined.
+#endif
+
+#include <elf.h>
+#include <libelf.h>
+
+// Check whether an image is valid for execution on target_id.
+// Returns 1 if the ELF machine ID of the image equals target_id and 0
+// otherwise, including when the image cannot be parsed as an ELF object.
+static inline int32_t elf_check_machine(__tgt_device_image *image,
+    uint16_t target_id) {
+  // Is the library version incompatible with the header file?
+  if (elf_version(EV_CURRENT) == EV_NONE) {
+    DP("Incompatible ELF library!\n");
+    return 0;
+  }
+
+  char *img_begin = (char *)image->ImageStart;
+  char *img_end = (char *)image->ImageEnd;
+  size_t img_size = img_end - img_begin;
+
+  // Obtain elf handler
+  Elf *e = elf_memory(img_begin, img_size);
+  if (!e) {
+    DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
+    return 0;
+  }
+
+  // Check if ELF is the right kind. The handle must be released on every
+  // exit path from here on.
+  if (elf_kind(e) != ELF_K_ELF) {
+    DP("Unexpected ELF type!\n");
+    elf_end(e);
+    return 0;
+  }
+  Elf64_Ehdr *eh64 = elf64_getehdr(e);
+  Elf32_Ehdr *eh32 = elf32_getehdr(e);
+
+  if (!eh64 && !eh32) {
+    DP("Unable to get machine ID from ELF file!\n");
+    elf_end(e);
+    return 0;
+  }
+
+  // Exactly one of the two header lookups is expected to succeed.
+  if (eh64 && eh32) {
+    DP("Ambiguous ELF header!\n");
+    elf_end(e);
+    return 0;
+  }
+  uint16_t MachineID = eh64 ? eh64->e_machine : eh32->e_machine;
+
+  elf_end(e);
+  return MachineID == target_id;
+}
diff --git a/final/libomptarget/plugins/cuda/CMakeLists.txt b/final/libomptarget/plugins/cuda/CMakeLists.txt
new file mode 100644
index 0000000..5fab421
--- /dev/null
+++ b/final/libomptarget/plugins/cuda/CMakeLists.txt
@@ -0,0 +1,49 @@
+##===----------------------------------------------------------------------===##
+# 
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# 
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a CUDA machine if available.
+#
+##===----------------------------------------------------------------------===##
+if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64|ppc64le)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
+  libomptarget_say("Not building CUDA offloading plugin: only support CUDA in Linux x86_64 or ppc64le hosts.")
+  return()
+elseif(NOT LIBOMPTARGET_DEP_LIBELF_FOUND)
+  libomptarget_say("Not building CUDA offloading plugin: libelf dependency not found.")
+  return()
+elseif(NOT LIBOMPTARGET_DEP_CUDA_FOUND)
+  libomptarget_say("Not building CUDA offloading plugin: CUDA not found in system.")
+  return()
+elseif(NOT LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND)
+  libomptarget_say("Not building CUDA offloading plugin: CUDA Driver API not found in system.")
+  return()
+endif()
+
+libomptarget_say("Building CUDA offloading plugin.")
+
+# Define the suffix for the runtime messaging dumps.
+add_definitions(-DTARGET_NAME=CUDA)
+
+if(LIBOMPTARGET_CMAKE_BUILD_TYPE MATCHES debug)
+    add_definitions(-DCUDA_ERROR_REPORT)
+endif()
+
+include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS})
+include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS})
+
+add_library(omptarget.rtl.cuda SHARED src/rtl.cpp)
+
+# Install plugin under the lib destination folder.
+install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+
+target_link_libraries(omptarget.rtl.cuda
+  ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES}
+  ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
+  "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports")
+
+# Report to the parent scope that we are building a plugin for CUDA.
+set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda" PARENT_SCOPE)
diff --git a/final/libomptarget/plugins/cuda/src/rtl.cpp b/final/libomptarget/plugins/cuda/src/rtl.cpp
new file mode 100644
index 0000000..04a3ddc
--- /dev/null
+++ b/final/libomptarget/plugins/cuda/src/rtl.cpp
@@ -0,0 +1,794 @@
+//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL for CUDA machine
+//
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+#include <cstddef>
+#include <cuda.h>
+#include <list>
+#include <string>
+#include <vector>
+
+#include "omptargetplugin.h"
+
+#ifndef TARGET_NAME
+#define TARGET_NAME CUDA
+#endif
+
+#ifdef OMPTARGET_DEBUG
+static int DebugLevel = 0;
+
+#define GETNAME2(name) #name
+#define GETNAME(name) GETNAME2(name)
+#define DP(...) \
+  do { \
+    if (DebugLevel > 0) { \
+      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
+    } \
+  } while (false)
+#else // OMPTARGET_DEBUG
+#define DP(...) {}
+#endif // OMPTARGET_DEBUG
+
+#include "../../common/elf_common.c"
+
+// Utility for retrieving and printing CUDA error string.
+#ifdef CUDA_ERROR_REPORT
+#define CUDA_ERR_STRING(err)                                                   \
+  do {                                                                         \
+    const char *errStr;                                                        \
+    cuGetErrorString(err, &errStr);                                            \
+    DP("CUDA error is: %s\n", errStr);                                         \
+  } while (0)
+#else
+#define CUDA_ERR_STRING(err)                                                   \
+  {}
+#endif
+
+/// Keep entries table per device.
+struct FuncOrGblEntryTy {
+  __tgt_target_table Table;
+  std::vector<__tgt_offload_entry> Entries;
+};
+
+enum ExecutionModeType {
+  SPMD, // constructors, destructors,
+        // combined constructs (`teams distribute parallel for [simd]`)
+  GENERIC, // everything else
+  NONE
+};
+
+/// Use a single entity to encode a kernel and a set of flags
+struct KernelTy {
+  CUfunction Func;
+
+  // execution mode of kernel
+  // 0 - SPMD mode (without master warp)
+  // 1 - Generic mode (with master warp)
+  int8_t ExecutionMode;
+
+  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
+      : Func(_Func), ExecutionMode(_ExecutionMode) {}
+};
+
+/// Device environment data
+/// Manually sync with the deviceRTL side for now, move to a dedicated header file later.
+struct omptarget_device_environmentTy {
+  int32_t debug_level;
+};
+
+/// List that contains all the kernels.
+/// FIXME: we may need this to be per device and per library.
+std::list<KernelTy> KernelsList;
+
+/// Class containing all the device information.
+class RTLDeviceInfoTy {
+  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
+
+public:
+  int NumberOfDevices;
+  std::vector<CUmodule> Modules;
+  std::vector<CUcontext> Contexts;
+
+  // Device properties
+  std::vector<int> ThreadsPerBlock;
+  std::vector<int> BlocksPerGrid;
+  std::vector<int> WarpSize;
+
+  // OpenMP properties
+  std::vector<int> NumTeams;
+  std::vector<int> NumThreads;
+
+  // OpenMP Environment properties
+  int EnvNumTeams;
+  int EnvTeamLimit;
+
+  // OpenMP Requires Flags
+  int64_t RequiresFlags;
+
+  //static int EnvNumThreads;
+  static const int HardTeamLimit = 1<<16; // 64k
+  static const int HardThreadLimit = 1024;
+  static const int DefaultNumTeams = 128;
+  static const int DefaultNumThreads = 128;
+
+  // Record entry point associated with device. Like the other accessors below,
+  // this uses the most recently added table, so clearOffloadEntriesTable()
+  // must have been called for the device first.
+  void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    E.Entries.push_back(entry);
+  }
+
+  // Return true if the entry is associated with device
+  bool findOffloadEntry(int32_t device_id, void *addr) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    for (auto &it : E.Entries) {
+      if (it.addr == addr)
+        return true;
+    }
+
+    return false;
+  }
+
+  // Return the pointer to the target entries table
+  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    int32_t size = E.Entries.size();
+
+    // Table is empty
+    if (!size)
+      return 0;
+
+    // Update table info according to the entries and return the pointer.
+    // EntriesEnd points one past the last entry.
+    E.Table.EntriesBegin = &E.Entries[0];
+    E.Table.EntriesEnd = E.Table.EntriesBegin + size;
+
+    return &E.Table;
+  }
+
+  // Start a new, empty entries table for a device; earlier tables are kept.
+  void clearOffloadEntriesTable(int32_t device_id) {
+    assert(device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncGblEntries[device_id].emplace_back();
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+    E.Entries.clear();
+    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
+  }
+
+  // Probe the CUDA driver and size the per-device tables. Every scalar member
+  // gets its default up front so the early returns below (driver failure or
+  // no devices) never leave the object partially initialized.
+  RTLDeviceInfoTy()
+      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
+        RequiresFlags(OMP_REQ_UNDEFINED) {
+#ifdef OMPTARGET_DEBUG
+    // NOTE(review): std::stoi throws on a non-numeric value; confirm that
+    // aborting while the plugin is being loaded is acceptable.
+    if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
+      DebugLevel = std::stoi(envStr);
+    }
+#endif // OMPTARGET_DEBUG
+
+    DP("Start initializing CUDA\n");
+
+    CUresult err = cuInit(0);
+    if (err != CUDA_SUCCESS) {
+      DP("Error when initializing CUDA\n");
+      CUDA_ERR_STRING(err);
+      return;
+    }
+
+    err = cuDeviceGetCount(&NumberOfDevices);
+    if (err != CUDA_SUCCESS) {
+      DP("Error when getting CUDA device count\n");
+      CUDA_ERR_STRING(err);
+      NumberOfDevices = 0;
+      return;
+    }
+
+    if (NumberOfDevices == 0) {
+      DP("There are no devices supporting CUDA.\n");
+      return;
+    }
+
+    // One slot per device; Modules is not sized here.
+    FuncGblEntries.resize(NumberOfDevices);
+    Contexts.resize(NumberOfDevices);
+    ThreadsPerBlock.resize(NumberOfDevices);
+    BlocksPerGrid.resize(NumberOfDevices);
+    WarpSize.resize(NumberOfDevices);
+    NumTeams.resize(NumberOfDevices);
+    NumThreads.resize(NumberOfDevices);
+
+    // Get environment variables regarding teams; -1 (set above) means unset.
+    char *envStr = getenv("OMP_TEAM_LIMIT");
+    if (envStr) {
+      // OMP_TEAM_LIMIT has been set
+      EnvTeamLimit = std::stoi(envStr);
+      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
+    }
+    envStr = getenv("OMP_NUM_TEAMS");
+    if (envStr) {
+      // OMP_NUM_TEAMS has been set
+      EnvNumTeams = std::stoi(envStr);
+      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
+    }
+  }
+
+  ~RTLDeviceInfoTy() {
+    // Close modules
+    for (auto &module : Modules)
+      if (module) {
+        CUresult err = cuModuleUnload(module);
+        if (err != CUDA_SUCCESS) {
+          DP("Error when unloading CUDA module\n");
+          CUDA_ERR_STRING(err);
+        }
+      }
+
+    // Destroy contexts
+    for (auto &ctx : Contexts)
+      if (ctx) {
+        CUresult err = cuCtxDestroy(ctx);
+        if (err != CUDA_SUCCESS) {
+          DP("Error when destroying CUDA context\n");
+          CUDA_ERR_STRING(err);
+        }
+      }
+  }
+};
+
+// Single file-local instance holding the state shared by all the
+// __tgt_rtl_* entry points below.
+static RTLDeviceInfoTy DeviceInfo;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Report whether `image` is accepted by this plugin, i.e. whether
+// elf_check_machine matches it against the CUDA ELF machine id.
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
+  // ELF machine identifier for CUDA (EM_CUDA = 190).
+  const int CudaElfMachineId = 190;
+  return elf_check_machine(image, CudaElfMachineId);
+}
+
+// Number of CUDA devices recorded in DeviceInfo.
+int32_t __tgt_rtl_number_of_devices() {
+  const int32_t DeviceCount = DeviceInfo.NumberOfDevices;
+  return DeviceCount;
+}
+
+// Record the `requires` flags passed by the caller in the plugin state and
+// return them unchanged.
+int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
+  // int64_t is not `long` on every platform, so it is printed with PRId64
+  // (as the other int64_t values in this file are) rather than with %ld.
+  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
+  DeviceInfo.RequiresFlags = RequiresFlags;
+  return RequiresFlags;
+}
+
+// Initialize the CUDA device `device_id`: create its context and derive the
+// launch limits (blocks per grid, threads per block, warp size) and the
+// default team/thread counts from the device attributes and from the
+// OMP_TEAM_LIMIT / OMP_NUM_TEAMS values stored in DeviceInfo.
+//
+// Returns OFFLOAD_SUCCESS, or OFFLOAD_FAIL if the device cannot be obtained
+// or its context cannot be created. A failing attribute query is not fatal:
+// a default value is used instead.
+int32_t __tgt_rtl_init_device(int32_t device_id) {
+
+  CUdevice cuDevice;
+  DP("Getting device %d\n", device_id);
+  CUresult err = cuDeviceGet(&cuDevice, device_id);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when getting CUDA device with id = %d\n", device_id);
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  // Create the context and save it to use whenever this device is selected.
+  err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
+                    cuDevice);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when creating a CUDA context\n");
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  // Query attributes to determine number of threads/block and blocks/grid.
+  int maxGridDimX;
+  err = cuDeviceGetAttribute(&maxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
+                             cuDevice);
+  if (err != CUDA_SUCCESS) {
+    DP("Error getting max grid dimension, use default\n");
+    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
+  } else if (maxGridDimX <= RTLDeviceInfoTy::HardTeamLimit) {
+    DeviceInfo.BlocksPerGrid[device_id] = maxGridDimX;
+    DP("Using %d CUDA blocks per grid\n", maxGridDimX);
+  } else {
+    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
+    DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
+       "at the hard limit\n",
+       maxGridDimX, RTLDeviceInfoTy::HardTeamLimit);
+  }
+
+  // We are only exploiting threads along the x axis.
+  int maxBlockDimX;
+  err = cuDeviceGetAttribute(&maxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
+                             cuDevice);
+  if (err != CUDA_SUCCESS) {
+    DP("Error getting max block dimension, use default\n");
+    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
+  } else if (maxBlockDimX <= RTLDeviceInfoTy::HardThreadLimit) {
+    DeviceInfo.ThreadsPerBlock[device_id] = maxBlockDimX;
+    DP("Using %d CUDA threads per block\n", maxBlockDimX);
+  } else {
+    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
+    // The two literals are concatenated, so the first one needs the trailing
+    // space to keep "capping" and "at" apart.
+    DP("Max CUDA threads per block %d exceeds the hard thread limit %d, capping "
+       "at the hard limit\n",
+       maxBlockDimX, RTLDeviceInfoTy::HardThreadLimit);
+  }
+
+  int warpSize;
+  err =
+      cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuDevice);
+  if (err != CUDA_SUCCESS) {
+    DP("Error getting warp size, assume default\n");
+    DeviceInfo.WarpSize[device_id] = 32;
+  } else {
+    DeviceInfo.WarpSize[device_id] = warpSize;
+  }
+
+  // Adjust teams to the env variables
+  if (DeviceInfo.EnvTeamLimit > 0 &&
+      DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
+    DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
+    DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
+        DeviceInfo.EnvTeamLimit);
+  }
+
+  DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
+     DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],
+     DeviceInfo.WarpSize[device_id]);
+
+  // Set default number of teams
+  if (DeviceInfo.EnvNumTeams > 0) {
+    DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
+    DP("Default number of teams set according to environment %d\n",
+        DeviceInfo.EnvNumTeams);
+  } else {
+    DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
+    DP("Default number of teams set according to library's default %d\n",
+        RTLDeviceInfoTy::DefaultNumTeams);
+  }
+  if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) {
+    DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];
+    DP("Default number of teams exceeds device limit, capping at %d\n",
+        DeviceInfo.BlocksPerGrid[device_id]);
+  }
+
+  // Set default number of threads
+  DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
+  DP("Default number of threads set according to library's default %d\n",
+          RTLDeviceInfoTy::DefaultNumThreads);
+  if (DeviceInfo.NumThreads[device_id] >
+      DeviceInfo.ThreadsPerBlock[device_id]) {
+    // Cap the default *thread* count; the team count computed above must not
+    // be overwritten with a threads-per-block value.
+    DeviceInfo.NumThreads[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
+    DP("Default number of threads exceeds device limit, capping at %d\n",
+        DeviceInfo.ThreadsPerBlock[device_id]);
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+
+// Load the device image `image` as a CUDA module on device `device_id` and
+// build the table of device-side offload entries for it.
+//
+// Every host entry in [image->EntriesBegin, image->EntriesEnd) is resolved by
+// name in the module: entries with a non-zero size as globals, the others as
+// kernels (together with their optional `<name>_exec_mode` global). Finally
+// the device environment is copied into the module's
+// `omptarget_device_environment` global when that symbol exists.
+//
+// Returns the entries table of the device, or NULL on any failure.
+__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
+    __tgt_device_image *image) {
+
+  // Set the context we are using.
+  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when setting a CUDA context for device %d\n", device_id);
+    CUDA_ERR_STRING(err);
+    return NULL;
+  }
+
+  // Clear the offload table as we are going to create a new one.
+  DeviceInfo.clearOffloadEntriesTable(device_id);
+
+  // Create the module and extract the function pointers.
+
+  CUmodule cumod;
+  DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart));
+  err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when loading CUDA module\n");
+    CUDA_ERR_STRING(err);
+    return NULL;
+  }
+
+  DP("CUDA module successfully loaded!\n");
+  DeviceInfo.Modules.push_back(cumod);
+
+  // Find the symbols in the module by name.
+  __tgt_offload_entry *HostBegin = image->EntriesBegin;
+  __tgt_offload_entry *HostEnd = image->EntriesEnd;
+
+  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {
+
+    if (!e->addr) {
+      // We return NULL when something like this happens, the host should have
+      // always something in the address to uniquely identify the target region.
+      DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size);
+
+      return NULL;
+    }
+
+    // A non-zero size marks a global variable; kernels are handled below.
+    if (e->size) {
+      __tgt_offload_entry entry = *e;
+
+      CUdeviceptr cuptr;
+      size_t cusize;
+      err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name);
+
+      if (err != CUDA_SUCCESS) {
+        DP("Loading global '%s' (Failed)\n", e->name);
+        CUDA_ERR_STRING(err);
+        return NULL;
+      }
+
+      if (cusize != e->size) {
+        DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name,
+            cusize, e->size);
+        CUDA_ERR_STRING(err);
+        return NULL;
+      }
+
+      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
+          DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr));
+      entry.addr = (void *)cuptr;
+
+      // Note: In the current implementation declare target variables
+      // can either be link or to. This means that once unified
+      // memory is activated via the requires directive, the variable
+      // can be used directly from the host in both cases.
+      // TODO: when variables types other than to or link are added,
+      // the below condition should be changed to explicitly
+      // check for to and link variables types:
+      //  (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
+      //   (e->flags & OMP_DECLARE_TARGET_LINK ||
+      //    e->flags == OMP_DECLARE_TARGET_TO))
+      if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
+        // If unified memory is present any target link or to variables
+        // can access host addresses directly. There is no longer a
+        // need for device copies.
+        // The copy result is checked like every other CUDA call here, so a
+        // failed transfer does not leave a stale device value unnoticed.
+        err = cuMemcpyHtoD(cuptr, e->addr, sizeof(void *));
+        if (err != CUDA_SUCCESS) {
+          DP("Error when copying data from host to device. Pointers: "
+             "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
+             DPxPTR(e->addr), DPxPTR(cuptr), sizeof(void *));
+          CUDA_ERR_STRING(err);
+          return NULL;
+        }
+        DP("Copy linked variable host address (" DPxMOD ") "
+           "to device address (" DPxMOD ")\n",
+          DPxPTR(*((void**)e->addr)), DPxPTR(cuptr));
+      }
+
+      DeviceInfo.addOffloadEntry(device_id, entry);
+
+      continue;
+    }
+
+    CUfunction fun;
+    err = cuModuleGetFunction(&fun, cumod, e->name);
+
+    if (err != CUDA_SUCCESS) {
+      DP("Loading '%s' (Failed)\n", e->name);
+      CUDA_ERR_STRING(err);
+      return NULL;
+    }
+
+    DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
+        DPxPTR(e - HostBegin), e->name, DPxPTR(fun));
+
+    // default value GENERIC (in case symbol is missing from cubin file)
+    int8_t ExecModeVal = ExecutionModeType::GENERIC;
+    std::string ExecModeNameStr (e->name);
+    ExecModeNameStr += "_exec_mode";
+    const char *ExecModeName = ExecModeNameStr.c_str();
+
+    CUdeviceptr ExecModePtr;
+    size_t cusize;
+    err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName);
+    if (err == CUDA_SUCCESS) {
+      if ((size_t)cusize != sizeof(int8_t)) {
+        DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
+           ExecModeName, cusize, sizeof(int8_t));
+        CUDA_ERR_STRING(err);
+        return NULL;
+      }
+
+      err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize);
+      if (err != CUDA_SUCCESS) {
+        DP("Error when copying data from device to host. Pointers: "
+           "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
+           DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize);
+        CUDA_ERR_STRING(err);
+        return NULL;
+      }
+
+      if (ExecModeVal < 0 || ExecModeVal > 1) {
+        DP("Error wrong exec_mode value specified in cubin file: %d\n",
+           ExecModeVal);
+        return NULL;
+      }
+    } else {
+      DP("Loading global exec_mode '%s' - symbol missing, using default value "
+          "GENERIC (1)\n", ExecModeName);
+      CUDA_ERR_STRING(err);
+    }
+
+    // The device-side entry address is the address of the stored KernelTy.
+    KernelsList.push_back(KernelTy(fun, ExecModeVal));
+
+    __tgt_offload_entry entry = *e;
+    entry.addr = (void *)&KernelsList.back();
+    DeviceInfo.addOffloadEntry(device_id, entry);
+  }
+
+  // send device environment data to the device
+  {
+    omptarget_device_environmentTy device_env;
+
+    device_env.debug_level = 0;
+
+#ifdef OMPTARGET_DEBUG
+    if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
+      device_env.debug_level = std::stoi(envStr);
+    }
+#endif
+
+    const char * device_env_Name="omptarget_device_environment";
+    CUdeviceptr device_env_Ptr;
+    size_t cusize;
+
+    err = cuModuleGetGlobal(&device_env_Ptr, &cusize, cumod, device_env_Name);
+
+    if (err == CUDA_SUCCESS) {
+      if ((size_t)cusize != sizeof(device_env)) {
+        // Report the size that was actually compared against.
+        DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
+            device_env_Name, cusize, sizeof(device_env));
+        CUDA_ERR_STRING(err);
+        return NULL;
+      }
+
+      err = cuMemcpyHtoD(device_env_Ptr, &device_env, cusize);
+      if (err != CUDA_SUCCESS) {
+        DP("Error when copying data from host to device. Pointers: "
+            "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
+            DPxPTR(&device_env), DPxPTR(device_env_Ptr), cusize);
+        CUDA_ERR_STRING(err);
+        return NULL;
+      }
+
+      DP("Sending global device environment data %zu bytes\n", (size_t)cusize);
+    } else {
+      // A missing symbol is not an error: the image is loaded without
+      // sending any environment data.
+      DP("Finding global device environment '%s' - symbol missing.\n", device_env_Name);
+      DP("Continue, considering this is a device RTL which does not accept environment setting.\n");
+    }
+  }
+
+  return DeviceInfo.getOffloadEntriesTable(device_id);
+}
+
+// Allocate `size` bytes on device `device_id` with cuMemAlloc. Returns the
+// device pointer, or NULL for a zero-sized request or when a CUDA call fails.
+// `hst_ptr` is not used.
+void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
+  // Nothing to allocate for an empty request.
+  if (size == 0)
+    return NULL;
+
+  // Make the device's context current before allocating.
+  CUresult Status = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (Status != CUDA_SUCCESS) {
+    DP("Error while trying to set CUDA current context\n");
+    CUDA_ERR_STRING(Status);
+    return NULL;
+  }
+
+  CUdeviceptr DevPtr;
+  Status = cuMemAlloc(&DevPtr, size);
+  if (Status == CUDA_SUCCESS)
+    return (void *)DevPtr;
+
+  DP("Error while trying to allocate %d\n", Status);
+  CUDA_ERR_STRING(Status);
+  return NULL;
+}
+
+// Copy `size` bytes from host memory `hst_ptr` to device memory `tgt_ptr` on
+// device `device_id`. Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
+int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
+    int64_t size) {
+  // Make the device's context current before copying.
+  CUresult Status = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (Status != CUDA_SUCCESS) {
+    DP("Error when setting CUDA context\n");
+    CUDA_ERR_STRING(Status);
+    return OFFLOAD_FAIL;
+  }
+
+  Status = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size);
+  if (Status == CUDA_SUCCESS)
+    return OFFLOAD_SUCCESS;
+
+  DP("Error when copying data from host to device. Pointers: host = " DPxMOD
+     ", device = " DPxMOD ", size = %" PRId64 "\n",
+     DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size);
+  CUDA_ERR_STRING(Status);
+  return OFFLOAD_FAIL;
+}
+
+// Copy `size` bytes from device memory `tgt_ptr` on device `device_id` back
+// to host memory `hst_ptr`. Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
+int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
+    int64_t size) {
+  // Make the device's context current before copying.
+  CUresult Status = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (Status != CUDA_SUCCESS) {
+    DP("Error when setting CUDA context\n");
+    CUDA_ERR_STRING(Status);
+    return OFFLOAD_FAIL;
+  }
+
+  Status = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size);
+  if (Status == CUDA_SUCCESS)
+    return OFFLOAD_SUCCESS;
+
+  DP("Error when copying data from device to host. Pointers: host = " DPxMOD
+     ", device = " DPxMOD ", size = %" PRId64 "\n",
+     DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size);
+  CUDA_ERR_STRING(Status);
+  return OFFLOAD_FAIL;
+}
+
+// Free the device memory `tgt_ptr` on device `device_id` with cuMemFree.
+// Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
+int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
+  // Make the device's context current before freeing.
+  CUresult Status = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (Status != CUDA_SUCCESS) {
+    DP("Error when setting CUDA context\n");
+    CUDA_ERR_STRING(Status);
+    return OFFLOAD_FAIL;
+  }
+
+  Status = cuMemFree((CUdeviceptr)tgt_ptr);
+  if (Status == CUDA_SUCCESS)
+    return OFFLOAD_SUCCESS;
+
+  DP("Error when freeing CUDA memory\n");
+  CUDA_ERR_STRING(Status);
+  return OFFLOAD_FAIL;
+}
+
+// Launch the kernel behind `tgt_entry_ptr` on device `device_id` and wait for
+// it to finish.
+//
+// tgt_args / tgt_offsets: `arg_num` base pointers and offsets; the kernel
+//   receives base + offset for each argument.
+// team_num: requested number of teams; a value <= 0 lets the plugin choose.
+// thread_limit: requested threads per team; a value <= 0 selects the default.
+// loop_tripcount: only used, when positive, to choose the number of teams if
+//   neither team_num nor OMP_NUM_TEAMS fixes it.
+//
+// Returns OFFLOAD_SUCCESS, or OFFLOAD_FAIL if the context cannot be set, the
+// launch fails or the kernel execution reports an error.
+int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
+    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
+    int32_t thread_limit, uint64_t loop_tripcount) {
+  // Set the context we are using.
+  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
+  if (err != CUDA_SUCCESS) {
+    DP("Error when setting CUDA context\n");
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  // All args are references.
+  std::vector<void *> args(arg_num);
+  std::vector<void *> ptrs(arg_num);
+
+  for (int32_t i = 0; i < arg_num; ++i) {
+    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
+    args[i] = &ptrs[i];
+  }
+
+  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
+
+  int cudaThreadsPerBlock;
+
+  if (thread_limit > 0) {
+    cudaThreadsPerBlock = thread_limit;
+    DP("Setting CUDA threads per block to requested %d\n", thread_limit);
+    // Add master warp if necessary
+    if (KernelInfo->ExecutionMode == GENERIC) {
+      cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
+      DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
+    }
+  } else {
+    cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
+    DP("Setting CUDA threads per block to default %d\n",
+        DeviceInfo.NumThreads[device_id]);
+  }
+
+  if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
+    cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
+    DP("Threads per block capped at device limit %d\n",
+        DeviceInfo.ThreadsPerBlock[device_id]);
+  }
+
+  // The kernel itself may impose a lower limit than the device; a failing
+  // query simply leaves the value computed above in place.
+  int kernel_limit;
+  err = cuFuncGetAttribute(&kernel_limit,
+      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func);
+  if (err == CUDA_SUCCESS) {
+    if (kernel_limit < cudaThreadsPerBlock) {
+      cudaThreadsPerBlock = kernel_limit;
+      DP("Threads per block capped at kernel limit %d\n", kernel_limit);
+    }
+  }
+
+  int cudaBlocksPerGrid;
+  if (team_num <= 0) {
+    if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
+      // Computed in 64 bits: the trip count need not fit in an int.
+      uint64_t tripcountBlocks;
+      if (KernelInfo->ExecutionMode == SPMD) {
+        // We have a combined construct, i.e. `target teams distribute parallel
+        // for [simd]`. We launch so many teams so that each thread will
+        // execute one iteration of the loop.
+        // round up to the nearest integer
+        tripcountBlocks = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
+      } else {
+        // If we reach this point, then we have a non-combined construct, i.e.
+        // `teams distribute` with a nested `parallel for` and each team is
+        // assigned one iteration of the `distribute` loop. E.g.:
+        //
+        // #pragma omp target teams distribute
+        // for(...loop_tripcount...) {
+        //   #pragma omp parallel for
+        //   for(...) {}
+        // }
+        //
+        // Threads within a team will execute the iterations of the `parallel`
+        // loop.
+        tripcountBlocks = loop_tripcount;
+      }
+
+      // Apply the same per-device limit (which already accounts for
+      // OMP_TEAM_LIMIT) that is applied to an explicitly requested team_num.
+      // This also guarantees that the value fits in an int.
+      const int blockLimit = DeviceInfo.BlocksPerGrid[device_id];
+      if (tripcountBlocks > (uint64_t)blockLimit) {
+        tripcountBlocks = blockLimit;
+        DP("Capping number of teams to team limit %d\n", blockLimit);
+      }
+      cudaBlocksPerGrid = (int)tripcountBlocks;
+
+      DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
+          "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
+          cudaThreadsPerBlock);
+    } else {
+      cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
+      DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
+    }
+  } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
+    cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
+    DP("Capping number of teams to team limit %d\n",
+        DeviceInfo.BlocksPerGrid[device_id]);
+  } else {
+    cudaBlocksPerGrid = team_num;
+    DP("Using requested number of teams %d\n", team_num);
+  }
+
+  // Run on the device.
+  DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
+     cudaThreadsPerBlock);
+
+  // data() is used instead of &args[0]: indexing an empty vector (a region
+  // without arguments) is undefined.
+  err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
+      cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, args.data(),
+      0);
+  if (err != CUDA_SUCCESS) {
+    DP("Device kernel launch failed!\n");
+    CUDA_ERR_STRING(err);
+    return OFFLOAD_FAIL;
+  }
+
+  DP("Launch of entry point at " DPxMOD " successful!\n",
+      DPxPTR(tgt_entry_ptr));
+
+  // Wait for the kernel; execution errors are reported by the synchronize.
+  CUresult sync_err = cuCtxSynchronize();
+  if (sync_err != CUDA_SUCCESS) {
+    DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr));
+    CUDA_ERR_STRING(sync_err);
+    return OFFLOAD_FAIL;
+  } else {
+    DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+
+// Run a target region without a teams construct: forwards to
+// __tgt_rtl_run_target_team_region with one team, the default number of
+// threads and no loop trip count.
+int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
+    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
+  return __tgt_rtl_run_target_team_region(
+      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
+      /*team_num=*/1, /*thread_limit=*/0, /*loop_tripcount=*/0);
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/final/libomptarget/plugins/exports b/final/libomptarget/plugins/exports
new file mode 100644
index 0000000..a14bedf
--- /dev/null
+++ b/final/libomptarget/plugins/exports
@@ -0,0 +1,16 @@
+/* Symbol version map: the __tgt_rtl_* entry points listed under `global` are
+   exported; every other symbol is made local by the `*` wildcard. */
+VERS1.0 {
+  global:
+    __tgt_rtl_is_valid_binary;
+    __tgt_rtl_number_of_devices;
+    __tgt_rtl_init_requires;
+    __tgt_rtl_init_device;
+    __tgt_rtl_load_binary;
+    __tgt_rtl_data_alloc;
+    __tgt_rtl_data_submit;
+    __tgt_rtl_data_retrieve;
+    __tgt_rtl_data_delete;
+    __tgt_rtl_run_target_team_region;
+    __tgt_rtl_run_target_region;
+  local:
+    *;
+};
diff --git a/final/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp b/final/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
new file mode 100644
index 0000000..4648e51
--- /dev/null
+++ b/final/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
@@ -0,0 +1,339 @@
+//===-RTLs/generic-64bit/src/rtl.cpp - Target RTLs Implementation - C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL for generic 64-bit machine
+//
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+#include <dlfcn.h>
+#include <ffi.h>
+#include <gelf.h>
+#include <link.h>
+#include <list>
+#include <string>
+#include <vector>
+
+#include "omptargetplugin.h"
+
+// Target name used as the prefix of debug messages; may be overridden at
+// compile time.
+#ifndef TARGET_NAME
+#define TARGET_NAME Generic ELF - 64bit
+#endif
+
+// ELF machine id accepted by this plugin. With the default of 0,
+// __tgt_rtl_is_valid_binary rejects every image.
+#ifndef TARGET_ELF_ID
+#define TARGET_ELF_ID 0
+#endif
+
+#ifdef OMPTARGET_DEBUG
+// Debug verbosity, set from LIBOMPTARGET_DEBUG in the RTLDeviceInfoTy
+// constructor. DP only prints when it is greater than zero.
+static int DebugLevel = 0;
+
+// Two-level expansion so that TARGET_NAME is macro-expanded before it is
+// turned into a string literal.
+#define GETNAME2(name) #name
+#define GETNAME(name) GETNAME2(name)
+// Debug print: forwards its arguments to DEBUGP, prefixed with the target
+// name.
+#define DP(...) \
+  do { \
+    if (DebugLevel > 0) { \
+      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
+    } \
+  } while (false)
+#else // OMPTARGET_DEBUG
+// Without OMPTARGET_DEBUG, DP expands to an empty block.
+#define DP(...) {}
+#endif // OMPTARGET_DEBUG
+
+#include "../../common/elf_common.c"
+
+// Number of devices this plugin reports and sizes its entry tables for.
+#define NUMBER_OF_DEVICES 4
+// Name of the ELF section that holds the offload entries of an image.
+#define OFFLOADSECTIONNAME ".omp_offloading.entries"
+
+/// A dynamic library loaded for this target.
+struct DynLibTy {
+  // Path of the file the library was loaded from; ~RTLDeviceInfoTy passes it
+  // to remove() after closing the handle.
+  char *FileName;
+  // Handle returned by dlopen; a null handle is skipped on cleanup.
+  void *Handle;
+};
+
+/// Keep entries table per device.
+struct FuncOrGblEntryTy {
+  // Begin/end pointers of the offload entries of one loaded image.
+  __tgt_target_table Table;
+};
+
+/// Class containing all the device information.
+///
+/// Keeps, for every device id, the list of offload entry tables created for
+/// it and the list of dynamic libraries loaded so far.
+class RTLDeviceInfoTy {
+  // Indexed by device id. createOffloadTable appends to the device's list;
+  // the lookups below use its last element.
+  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
+
+public:
+  // Libraries opened by __tgt_rtl_load_binary; closed in the destructor.
+  std::list<DynLibTy> DynLibs;
+
+  // Record entry point associated with device: appends a table covering
+  // [begin, end) to the list of `device_id`.
+  void createOffloadTable(int32_t device_id, __tgt_offload_entry *begin,
+                          __tgt_offload_entry *end) {
+    assert(device_id >= 0 && device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    FuncGblEntries[device_id].emplace_back();
+    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
+
+    E.Table.EntriesBegin = begin;
+    E.Table.EntriesEnd = end;
+  }
+
+  // Return true if the entry is associated with device, i.e. if `addr` is the
+  // address of an entry in the device's most recently created table. Returns
+  // false when no table has been created for the device yet.
+  bool findOffloadEntry(int32_t device_id, void *addr) {
+    assert(device_id >= 0 && device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    std::list<FuncOrGblEntryTy> &Tables = FuncGblEntries[device_id];
+    // back() on an empty list is undefined, so handle that case explicitly.
+    if (Tables.empty())
+      return false;
+    FuncOrGblEntryTy &E = Tables.back();
+
+    for (__tgt_offload_entry *i = E.Table.EntriesBegin, *e = E.Table.EntriesEnd;
+         i < e; ++i) {
+      if (i->addr == addr)
+        return true;
+    }
+
+    return false;
+  }
+
+  // Return the pointer to the target entries table most recently created for
+  // the device, or NULL when none has been created yet.
+  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
+    assert(device_id >= 0 && device_id < (int32_t)FuncGblEntries.size() &&
+           "Unexpected device id!");
+    std::list<FuncOrGblEntryTy> &Tables = FuncGblEntries[device_id];
+    // back() on an empty list is undefined, so handle that case explicitly.
+    if (Tables.empty())
+      return NULL;
+
+    return &Tables.back().Table;
+  }
+
+  // Size the per-device tables for `num_devices` devices and, in debug
+  // builds, read the debug level from LIBOMPTARGET_DEBUG.
+  RTLDeviceInfoTy(int32_t num_devices) {
+#ifdef OMPTARGET_DEBUG
+    if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
+      DebugLevel = std::stoi(envStr);
+    }
+#endif // OMPTARGET_DEBUG
+
+    FuncGblEntries.resize(num_devices);
+  }
+
+  ~RTLDeviceInfoTy() {
+    // Close dynamic libraries and delete the files they were loaded from.
+    for (auto &lib : DynLibs) {
+      if (lib.Handle) {
+        dlclose(lib.Handle);
+        remove(lib.FileName);
+      }
+    }
+  }
+};
+
+// Single file-local instance holding the state shared by all the
+// __tgt_rtl_* entry points below, sized for NUMBER_OF_DEVICES devices.
+static RTLDeviceInfoTy DeviceInfo(NUMBER_OF_DEVICES);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Report whether `image` targets the ELF machine this plugin was built for
+// (TARGET_ELF_ID), as decided by elf_check_machine.
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
+#if TARGET_ELF_ID >= 1
+  return elf_check_machine(image, TARGET_ELF_ID);
+#else
+  // Without a valid ELF ID no image can be accepted.
+  return 0;
+#endif
+}
+
+// The generic plugin always reports the fixed NUMBER_OF_DEVICES.
+int32_t __tgt_rtl_number_of_devices() {
+  const int32_t DeviceCount = NUMBER_OF_DEVICES;
+  return DeviceCount;
+}
+
+// No per-device setup is performed by this plugin; always succeeds.
+int32_t __tgt_rtl_init_device(int32_t device_id) {
+  (void)device_id;
+  return OFFLOAD_SUCCESS;
+}
+
+// Load the device image `image` for device `device_id` and return its table
+// of offload entries, or NULL on failure.
+//
+// The image is inspected with libelf to find the address of the
+// OFFLOADSECTIONNAME section, written to a temporary file and loaded with
+// dlopen. The entries table is then located at the library's load address
+// plus the section address.
+__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
+                                          __tgt_device_image *image) {
+
+  DP("Dev %d: load binary from " DPxMOD " image\n", device_id,
+     DPxPTR(image->ImageStart));
+
+  assert(device_id >= 0 && device_id < NUMBER_OF_DEVICES && "bad dev id");
+
+  size_t ImageSize = (size_t)image->ImageEnd - (size_t)image->ImageStart;
+  size_t NumEntries = (size_t)(image->EntriesEnd - image->EntriesBegin);
+  DP("Expecting to have %zd entries defined.\n", NumEntries);
+
+  // Is the library version incompatible with the header file?
+  if (elf_version(EV_CURRENT) == EV_NONE) {
+    DP("Incompatible ELF library!\n");
+    return NULL;
+  }
+
+  // Obtain elf handler
+  Elf *e = elf_memory((char *)image->ImageStart, ImageSize);
+  if (!e) {
+    DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
+    return NULL;
+  }
+
+  if (elf_kind(e) != ELF_K_ELF) {
+    DP("Invalid Elf kind!\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  // Find the entries section offset
+  Elf_Scn *section = 0;
+  Elf64_Off entries_offset = 0;
+
+  size_t shstrndx;
+
+  if (elf_getshdrstrndx(e, &shstrndx)) {
+    DP("Unable to get ELF strings index!\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  while ((section = elf_nextscn(e, section))) {
+    GElf_Shdr hdr;
+    // Skip sections whose header or name cannot be read instead of using an
+    // uninitialized header or passing a null name to strcmp.
+    if (!gelf_getshdr(section, &hdr))
+      continue;
+
+    const char *section_name = elf_strptr(e, shstrndx, hdr.sh_name);
+    if (!section_name)
+      continue;
+
+    if (!strcmp(section_name, OFFLOADSECTIONNAME)) {
+      entries_offset = hdr.sh_addr;
+      break;
+    }
+  }
+
+  if (!entries_offset) {
+    DP("Entries Section Offset Not Found\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  DP("Offset of entries section is (" DPxMOD ").\n", DPxPTR(entries_offset));
+
+  // load dynamic library and get the entry points. We use the dl library
+  // to do the loading of the library, but we could do it directly to avoid the
+  // dump to the temporary file.
+  //
+  // 1) Create tmp file with the library contents.
+  // 2) Use dlopen to load the file and dlsym to retrieve the symbols.
+  char tmp_name[] = "/tmp/tmpfile_XXXXXX";
+  int tmp_fd = mkstemp(tmp_name);
+
+  if (tmp_fd == -1) {
+    elf_end(e);
+    return NULL;
+  }
+
+  FILE *ftmp = fdopen(tmp_fd, "wb");
+
+  if (!ftmp) {
+    // Do not leave the temporary file behind.
+    remove(tmp_name);
+    elf_end(e);
+    return NULL;
+  }
+
+  fwrite(image->ImageStart, ImageSize, 1, ftmp);
+  fclose(ftmp);
+
+  // tmp_name lives on this function's stack, but the DynLibTy record outlives
+  // the call (its FileName is passed to remove() when DeviceInfo is
+  // destroyed), so the record gets its own heap copy of the name.
+  char *lib_file_name = strdup(tmp_name);
+  if (!lib_file_name) {
+    remove(tmp_name);
+    elf_end(e);
+    return NULL;
+  }
+
+  DynLibTy Lib = {lib_file_name, dlopen(tmp_name, RTLD_LAZY)};
+
+  if (!Lib.Handle) {
+    DP("Target library loading error: %s\n", dlerror());
+    // The library was not recorded, so clean up its file and name here.
+    remove(tmp_name);
+    free(lib_file_name);
+    elf_end(e);
+    return NULL;
+  }
+
+  DeviceInfo.DynLibs.push_back(Lib);
+
+  struct link_map *libInfo = (struct link_map *)Lib.Handle;
+
+  // The place where the entries info is loaded is the library base address
+  // plus the offset determined from the ELF file.
+  Elf64_Addr entries_addr = libInfo->l_addr + entries_offset;
+
+  DP("Pointer to first entry to be loaded is (" DPxMOD ").\n",
+      DPxPTR(entries_addr));
+
+  // Table of pointers to all the entries in the target.
+  __tgt_offload_entry *entries_table = (__tgt_offload_entry *)entries_addr;
+
+  __tgt_offload_entry *entries_begin = &entries_table[0];
+  __tgt_offload_entry *entries_end = entries_begin + NumEntries;
+
+  if (!entries_begin) {
+    DP("Can't obtain entries begin\n");
+    elf_end(e);
+    return NULL;
+  }
+
+  DP("Entries table range is (" DPxMOD ")->(" DPxMOD ")\n",
+      DPxPTR(entries_begin), DPxPTR(entries_end));
+  DeviceInfo.createOffloadTable(device_id, entries_begin, entries_end);
+
+  elf_end(e);
+
+  return DeviceInfo.getOffloadEntriesTable(device_id);
+}
+
+// "Device" memory of this plugin is ordinary heap memory: returns the result
+// of malloc(size). `device_id` and `hst_ptr` are not used.
+void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
+  return malloc(size);
+}
+
+// Copy `size` bytes from `hst_ptr` to `tgt_ptr` with memcpy; always returns
+// OFFLOAD_SUCCESS.
+int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
+                              int64_t size) {
+  const size_t NumBytes = size;
+  memcpy(tgt_ptr, hst_ptr, NumBytes);
+  return OFFLOAD_SUCCESS;
+}
+
+// Copy `size` bytes from `tgt_ptr` back to `hst_ptr` with memcpy; always
+// returns OFFLOAD_SUCCESS.
+int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
+                                int64_t size) {
+  const size_t NumBytes = size;
+  memcpy(hst_ptr, tgt_ptr, NumBytes);
+  return OFFLOAD_SUCCESS;
+}
+
+// Release memory obtained from __tgt_rtl_data_alloc with free(); always
+// returns OFFLOAD_SUCCESS.
+int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
+  (void)device_id;
+  free(tgt_ptr);
+  return OFFLOAD_SUCCESS;
+}
+
+// Call the target entry point `tgt_entry_ptr` through libffi.
+//
+// tgt_args / tgt_offsets: `arg_num` base pointers and offsets; the entry
+//   point receives base + offset for each argument, passed as a pointer.
+// team_num, thread_limit and loop_tripcount are ignored.
+//
+// Returns OFFLOAD_SUCCESS after the call, or OFFLOAD_FAIL if the libffi call
+// interface cannot be prepared.
+int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
+    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
+    int32_t thread_limit, uint64_t loop_tripcount /*not used*/) {
+  // ignore team num and thread limit.
+
+  // Use libffi to launch execution.
+  ffi_cif cif;
+
+  // All args are references.
+  std::vector<ffi_type *> args_types(arg_num, &ffi_type_pointer);
+  std::vector<void *> args(arg_num);
+  std::vector<void *> ptrs(arg_num);
+
+  for (int32_t i = 0; i < arg_num; ++i) {
+    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
+    args[i] = &ptrs[i];
+  }
+
+  // data() is used instead of &v[0] here and below: indexing an empty vector
+  // (a region without arguments) is undefined.
+  ffi_status status = ffi_prep_cif(&cif, FFI_DEFAULT_ABI, arg_num,
+                                   &ffi_type_void, args_types.data());
+
+  assert(status == FFI_OK && "Unable to prepare target launch!");
+
+  // Still checked explicitly for builds in which assert is compiled out.
+  if (status != FFI_OK)
+    return OFFLOAD_FAIL;
+
+  DP("Running entry point at " DPxMOD "...\n", DPxPTR(tgt_entry_ptr));
+
+  // Convert the object pointer into the function pointer ffi_call expects.
+  void (*entry)(void);
+  *((void**) &entry) = tgt_entry_ptr;
+  ffi_call(&cif, entry, NULL, args.data());
+  return OFFLOAD_SUCCESS;
+}
+
+// Run a target region without a teams construct: forwards to
+// __tgt_rtl_run_target_team_region with one team, one thread and no loop
+// trip count.
+int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
+    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
+  return __tgt_rtl_run_target_team_region(
+      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
+      /*team_num=*/1, /*thread_limit=*/1, /*loop_tripcount=*/0);
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/final/libomptarget/plugins/ppc64/CMakeLists.txt b/final/libomptarget/plugins/ppc64/CMakeLists.txt
new file mode 100644
index 0000000..3915196
--- /dev/null
+++ b/final/libomptarget/plugins/ppc64/CMakeLists.txt
@@ -0,0 +1,17 @@
+##===----------------------------------------------------------------------===##
+# 
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# 
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a ppc64 machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+# The ppc64 plugin is only built when CMAKE_SYSTEM_NAME matches "Linux".
+if(NOT CMAKE_SYSTEM_NAME MATCHES "Linux")
+  libomptarget_say("Not building ppc64 offloading plugin: machine not found in the system.")
+else()
+  build_generic_elf64("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21")
+endif()
\ No newline at end of file
diff --git a/final/libomptarget/plugins/ppc64le/CMakeLists.txt b/final/libomptarget/plugins/ppc64le/CMakeLists.txt
new file mode 100644
index 0000000..0cfe7c0
--- /dev/null
+++ b/final/libomptarget/plugins/ppc64le/CMakeLists.txt
@@ -0,0 +1,17 @@
+##===----------------------------------------------------------------------===##
+# 
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# 
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a ppc64le machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+# The ppc64le plugin is only built when CMAKE_SYSTEM_NAME matches "Linux".
+if(NOT CMAKE_SYSTEM_NAME MATCHES "Linux")
+  libomptarget_say("Not building ppc64le offloading plugin: machine not found in the system.")
+else()
+  build_generic_elf64("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21")
+endif()
\ No newline at end of file
diff --git a/final/libomptarget/plugins/x86_64/CMakeLists.txt b/final/libomptarget/plugins/x86_64/CMakeLists.txt
new file mode 100644
index 0000000..f61e1e8
--- /dev/null
+++ b/final/libomptarget/plugins/x86_64/CMakeLists.txt
@@ -0,0 +1,17 @@
+##===----------------------------------------------------------------------===##
+# 
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# 
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a x86_64 machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+if(NOT CMAKE_SYSTEM_NAME MATCHES "Linux")  # plugin is only attempted on Linux
+  libomptarget_say("Not building x86_64 offloading plugin: machine not found in the system.")
+else()  # NOTE(review): "62" looks like the ELF e_machine id for x86-64 - confirm
+  build_generic_elf64("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62")
+endif()
\ No newline at end of file
diff --git a/final/libomptarget/src/CMakeLists.txt b/final/libomptarget/src/CMakeLists.txt
new file mode 100644
index 0000000..f30087e
--- /dev/null
+++ b/final/libomptarget/src/CMakeLists.txt
@@ -0,0 +1,31 @@
+##===----------------------------------------------------------------------===##
+# 
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# 
+##===----------------------------------------------------------------------===##
+#
+# Build offloading library libomptarget.so.
+#
+##===----------------------------------------------------------------------===##
+
+libomptarget_say("Building offloading runtime library libomptarget.")
+
+# Shared library target holding the target-independent offloading runtime.
+add_library(omptarget SHARED
+  api.cpp
+  device.cpp
+  interface.cpp
+  rtl.cpp
+  omptarget.cpp
+)
+
+# Link libdl and pass the `exports` version script in this directory to the linker.
+target_link_libraries(omptarget
+  ${CMAKE_DL_LIBS}
+  "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exports")
+
+# Install the library into the configured OpenMP library directory.
+install(TARGETS omptarget LIBRARY COMPONENT omptarget
+  DESTINATION "${OPENMP_INSTALL_LIBDIR}")
diff --git a/final/libomptarget/src/api.cpp b/final/libomptarget/src/api.cpp
new file mode 100644
index 0000000..430425a
--- /dev/null
+++ b/final/libomptarget/src/api.cpp
@@ -0,0 +1,282 @@
+//===----------- api.cpp - Target independent OpenMP target RTL -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of OpenMP API interface functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include <omptarget.h>
+
+#include "device.h"
+#include "private.h"
+#include "rtl.h"
+
+#include <climits>
+#include <cstring>
+#include <cstdlib>
+
+// Report how many devices are currently registered (the size of Devices).
+EXTERN int omp_get_num_devices(void) {
+  RTLsMtx.lock(); // Devices is read under the RTLs mutex
+  const size_t NumDevices = Devices.size();
+  RTLsMtx.unlock();
+
+  // %zu is the conversion for size_t (the previous %zd is for signed sizes).
+  DP("Call to omp_get_num_devices returning %zu\n", NumDevices);
+  return NumDevices; }
+
+EXTERN int omp_get_initial_device(void) { // the host is the initial device
+  DP("Call to omp_get_initial_device returning %d\n", HOST_DEVICE);
+  return HOST_DEVICE; // constant id; no device table lookup is involved
+}
+
+// Allocate `size` bytes on device `device_num`; the host is served by malloc().
+EXTERN void *omp_target_alloc(size_t size, int device_num) {
+  DP("Call to omp_target_alloc for device %d requesting %zu bytes\n",
+      device_num, size);
+
+  if (size <= 0) { // size is unsigned, so only a zero-byte request lands here
+    DP("Call to omp_target_alloc with non-positive length\n");
+    return NULL;
+  }
+
+  if (device_num == omp_get_initial_device()) {
+    void *HostPtr = malloc(size);
+    DP("omp_target_alloc returns host ptr " DPxMOD "\n", DPxPTR(HostPtr));
+    return HostPtr;
+  }
+
+  // device_is_ready() validates the id and initializes the device on demand.
+  if (!device_is_ready(device_num)) {
+    DP("omp_target_alloc returns NULL ptr\n");
+    return NULL;
+  }
+
+  DeviceTy &Dev = Devices[device_num];
+  void *TgtPtr = Dev.RTL->data_alloc(Dev.RTLDeviceID, size, NULL);
+  DP("omp_target_alloc returns device ptr " DPxMOD "\n", DPxPTR(TgtPtr));
+  return TgtPtr;
+}
+
+// Release memory on `device_num`; host pointers are handed to free().
+EXTERN void omp_target_free(void *device_ptr, int device_num) {
+  DP("Call to omp_target_free for device %d and address " DPxMOD "\n",
+      device_num, DPxPTR(device_ptr));
+
+  if (device_ptr == NULL) {
+    DP("Call to omp_target_free with NULL ptr\n");
+    return;
+  }
+
+  if (device_num == omp_get_initial_device()) {
+    free(device_ptr);
+    DP("omp_target_free deallocated host ptr\n");
+    return;
+  }
+  if (!device_is_ready(device_num)) {
+    DP("omp_target_free returns, nothing to do\n");
+    return;
+  }
+
+  DeviceTy &Dev = Devices[device_num];
+  Dev.RTL->data_delete(Dev.RTLDeviceID, device_ptr);
+  DP("omp_target_free deallocated device ptr\n");
+}
+
+// Return nonzero if non-NULL `ptr` is mapped on `device_num`; host always is.
+EXTERN int omp_target_is_present(void *ptr, int device_num) {
+  DP("Call to omp_target_is_present for device %d and address " DPxMOD "\n",
+      device_num, DPxPTR(ptr));
+
+  if (ptr == NULL) {
+    DP("Call to omp_target_is_present with NULL ptr, returning false\n");
+    return false;
+  }
+  if (device_num == omp_get_initial_device()) {
+    DP("Call to omp_target_is_present on host, returning true\n");
+    return true;
+  }
+
+  RTLsMtx.lock(); // the device table's size is read under the RTLs mutex
+  const size_t NumDevices = Devices.size();
+  RTLsMtx.unlock();
+  if ((size_t)device_num >= NumDevices) {
+    DP("Call to omp_target_is_present with invalid device ID, returning "
+        "false\n");
+    return false;
+  }
+
+  bool Unused; // IsLast out-parameter; reference counts are not updated
+  void *TgtPtr = Devices[device_num].getTgtPtrBegin(ptr, 0, Unused, false);
+  int rc = (TgtPtr != NULL);
+  DP("Call to omp_target_is_present returns %d\n", rc);
+  return rc;
+}
+
+// Copy `length` bytes from src+src_offset on src_device to dst+dst_offset on
+// dst_device; either side may be the host. Returns OFFLOAD_SUCCESS or _FAIL.
+EXTERN int omp_target_memcpy(void *dst, void *src, size_t length,
+    size_t dst_offset, size_t src_offset, int dst_device, int src_device) {
+  DP("Call to omp_target_memcpy, dst device %d, src device %d, "
+      "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
+      "src offset %zu, length %zu\n", dst_device, src_device, DPxPTR(dst),
+      DPxPTR(src), dst_offset, src_offset, length);
+
+  if (!dst || !src || length <= 0) {
+    DP("Call to omp_target_memcpy with invalid arguments\n");
+    return OFFLOAD_FAIL;
+  }
+
+  if (src_device != omp_get_initial_device() && !device_is_ready(src_device)) {
+      DP("omp_target_memcpy returns OFFLOAD_FAIL\n");
+      return OFFLOAD_FAIL;
+  }
+
+  if (dst_device != omp_get_initial_device() && !device_is_ready(dst_device)) {
+      DP("omp_target_memcpy returns OFFLOAD_FAIL\n");
+      return OFFLOAD_FAIL;
+  }
+
+  int rc = OFFLOAD_SUCCESS;
+  void *srcAddr = (char *)src + src_offset;
+  void *dstAddr = (char *)dst + dst_offset;
+
+  if (src_device == omp_get_initial_device() &&
+      dst_device == omp_get_initial_device()) {
+    DP("copy from host to host\n");
+    memcpy(dstAddr, srcAddr, length); // returns dstAddr; it reports no errors
+  } else if (src_device == omp_get_initial_device()) {
+    DP("copy from host to device\n");
+    DeviceTy& DstDev = Devices[dst_device];
+    rc = DstDev.data_submit(dstAddr, srcAddr, length);
+  } else if (dst_device == omp_get_initial_device()) {
+    DP("copy from device to host\n");
+    DeviceTy& SrcDev = Devices[src_device];
+    rc = SrcDev.data_retrieve(dstAddr, srcAddr, length);
+  } else {
+    DP("copy from device to device\n");
+    void *buffer = malloc(length); // host staging buffer; NULL means failure
+    rc = buffer ? Devices[src_device].data_retrieve(buffer, srcAddr, length)
+                : OFFLOAD_FAIL;
+    if (rc == OFFLOAD_SUCCESS)
+      rc = Devices[dst_device].data_submit(dstAddr, buffer, length);
+    free(buffer); // previously leaked on every call; free(NULL) is a no-op
+  }
+
+  DP("omp_target_memcpy returns %d\n", rc);
+  return rc;
+}
+
+// Copy a rectangular sub-volume, one outermost slice at a time (recursive).
+EXTERN int omp_target_memcpy_rect(void *dst, void *src, size_t element_size,
+    int num_dims, const size_t *volume, const size_t *dst_offsets,
+    const size_t *src_offsets, const size_t *dst_dimensions,
+    const size_t *src_dimensions, int dst_device, int src_device) {
+  DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, "
+      "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
+      "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
+      "volume " DPxMOD ", element size %zu, num_dims %d\n", dst_device,
+      src_device, DPxPTR(dst), DPxPTR(src), DPxPTR(dst_offsets),
+      DPxPTR(src_offsets), DPxPTR(dst_dimensions), DPxPTR(src_dimensions),
+      DPxPTR(volume), element_size, num_dims);
+
+  if (!(dst || src)) { // both NULL: report the supported dimension count
+    DP("Call to omp_target_memcpy_rect returns max supported dimensions %d\n",
+        INT_MAX);
+    return INT_MAX;
+  }
+
+  if (!dst || !src || element_size < 1 || num_dims < 1 || !volume ||
+      !dst_offsets || !src_offsets || !dst_dimensions || !src_dimensions) {
+    DP("Call to omp_target_memcpy_rect with invalid arguments\n");
+    return OFFLOAD_FAIL;
+  }
+
+  // Defined result even when volume[0] == 0 and the slice loop never runs.
+  int rc = OFFLOAD_SUCCESS;
+  if (num_dims == 1) {
+    rc = omp_target_memcpy(dst, src, element_size * volume[0],
+        element_size * dst_offsets[0], element_size * src_offsets[0],
+        dst_device, src_device);
+  } else {
+    size_t dst_slice_size = element_size, src_slice_size = element_size;
+    for (int i=1; i<num_dims; ++i) {
+      dst_slice_size *= dst_dimensions[i];
+      src_slice_size *= src_dimensions[i];
+    }
+
+    size_t dst_off = dst_offsets[0] * dst_slice_size;
+    size_t src_off = src_offsets[0] * src_slice_size;
+    for (size_t i=0; i<volume[0]; ++i) {
+      rc = omp_target_memcpy_rect((char *) dst + dst_off + dst_slice_size * i,
+          (char *) src + src_off + src_slice_size * i, element_size,
+          num_dims - 1, volume + 1, dst_offsets + 1, src_offsets + 1,
+          dst_dimensions + 1, src_dimensions + 1, dst_device, src_device);
+      if (rc) {
+        DP("Recursive call to omp_target_memcpy_rect returns unsuccessfully\n");
+        return rc;
+      }
+    }
+  }
+
+  DP("omp_target_memcpy_rect returns %d\n", rc);
+  return rc;
+}
+
+// Record a user-provided host/device pointer association on `device_num`.
+EXTERN int omp_target_associate_ptr(void *host_ptr, void *device_ptr,
+    size_t size, size_t device_offset, int device_num) {
+  DP("Call to omp_target_associate_ptr with host_ptr " DPxMOD ", "
+      "device_ptr " DPxMOD ", size %zu, device_offset %zu, device_num %d\n",
+      DPxPTR(host_ptr), DPxPTR(device_ptr), size, device_offset, device_num);
+
+  if (host_ptr == NULL || device_ptr == NULL || size <= 0) {
+    DP("Call to omp_target_associate_ptr with invalid arguments\n");
+    return OFFLOAD_FAIL;
+  }
+  if (device_num == omp_get_initial_device()) {
+    DP("omp_target_associate_ptr: no association possible on the host\n");
+    return OFFLOAD_FAIL;
+  }
+
+  if (!device_is_ready(device_num)) {
+    DP("omp_target_associate_ptr returns OFFLOAD_FAIL\n");
+    return OFFLOAD_FAIL;
+  }
+
+  // The association targets device_ptr advanced by device_offset bytes.
+  const uint64_t TgtAddr = (uint64_t)device_ptr + (uint64_t)device_offset;
+  int rc = Devices[device_num].associatePtr(host_ptr, (void *)TgtAddr, size);
+  DP("omp_target_associate_ptr returns %d\n", rc);
+  return rc;
+}
+
+// Remove the association previously recorded for `host_ptr` on `device_num`.
+EXTERN int omp_target_disassociate_ptr(void *host_ptr, int device_num) {
+  DP("Call to omp_target_disassociate_ptr with host_ptr " DPxMOD ", "
+      "device_num %d\n", DPxPTR(host_ptr), device_num);
+
+  if (!host_ptr) {
+    // The message now names this function (it used to say "associate").
+    DP("Call to omp_target_disassociate_ptr with invalid host_ptr\n");
+    return OFFLOAD_FAIL;
+  }
+  if (device_num == omp_get_initial_device()) {
+    DP("omp_target_disassociate_ptr: no association possible on the host\n");
+    return OFFLOAD_FAIL;
+  }
+
+  if (!device_is_ready(device_num)) {
+    DP("omp_target_disassociate_ptr returns OFFLOAD_FAIL\n");
+    return OFFLOAD_FAIL;
+  }
+
+  int rc = Devices[device_num].disassociatePtr(host_ptr);
+  DP("omp_target_disassociate_ptr returns %d\n", rc);
+  return rc;
+}
diff --git a/final/libomptarget/src/device.cpp b/final/libomptarget/src/device.cpp
new file mode 100644
index 0000000..5ecba57
--- /dev/null
+++ b/final/libomptarget/src/device.cpp
@@ -0,0 +1,367 @@
+//===--------- device.cpp - Target independent OpenMP target RTL ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Functionality for managing devices that are handled by RTL plugins.
+//
+//===----------------------------------------------------------------------===//
+
+#include "device.h"
+#include "private.h"
+#include "rtl.h"
+
+#include <cassert>
+#include <climits>
+#include <string>
+
+/// Global device table: maps an OpenMP device id (the index) to its DeviceTy.
+DevicesTy Devices;
+
+// Register a user-supplied association between host address HstPtrBegin and
+// device address TgtPtrBegin covering Size bytes. The new entry receives the
+// "infinite" reference count INF_REF_CNT.
+int DeviceTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size) {
+  const uintptr_t HstBegin = (uintptr_t)HstPtrBegin;
+  const uintptr_t HstEnd = HstBegin + Size;
+  const uintptr_t TgtBegin = (uintptr_t)TgtPtrBegin;
+
+  DataMapMtx.lock();
+
+  // An entry starting at the same host address may already be present.
+  for (auto &HT : HostDataToTargetMap) {
+    if (HT.HstPtrBegin != HstBegin)
+      continue;
+
+    // Re-association is accepted only when it describes the same mapping.
+    bool isValid = HT.HstPtrEnd == HstEnd && HT.TgtPtrBegin == TgtBegin;
+    DataMapMtx.unlock();
+    if (!isValid) {
+      DP("Not allowed to re-associate a different device ptr+offset with the "
+          "same host ptr\n");
+      return OFFLOAD_FAIL;
+    }
+    // Identical mapping requested again: nothing to change.
+    DP("Attempt to re-associate the same device ptr+offset with the same "
+        "host ptr, nothing to do\n");
+    return OFFLOAD_SUCCESS;
+  }
+
+  // No entry yet: add one whose host base equals its host begin and whose
+  // reference count is infinite.
+  HostDataToTargetTy newEntry(HstBegin, HstBegin, HstEnd, TgtBegin,
+      INF_REF_CNT);
+
+  DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", HstEnd="
+      DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(newEntry.HstPtrBase),
+      DPxPTR(newEntry.HstPtrBegin), DPxPTR(newEntry.HstPtrEnd),
+      DPxPTR(newEntry.TgtPtrBegin));
+  HostDataToTargetMap.push_front(newEntry);
+
+  DataMapMtx.unlock();
+
+  return OFFLOAD_SUCCESS;
+}
+
+// Remove the user association (infinite ref count) starting at HstPtrBegin.
+int DeviceTy::disassociatePtr(void *HstPtrBegin) {
+  const uintptr_t HstBegin = (uintptr_t)HstPtrBegin;
+  DataMapMtx.lock();
+
+  // Locate the first entry that starts exactly at the given host address.
+  auto It = HostDataToTargetMap.begin();
+  while (It != HostDataToTargetMap.end() && It->HstPtrBegin != HstBegin)
+    ++It;
+  if (It != HostDataToTargetMap.end()) {
+    // Only entries whose reference count is considered infinite are removed.
+    if (CONSIDERED_INF(It->RefCount)) {
+      DP("Association found, removing it\n");
+      HostDataToTargetMap.erase(It);
+      DataMapMtx.unlock();
+      return OFFLOAD_SUCCESS;
+    }
+    DP("Trying to disassociate a pointer which was not mapped via "
+        "omp_target_associate_ptr\n");
+  }
+
+  // Nothing was removed: no match, or the match was not removable.
+  DataMapMtx.unlock();
+  DP("Association not found\n");
+  return OFFLOAD_FAIL;
+}
+
+// Return the reference count of the map entry whose host range contains
+// HstPtrBegin, or -1 when no entry covers that address.
+long DeviceTy::getMapEntryRefCnt(void *HstPtrBegin) {
+  const uintptr_t Addr = (uintptr_t)HstPtrBegin;
+  long Result = -1;
+
+  DataMapMtx.lock();
+  for (auto &HT : HostDataToTargetMap) {
+    if (Addr < HT.HstPtrBegin || Addr >= HT.HstPtrEnd)
+      continue; // address lies outside [HstPtrBegin, HstPtrEnd)
+    DP("DeviceTy::getMapEntry: requested entry found\n");
+    Result = HT.RefCount;
+    break;
+  }
+  DataMapMtx.unlock();
+
+  if (Result < 0) {
+    DP("DeviceTy::getMapEntry: requested entry not found\n");
+  }
+  return Result;
+}
+
+// Find the first map entry overlapping [HstPtrBegin, HstPtrBegin+Size) and
+// classify the overlap in the returned flags. No lock is taken here.
+LookupResult DeviceTy::lookupMapping(void *HstPtrBegin, int64_t Size) {
+  const uintptr_t hp = (uintptr_t)HstPtrBegin;
+  LookupResult lr;
+
+  DP("Looking up mapping(HstPtrBegin=" DPxMOD ", Size=%ld)...\n", DPxPTR(hp),
+      Size);
+  for (lr.Entry = HostDataToTargetMap.begin();
+      lr.Entry != HostDataToTargetMap.end(); ++lr.Entry) {
+    const HostDataToTargetTy &HT = *lr.Entry;
+    // Entirely inside the entry's host range.
+    lr.Flags.IsContained =
+        hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd && (hp+Size) <= HT.HstPtrEnd;
+    // Starts before the entry but reaches into it.
+    lr.Flags.ExtendsBefore = hp < HT.HstPtrBegin && (hp+Size) > HT.HstPtrBegin;
+    // Starts before the entry's end and runs past that end.
+    lr.Flags.ExtendsAfter = hp < HT.HstPtrEnd && (hp+Size) > HT.HstPtrEnd;
+    if (lr.Flags.IsContained || lr.Flags.ExtendsBefore ||
+        lr.Flags.ExtendsAfter)
+      break; // first overlap wins; without one Entry finishes at end()
+  }
+
+  if (lr.Flags.ExtendsBefore) {
+    DP("WARNING: Pointer is not mapped but section extends into already "
+        "mapped data\n");
+  }
+  if (lr.Flags.ExtendsAfter) {
+    DP("WARNING: Pointer is already mapped but section extends beyond mapped "
+        "region\n");
+  }
+
+  return lr;
+}
+
+// Used by target_data_begin
+// Return the target pointer begin (where the data will be moved).
+// Allocate memory if this is the first occurrence of this mapping.
+// Increment the reference counter.
+// If NULL is returned, then either data allocation failed or the user tried
+// to do an illegal mapping. IsNew is written on every path.
+void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase,
+    int64_t Size, bool &IsNew, bool IsImplicit, bool UpdateRefCount) {
+  void *rc = NULL;
+  IsNew = false; // only the allocation branch below flips this to true
+  DataMapMtx.lock();
+  LookupResult lr = lookupMapping(HstPtrBegin, Size);
+
+  // Check if the pointer is contained.
+  if (lr.Flags.IsContained ||
+      ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && IsImplicit)) {
+    auto &HT = *lr.Entry;
+
+    if (UpdateRefCount)
+      ++HT.RefCount;
+
+    uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin);
+    DP("Mapping exists%s with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", "
+        "Size=%ld,%s RefCount=%s\n", (IsImplicit ? " (implicit)" : ""),
+        DPxPTR(HstPtrBegin), DPxPTR(tp), Size,
+        (UpdateRefCount ? " updated" : ""),
+        (CONSIDERED_INF(HT.RefCount)) ? "INF" :
+            std::to_string(HT.RefCount).c_str());
+    rc = (void *)tp;
+  } else if ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && !IsImplicit) {
+    // Explicit extension of mapped data - not allowed.
+    DP("Explicit extension of mapping is not allowed.\n");
+  } else if (Size) {
+    // If it is not contained and Size > 0 we should create a new entry for it.
+    IsNew = true;
+    uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size, HstPtrBegin);
+    DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", "
+        "HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase),
+        DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp));
+    HostDataToTargetMap.push_front(HostDataToTargetTy((uintptr_t)HstPtrBase,
+        (uintptr_t)HstPtrBegin, (uintptr_t)HstPtrBegin + Size, tp));
+    rc = (void *)tp;
+  }
+
+  DataMapMtx.unlock();
+  return rc;
+}
+
+// Used by target_data_begin, target_data_end, target_data_update and target.
+// Translate HstPtrBegin into its device address, or NULL if it is unmapped.
+// IsLast reports whether the entry's reference count is at most 1; the count
+// is decremented only when UpdateRefCount is set and it is greater than 1.
+void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
+    bool UpdateRefCount) {
+  DataMapMtx.lock();
+  LookupResult lr = lookupMapping(HstPtrBegin, Size);
+  const bool Found =
+      lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter;
+  IsLast = false;
+  void *rc = NULL;
+
+  if (Found) {
+    auto &HT = *lr.Entry;
+    IsLast = HT.RefCount <= 1;
+    if (!IsLast && UpdateRefCount)
+      --HT.RefCount;
+    uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin);
+    DP("Mapping exists with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", "
+        "Size=%ld,%s RefCount=%s\n", DPxPTR(HstPtrBegin), DPxPTR(tp), Size,
+        (UpdateRefCount ? " updated" : ""),
+        (CONSIDERED_INF(HT.RefCount)) ? "INF" :
+            std::to_string(HT.RefCount).c_str());
+    rc = (void *)tp;
+  }
+
+  DataMapMtx.unlock();
+  return rc;
+}
+
+// Return the target pointer begin (where the data will be moved), or NULL if
+// no overlapping entry exists. Lock-free variant: DataMapMtx is not taken;
+// called when loading global symbols from the fat binary.
+void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size) {
+  LookupResult lr = lookupMapping(HstPtrBegin, Size);
+  const bool Found =
+      lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter;
+  if (!Found)
+    return NULL;
+  // Apply the host-side offset within the entry to the entry's device base.
+  const uintptr_t Offset = (uintptr_t)HstPtrBegin - lr.Entry->HstPtrBegin;
+  return (void *)(lr.Entry->TgtPtrBegin + Offset);
+}
+
+// Release one reference to the entry overlapping HstPtrBegin; erase at zero.
+int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool ForceDelete) {
+  DataMapMtx.lock();
+  LookupResult lr = lookupMapping(HstPtrBegin, Size);
+  const bool Found =
+      lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter;
+  if (!Found) {
+    DP("Section to delete (hst addr " DPxMOD ") does not exist in the allocated"
+       " memory\n", DPxPTR(HstPtrBegin));
+    DataMapMtx.unlock();
+    return OFFLOAD_FAIL;
+  }
+
+  auto &HT = *lr.Entry;
+  if (ForceDelete)
+    HT.RefCount = 1; // so that the decrement below reaches zero
+  if (--HT.RefCount <= 0) { // last reference: free device memory, drop entry
+    assert(HT.RefCount == 0 && "did not expect a negative ref count");
+    DP("Deleting tgt data " DPxMOD " of size %ld\n",
+        DPxPTR(HT.TgtPtrBegin), Size);
+    RTL->data_delete(RTLDeviceID, (void *)HT.TgtPtrBegin);
+    DP("Removing%s mapping with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD
+        ", Size=%ld\n", (ForceDelete ? " (forced)" : ""),
+        DPxPTR(HT.HstPtrBegin), DPxPTR(HT.TgtPtrBegin), Size);
+    HostDataToTargetMap.erase(lr.Entry);
+  }
+  DataMapMtx.unlock();
+  return OFFLOAD_SUCCESS;
+}
+
+/// Initialize the device through its plugin; invoked only via initOnce().
+void DeviceTy::init() {
+  // init_requires is an optional plugin entry point: forward the requires
+  // flags only when the plugin provides it.
+  if (RTL->init_requires != nullptr)
+    RTL->init_requires(RTLRequiresFlags);
+  // IsInit is raised on success and left untouched on failure.
+  if (RTL->init_device(RTLDeviceID) == OFFLOAD_SUCCESS)
+    IsInit = true;
+}
+
+/// Thread-safe, one-time device initialization: init() is executed at most
+/// once for this DeviceTy no matter how many threads call this method.
+int32_t DeviceTy::initOnce() {
+  std::call_once(InitFlag, &DeviceTy::init, this);
+
+  // Once call_once returns, init() has already been attempted, either by this
+  // call or by an earlier one (possibly on another thread). IsInit records
+  // whether that attempt succeeded, so it alone determines the result here.
+  // A failed attempt is not retried: call_once will not invoke init() again,
+  // and IsInit stays false, so every later call keeps reporting OFFLOAD_FAIL.
+  if (!IsInit)
+    return OFFLOAD_FAIL;
+
+  return OFFLOAD_SUCCESS;
+}
+
+// Load binary to device; the plugin call is made while holding RTL->Mtx.
+__tgt_target_table *DeviceTy::load_binary(void *Img) {
+  RTL->Mtx.lock();
+  __tgt_target_table *Table = RTL->load_binary(RTLDeviceID, Img);
+  RTL->Mtx.unlock();
+  return Table;
+}
+
+// Submit data to device: forwards the arguments to the plugin's data_submit.
+int32_t DeviceTy::data_submit(void *TgtPtrBegin, void *HstPtrBegin,
+    int64_t Size) {
+  return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size);
+}
+
+// Retrieve data from device: forwards the arguments to the plugin's data_retrieve.
+int32_t DeviceTy::data_retrieve(void *HstPtrBegin, void *TgtPtrBegin,
+    int64_t Size) {
+  return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size);
+}
+
+// Run region on device: forwards to the plugin's run_region for RTLDeviceID.
+int32_t DeviceTy::run_region(void *TgtEntryPtr, void **TgtVarsPtr,
+    ptrdiff_t *TgtOffsets, int32_t TgtVarsSize) {
+  return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
+      TgtVarsSize);
+}
+
+// Run team region on device: forwards to the plugin's run_team_region.
+int32_t DeviceTy::run_team_region(void *TgtEntryPtr, void **TgtVarsPtr,
+    ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams,
+    int32_t ThreadLimit, uint64_t LoopTripCount) {
+  return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
+      TgtVarsSize, NumTeams, ThreadLimit, LoopTripCount);
+}
+
+/// Check whether a device has an associated RTL and initialize it if it's not
+/// already initialized.
+bool device_is_ready(int device_num) {
+  DP("Checking whether device %d is ready.\n", device_num);
+  // Devices.size() can only change while registering a new
+  // library, so try to acquire the lock of RTLs' mutex.
+  RTLsMtx.lock();
+  const size_t NumDevices = Devices.size();
+  RTLsMtx.unlock();
+  // A negative device_num becomes a huge size_t and is rejected here as well.
+  if ((size_t)device_num >= NumDevices) {
+    DP("Device ID  %d does not have a matching RTL\n", device_num);
+    return false;
+  }
+
+  DeviceTy &Dev = Devices[device_num];
+  DP("Is the device %d (local ID %d) initialized? %d\n", device_num,
+       Dev.RTLDeviceID, Dev.IsInit);
+
+  // Initialize lazily; initOnce() is skipped when IsInit is already set.
+  const bool Ready = Dev.IsInit || Dev.initOnce() == OFFLOAD_SUCCESS;
+  if (!Ready) {
+    DP("Failed to init device %d\n", device_num);
+    return false;
+  }
+
+  DP("Device %d is ready to use.\n", device_num);
+
+  return true;
+}
diff --git a/final/libomptarget/src/device.h b/final/libomptarget/src/device.h
new file mode 100644
index 0000000..ded84e3
--- /dev/null
+++ b/final/libomptarget/src/device.h
@@ -0,0 +1,172 @@
+//===----------- device.h - Target independent OpenMP target RTL ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Declarations for managing devices that are handled by RTL plugins.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OMPTARGET_DEVICE_H
+#define _OMPTARGET_DEVICE_H
+
+#include <cstddef>
+#include <climits>
+#include <list>
+#include <map>
+#include <mutex>
+#include <vector>
+
+// Forward declarations.
+struct RTLInfoTy;
+struct __tgt_bin_desc;
+struct __tgt_target_table;
+
+#define INF_REF_CNT (LONG_MAX>>1) // leave room for additions/subtractions
+#define CONSIDERED_INF(x) ((x) > (INF_REF_CNT>>1)) // x parenthesized: any expr
+
+/// Map between host data and target data.
+struct HostDataToTargetTy {
+  // Host side: base address plus the mapped range [HstPtrBegin, HstPtrEnd).
+  uintptr_t HstPtrBase;
+  uintptr_t HstPtrBegin;
+  uintptr_t HstPtrEnd; // non-inclusive.
+
+  // Target side: device address corresponding to HstPtrBegin.
+  uintptr_t TgtPtrBegin;
+
+  long RefCount; // values above the CONSIDERED_INF threshold mean "infinite"
+
+  HostDataToTargetTy()
+      : HstPtrBase(0), HstPtrBegin(0), HstPtrEnd(0), TgtPtrBegin(0),
+        RefCount(0) {}
+  // A populated entry; the reference count starts at 1 unless RF is given.
+  HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E, uintptr_t TB,
+      long RF = 1)
+      : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E), TgtPtrBegin(TB),
+        RefCount(RF) {}
+};
+
+typedef std::list<HostDataToTargetTy> HostDataToTargetListTy;
+
+struct LookupResult { // outcome of DeviceTy::lookupMapping
+  struct {
+    unsigned IsContained   : 1; // queried range lies fully inside the entry
+    unsigned ExtendsBefore : 1; // range starts before the entry, overlaps it
+    unsigned ExtendsAfter  : 1; // range overlaps the entry, runs past its end
+  } Flags;
+
+  HostDataToTargetListTy::iterator Entry; // entry the flags refer to
+
+  LookupResult() : Flags({0,0,0}), Entry() {} // starts with no flag set
+};
+
+/// Map for shadow pointers
+struct ShadowPtrValTy { // NOTE(review): field meanings below are inferred
+  void *HstPtrVal;  // presumably the pointer value as seen on the host - confirm
+  void *TgtPtrAddr; // presumably where the pointer lives on the device - confirm
+  void *TgtPtrVal;  // presumably the translated device pointer value - confirm
+};
+typedef std::map<void *, ShadowPtrValTy> ShadowPtrListTy; // keyed by void *
+
+/// Pending constructor and destructor lists, kept per library (see typedef).
+struct PendingCtorDtorListsTy {
+  std::list<void *> PendingCtors;
+  std::list<void *> PendingDtors;
+};
+typedef std::map<__tgt_bin_desc *, PendingCtorDtorListsTy>
+    PendingCtorsDtorsPerLibrary; // one list pair per __tgt_bin_desc pointer
+
+struct DeviceTy { // per-device state: plugin handle, mapping table, locks
+  int32_t DeviceID; // starts at -1 (see constructor)
+  RTLInfoTy *RTL; // plugin whose entry points drive this device
+  int32_t RTLDeviceID; // id handed to the plugin's entry points
+
+  bool IsInit; // set by init() when the plugin's init_device succeeds
+  std::once_flag InitFlag; // lets initOnce() run init() a single time
+  bool HasPendingGlobals;
+
+  HostDataToTargetListTy HostDataToTargetMap; // host -> device map entries
+  PendingCtorsDtorsPerLibrary PendingCtorsDtors;
+
+  ShadowPtrListTy ShadowPtrMap;
+
+  std::mutex DataMapMtx, PendingGlobalsMtx, ShadowMtx;
+
+  // NOTE: Once libomp gains full target-task support, this state should be
+  // moved into the target task in libomp.
+  std::map<int32_t, uint64_t> LoopTripCnt;
+
+  int64_t RTLRequiresFlags; // forwarded to the plugin's init_requires in init()
+
+  DeviceTy(RTLInfoTy *RTL)
+      : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(),
+        HasPendingGlobals(false), HostDataToTargetMap(),
+        PendingCtorsDtors(), ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(),
+        ShadowMtx(), RTLRequiresFlags(0) {}
+
+  // The existence of mutexes makes DeviceTy non-copyable. We need to
+  // provide a copy constructor and an assignment operator explicitly.
+  DeviceTy(const DeviceTy &d) // mutexes and InitFlag are created fresh
+      : DeviceID(d.DeviceID), RTL(d.RTL), RTLDeviceID(d.RTLDeviceID),
+        IsInit(d.IsInit), InitFlag(), HasPendingGlobals(d.HasPendingGlobals),
+        HostDataToTargetMap(d.HostDataToTargetMap),
+        PendingCtorsDtors(d.PendingCtorsDtors), ShadowPtrMap(d.ShadowPtrMap),
+        DataMapMtx(), PendingGlobalsMtx(),
+        ShadowMtx(), LoopTripCnt(d.LoopTripCnt),
+        RTLRequiresFlags(d.RTLRequiresFlags) {}
+
+  DeviceTy& operator=(const DeviceTy &d) { // mutexes and InitFlag are kept
+    DeviceID = d.DeviceID;
+    RTL = d.RTL;
+    RTLDeviceID = d.RTLDeviceID;
+    IsInit = d.IsInit;
+    HasPendingGlobals = d.HasPendingGlobals;
+    HostDataToTargetMap = d.HostDataToTargetMap;
+    PendingCtorsDtors = d.PendingCtorsDtors;
+    ShadowPtrMap = d.ShadowPtrMap;
+    LoopTripCnt = d.LoopTripCnt;
+    RTLRequiresFlags = d.RTLRequiresFlags;
+
+    return *this;
+  }
+
+  long getMapEntryRefCnt(void *HstPtrBegin);
+  LookupResult lookupMapping(void *HstPtrBegin, int64_t Size);
+  void *getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, int64_t Size,
+      bool &IsNew, bool IsImplicit, bool UpdateRefCount = true);
+  void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size); // takes no lock
+  void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
+      bool UpdateRefCount);
+  int deallocTgtPtr(void *TgtPtrBegin, int64_t Size, bool ForceDelete);
+  int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);
+  int disassociatePtr(void *HstPtrBegin);
+
+  // calls to RTL
+  int32_t initOnce();
+  __tgt_target_table *load_binary(void *Img);
+
+  int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size);
+  int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);
+
+  int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr,
+      ptrdiff_t *TgtOffsets, int32_t TgtVarsSize);
+  int32_t run_team_region(void *TgtEntryPtr, void **TgtVarsPtr,
+      ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams,
+      int32_t ThreadLimit, uint64_t LoopTripCount);
+
+private:
+  // Call to RTL
+  void init(); // To be called only via DeviceTy::initOnce()
+};
+
+/// Map between Device ID (i.e. openmp device id) and its DeviceTy.
+typedef std::vector<DeviceTy> DevicesTy; // the device id is the vector index
+extern DevicesTy Devices;
+
+extern bool device_is_ready(int device_num); // also initializes on demand
+
+#endif
diff --git a/final/libomptarget/src/exports b/final/libomptarget/src/exports
new file mode 100644
index 0000000..f13414e
--- /dev/null
+++ b/final/libomptarget/src/exports
@@ -0,0 +1,29 @@
+VERS1.0 {
+  global:
+    __tgt_register_requires;
+    __tgt_register_lib;
+    __tgt_unregister_lib;
+    __tgt_target_data_begin;
+    __tgt_target_data_end;
+    __tgt_target_data_update;
+    __tgt_target;
+    __tgt_target_teams;
+    __tgt_target_data_begin_nowait;
+    __tgt_target_data_end_nowait;
+    __tgt_target_data_update_nowait;
+    __tgt_target_nowait;
+    __tgt_target_teams_nowait;
+    omp_get_num_devices;
+    omp_get_initial_device;
+    omp_target_alloc;
+    omp_target_free;
+    omp_target_is_present;
+    omp_target_memcpy;
+    omp_target_memcpy_rect;
+    omp_target_associate_ptr;
+    omp_target_disassociate_ptr;
+    __kmpc_push_target_tripcount;
+  local:
+    *;
+};
+
diff --git a/final/libomptarget/src/interface.cpp b/final/libomptarget/src/interface.cpp
new file mode 100644
index 0000000..32afe3f
--- /dev/null
+++ b/final/libomptarget/src/interface.cpp
@@ -0,0 +1,325 @@
+//===-------- interface.cpp - Target independent OpenMP target RTL --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the interface to be used by Clang during the codegen of a
+// target region.
+//
+//===----------------------------------------------------------------------===//
+
+#include <omptarget.h>
+
+#include "device.h"
+#include "private.h"
+#include "rtl.h"
+
+#include <cassert>
+#include <cstdlib>
+#include <mutex>
+
+// Store target policy (disabled, mandatory, default)
+// Starts out as tgt_default and is switched to tgt_mandatory or tgt_disabled
+// by HandleDefaultTargetOffload() the first time it is queried.
+kmp_target_offload_kind_t TargetOffloadPolicy = tgt_default;
+// Held by HandleDefaultTargetOffload() while the default policy is resolved.
+std::mutex TargetOffloadMtx;
+
+////////////////////////////////////////////////////////////////////////////////
+/// Manage the success or failure of a target construct.
+
+/// Turn the `tgt_default` offload policy into a concrete one.
+///
+/// While holding TargetOffloadMtx: if the policy is still `tgt_default`, it
+/// becomes `tgt_mandatory` when omp_get_num_devices() reports at least one
+/// device and `tgt_disabled` otherwise. A policy that is no longer
+/// `tgt_default` is left untouched.
+static void HandleDefaultTargetOffload() {
+  // Released automatically on every exit path below.
+  std::lock_guard<std::mutex> PolicyGuard(TargetOffloadMtx);
+
+  // Nothing to do if the policy was already resolved.
+  if (TargetOffloadPolicy != tgt_default)
+    return;
+
+  const bool HaveDevices = omp_get_num_devices() > 0;
+  if (!HaveDevices) {
+    DP("Default TARGET OFFLOAD policy is now disabled "
+       "(no devices were found)\n");
+    TargetOffloadPolicy = tgt_disabled;
+    return;
+  }
+
+  DP("Default TARGET OFFLOAD policy is now mandatory "
+     "(devices were found)\n");
+  TargetOffloadPolicy = tgt_mandatory;
+}
+
+/// Returns non-zero when the offload policy is `tgt_disabled`.
+///
+/// A policy that is still `tgt_default` is resolved first through
+/// HandleDefaultTargetOffload().
+static int IsOffloadDisabled() {
+  const bool PolicyUnresolved = (TargetOffloadPolicy == tgt_default);
+  if (PolicyUnresolved) {
+    HandleDefaultTargetOffload();
+  }
+
+  return (TargetOffloadPolicy == tgt_disabled) ? 1 : 0;
+}
+
+/// Check the outcome of an offload request against the current policy and
+/// raise a fatal message when the two are incompatible.
+///
+/// \param success whether the offload request succeeded.
+///
+/// - `tgt_disabled`: a successful offload is an error.
+/// - `tgt_default`: always an error; the policy must have been resolved.
+/// - `tgt_mandatory`: a failed offload is an error.
+static void HandleTargetOutcome(bool success) {
+  // Read the policy once, as the original switch did.
+  const auto Policy = TargetOffloadPolicy;
+
+  if (Policy == tgt_disabled) {
+    if (success) {
+      FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled");
+    }
+  } else if (Policy == tgt_default) {
+    FATAL_MESSAGE0(1, "default offloading policy must be switched to "
+                      "mandatory or disabled");
+  } else if (Policy == tgt_mandatory) {
+    if (!success) {
+      FATAL_MESSAGE0(1, "failure of target construct while offloading is mandatory");
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// adds requires flags
+///
+/// Exported entry point that forwards \p flags unchanged to
+/// RTLs.RegisterRequires().
+EXTERN void __tgt_register_requires(int64_t flags) {
+  RTLs.RegisterRequires(flags);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// adds a target shared library to the target execution image
+///
+/// Exported entry point that forwards the binary descriptor \p desc unchanged
+/// to RTLs.RegisterLib().
+EXTERN void __tgt_register_lib(__tgt_bin_desc *desc) {
+  RTLs.RegisterLib(desc);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// unloads a target shared library
+///
+/// Exported entry point that forwards the binary descriptor \p desc unchanged
+/// to RTLs.UnregisterLib().
+EXTERN void __tgt_unregister_lib(__tgt_bin_desc *desc) {
+  RTLs.UnregisterLib(desc);
+}
+
+/// creates host-to-target data mapping, stores it in the
+/// libomptarget.so internal structure (an entry in a stack of data maps)
+/// and passes the data to the device.
+///
+/// \param device_id device to map onto; OFFLOAD_DEVICE_DEFAULT selects the
+///        value returned by omp_get_default_device().
+/// \param arg_num   number of entries in each of the four arrays below.
+/// \param args_base per-entry base addresses.
+/// \param args      per-entry begin addresses.
+/// \param arg_sizes per-entry sizes.
+/// \param arg_types per-entry map-type flags.
+///
+/// Returns silently when offloading is disabled. Any failure is reported to
+/// HandleTargetOutcome().
+EXTERN void __tgt_target_data_begin(int64_t device_id, int32_t arg_num,
+    void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
+  if (IsOffloadDisabled()) return;
+
+  DP("Entering data begin region for device %" PRId64 " with %d mappings\n",
+      device_id, arg_num);
+
+  // Substitute the default device when the caller did not choose one.
+  if (device_id == OFFLOAD_DEVICE_DEFAULT) {
+    device_id = omp_get_default_device();
+    DP("Use default device id %" PRId64 "\n", device_id);
+  }
+
+  // The device must be ready and its globals/ctors processed before mapping.
+  if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) {
+    DP("Failed to get device %" PRId64 " ready\n", device_id);
+    HandleTargetOutcome(false);
+    return;
+  }
+
+  DeviceTy& Device = Devices[device_id];
+
+#ifdef OMPTARGET_DEBUG
+  // Dump every mapping entry (debug builds only).
+  for (int i=0; i<arg_num; ++i) {
+    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
+        ", Type=0x%" PRIx64 "\n", i, DPxPTR(args_base[i]), DPxPTR(args[i]),
+        arg_sizes[i], arg_types[i]);
+  }
+#endif
+
+  int rc = target_data_begin(Device, arg_num, args_base,
+      args, arg_sizes, arg_types);
+  HandleTargetOutcome(rc == OFFLOAD_SUCCESS);
+}
+
+/// `nowait` flavour of __tgt_target_data_begin.
+///
+/// When any dependences are announced (depNum + noAliasDepNum > 0) it first
+/// calls __kmpc_omp_taskwait() for the current thread, then delegates to the
+/// synchronous entry point. The dependence lists themselves are not inspected.
+EXTERN void __tgt_target_data_begin_nowait(int64_t device_id, int32_t arg_num,
+    void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types,
+    int32_t depNum, void *depList, int32_t noAliasDepNum,
+    void *noAliasDepList) {
+  if (depNum + noAliasDepNum > 0)
+    __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL));
+
+  __tgt_target_data_begin(device_id, arg_num, args_base, args, arg_sizes,
+                          arg_types);
+}
+
+/// passes data from the target, releases target memory and destroys
+/// the host-target mapping (top entry from the stack of data maps)
+/// created by the last __tgt_target_data_begin.
+///
+/// \param device_id device to unmap from; OFFLOAD_DEVICE_DEFAULT selects the
+///        value returned by omp_get_default_device().
+/// \param arg_num   number of entries in each of the four arrays below.
+/// \param args_base per-entry base addresses.
+/// \param args      per-entry begin addresses.
+/// \param arg_sizes per-entry sizes.
+/// \param arg_types per-entry map-type flags.
+///
+/// Returns silently when offloading is disabled. An unknown or uninitialized
+/// device, or a failure in target_data_end(), is reported to
+/// HandleTargetOutcome().
+EXTERN void __tgt_target_data_end(int64_t device_id, int32_t arg_num,
+    void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
+  if (IsOffloadDisabled()) return;
+  DP("Entering data end region with %d mappings\n", arg_num);
+
+  // Substitute the default device when the caller did not choose one.
+  if (device_id == OFFLOAD_DEVICE_DEFAULT) {
+    device_id = omp_get_default_device();
+  }
+
+  // Read the size of the device table under RTLsMtx.
+  RTLsMtx.lock();
+  size_t Devices_size = Devices.size();
+  RTLsMtx.unlock();
+  // The cast to size_t makes a negative id compare as out of range too.
+  if (Devices_size <= (size_t)device_id) {
+    DP("Device ID %" PRId64 " does not have a matching RTL.\n", device_id);
+    HandleTargetOutcome(false);
+    return;
+  }
+
+  DeviceTy &Device = Devices[device_id];
+  if (!Device.IsInit) {
+    // Terminate the message with a newline like every other DP() message.
+    DP("Uninit device: ignore\n");
+    HandleTargetOutcome(false);
+    return;
+  }
+
+#ifdef OMPTARGET_DEBUG
+  // Dump every mapping entry (debug builds only).
+  for (int i=0; i<arg_num; ++i) {
+    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
+        ", Type=0x%" PRIx64 "\n", i, DPxPTR(args_base[i]), DPxPTR(args[i]),
+        arg_sizes[i], arg_types[i]);
+  }
+#endif
+
+  int rc = target_data_end(Device, arg_num, args_base,
+      args, arg_sizes, arg_types);
+  HandleTargetOutcome(rc == OFFLOAD_SUCCESS);
+}
+
+/// `nowait` flavour of __tgt_target_data_end.
+///
+/// When any dependences are announced (depNum + noAliasDepNum > 0) it first
+/// calls __kmpc_omp_taskwait() for the current thread, then delegates to the
+/// synchronous entry point. The dependence lists themselves are not inspected.
+EXTERN void __tgt_target_data_end_nowait(int64_t device_id, int32_t arg_num,
+    void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types,
+    int32_t depNum, void *depList, int32_t noAliasDepNum,
+    void *noAliasDepList) {
+  if (depNum + noAliasDepNum > 0)
+    __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL));
+
+  __tgt_target_data_end(device_id, arg_num, args_base, args, arg_sizes,
+                        arg_types);
+}
+
+/// Transfers already-mapped data between host and device by delegating to
+/// target_data_update().
+///
+/// \param device_id device to update; OFFLOAD_DEVICE_DEFAULT selects the
+///        value returned by omp_get_default_device().
+/// \param arg_num   number of entries in each of the four arrays below.
+/// \param args_base per-entry base addresses.
+/// \param args      per-entry begin addresses.
+/// \param arg_sizes per-entry sizes.
+/// \param arg_types per-entry map-type flags.
+///
+/// Returns silently when offloading is disabled. Any failure is reported to
+/// HandleTargetOutcome().
+EXTERN void __tgt_target_data_update(int64_t device_id, int32_t arg_num,
+    void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
+  if (IsOffloadDisabled()) return;
+  DP("Entering data update with %d mappings\n", arg_num);
+
+  // Substitute the default device when the caller did not choose one.
+  if (device_id == OFFLOAD_DEVICE_DEFAULT) {
+    device_id = omp_get_default_device();
+  }
+
+  // The device must be ready and its globals/ctors processed first.
+  if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) {
+    DP("Failed to get device %" PRId64 " ready\n", device_id);
+    HandleTargetOutcome(false);
+    return;
+  }
+
+  DeviceTy& Device = Devices[device_id];
+  int rc = target_data_update(Device, arg_num, args_base,
+      args, arg_sizes, arg_types);
+  HandleTargetOutcome(rc == OFFLOAD_SUCCESS);
+}
+
+/// `nowait` flavour of __tgt_target_data_update.
+///
+/// When any dependences are announced (depNum + noAliasDepNum > 0) it first
+/// calls __kmpc_omp_taskwait() for the current thread, then delegates to the
+/// synchronous entry point. The dependence lists themselves are not inspected.
+EXTERN void __tgt_target_data_update_nowait(
+    int64_t device_id, int32_t arg_num, void **args_base, void **args,
+    int64_t *arg_sizes, int64_t *arg_types, int32_t depNum, void *depList,
+    int32_t noAliasDepNum, void *noAliasDepList) {
+  if (depNum + noAliasDepNum > 0)
+    __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL));
+
+  __tgt_target_data_update(device_id, arg_num, args_base, args, arg_sizes,
+                           arg_types);
+}
+
+/// Launches the target region identified by \p host_ptr on a device.
+///
+/// \param device_id device to run on; OFFLOAD_DEVICE_DEFAULT selects the
+///        value returned by omp_get_default_device().
+/// \param host_ptr  host-side entry identifying the region.
+/// \param arg_num   number of entries in each of the four arrays below.
+/// \param args_base per-entry base addresses.
+/// \param args      per-entry begin addresses.
+/// \param arg_sizes per-entry sizes.
+/// \param arg_types per-entry map-type flags.
+/// \returns the result of target(), or OFFLOAD_FAIL when offloading is
+///          disabled or the device could not be made ready.
+EXTERN int __tgt_target(int64_t device_id, void *host_ptr, int32_t arg_num,
+    void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
+  if (IsOffloadDisabled()) return OFFLOAD_FAIL;
+  DP("Entering target region with entry point " DPxMOD " and device Id %"
+      PRId64 "\n", DPxPTR(host_ptr), device_id);
+
+  // Substitute the default device when the caller did not choose one.
+  if (device_id == OFFLOAD_DEVICE_DEFAULT) {
+    device_id = omp_get_default_device();
+  }
+
+  // The device must be ready and its globals/ctors processed first.
+  if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) {
+    DP("Failed to get device %" PRId64 " ready\n", device_id);
+    HandleTargetOutcome(false);
+    return OFFLOAD_FAIL;
+  }
+
+#ifdef OMPTARGET_DEBUG
+  // Dump every mapping entry (debug builds only).
+  for (int i=0; i<arg_num; ++i) {
+    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
+        ", Type=0x%" PRIx64 "\n", i, DPxPTR(args_base[i]), DPxPTR(args[i]),
+        arg_sizes[i], arg_types[i]);
+  }
+#endif
+
+  // Non-teams launch: the team count and thread limit slots are passed as 0.
+  int rc = target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
+      arg_types, 0, 0, false /*team*/);
+  HandleTargetOutcome(rc == OFFLOAD_SUCCESS);
+  return rc;
+}
+
+/// `nowait` flavour of __tgt_target.
+///
+/// When any dependences are announced (depNum + noAliasDepNum > 0) it first
+/// calls __kmpc_omp_taskwait() for the current thread, then delegates to the
+/// synchronous entry point and returns its result. The dependence lists
+/// themselves are not inspected.
+EXTERN int __tgt_target_nowait(int64_t device_id, void *host_ptr,
+    int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes,
+    int64_t *arg_types, int32_t depNum, void *depList, int32_t noAliasDepNum,
+    void *noAliasDepList) {
+  if (depNum + noAliasDepNum > 0)
+    __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL));
+
+  return __tgt_target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
+                      arg_types);
+}
+
+/// Launches the target teams region identified by \p host_ptr on a device.
+///
+/// \param device_id    device to run on; OFFLOAD_DEVICE_DEFAULT selects the
+///        value returned by omp_get_default_device().
+/// \param host_ptr     host-side entry identifying the region.
+/// \param arg_num      number of entries in each of the four arrays below.
+/// \param args_base    per-entry base addresses.
+/// \param args         per-entry begin addresses.
+/// \param arg_sizes    per-entry sizes.
+/// \param arg_types    per-entry map-type flags.
+/// \param team_num     forwarded to target() as the team count.
+/// \param thread_limit forwarded to target() as the thread limit.
+/// \returns the result of target(), or OFFLOAD_FAIL when offloading is
+///          disabled or the device could not be made ready.
+EXTERN int __tgt_target_teams(int64_t device_id, void *host_ptr,
+    int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes,
+    int64_t *arg_types, int32_t team_num, int32_t thread_limit) {
+  if (IsOffloadDisabled()) return OFFLOAD_FAIL;
+  DP("Entering target region with entry point " DPxMOD " and device Id %"
+      PRId64 "\n", DPxPTR(host_ptr), device_id);
+
+  // Substitute the default device when the caller did not choose one.
+  if (device_id == OFFLOAD_DEVICE_DEFAULT) {
+    device_id = omp_get_default_device();
+  }
+
+  // The device must be ready and its globals/ctors processed first.
+  if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) {
+    DP("Failed to get device %" PRId64 " ready\n", device_id);
+    HandleTargetOutcome(false);
+    return OFFLOAD_FAIL;
+  }
+
+#ifdef OMPTARGET_DEBUG
+  // Dump every mapping entry (debug builds only).
+  for (int i=0; i<arg_num; ++i) {
+    DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
+        ", Type=0x%" PRIx64 "\n", i, DPxPTR(args_base[i]), DPxPTR(args[i]),
+        arg_sizes[i], arg_types[i]);
+  }
+#endif
+
+  int rc = target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
+      arg_types, team_num, thread_limit, true /*team*/);
+  HandleTargetOutcome(rc == OFFLOAD_SUCCESS);
+
+  return rc;
+}
+
+/// `nowait` flavour of __tgt_target_teams.
+///
+/// When any dependences are announced (depNum + noAliasDepNum > 0) it first
+/// calls __kmpc_omp_taskwait() for the current thread, then delegates to the
+/// synchronous entry point and returns its result. The dependence lists
+/// themselves are not inspected.
+EXTERN int __tgt_target_teams_nowait(int64_t device_id, void *host_ptr,
+    int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes,
+    int64_t *arg_types, int32_t team_num, int32_t thread_limit, int32_t depNum,
+    void *depList, int32_t noAliasDepNum, void *noAliasDepList) {
+  if (depNum + noAliasDepNum > 0)
+    __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL));
+
+  return __tgt_target_teams(device_id, host_ptr, arg_num, args_base, args,
+                            arg_sizes, arg_types, team_num, thread_limit);
+}
+
+/// Records the loop trip count announced by the calling thread for a device.
+///
+/// \param device_id      device the trip count is meant for;
+///        OFFLOAD_DEVICE_DEFAULT selects the value returned by
+///        omp_get_default_device().
+/// \param loop_tripcount trip count to store.
+///
+/// The value is stored in the device's LoopTripCnt map, keyed by the global
+/// thread number of the caller, while TblMapMtx is held. Returns silently when
+/// offloading is disabled. A device that cannot be made ready is reported to
+/// HandleTargetOutcome().
+EXTERN void __kmpc_push_target_tripcount(int64_t device_id,
+    uint64_t loop_tripcount) {
+  // Like every other entry point, do nothing when offloading is disabled.
+  // This also resolves a still-default policy, so the HandleTargetOutcome()
+  // call below can no longer abort with the "default offloading policy must
+  // be switched" fatal error.
+  if (IsOffloadDisabled()) return;
+
+  // Substitute the default device when the caller did not choose one.
+  if (device_id == OFFLOAD_DEVICE_DEFAULT) {
+    device_id = omp_get_default_device();
+  }
+
+  if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) {
+    DP("Failed to get device %" PRId64 " ready\n", device_id);
+    HandleTargetOutcome(false);
+    return;
+  }
+
+  DP("__kmpc_push_target_tripcount(%" PRId64 ", %" PRIu64 ")\n", device_id,
+      loop_tripcount);
+  TblMapMtx.lock();
+  Devices[device_id].LoopTripCnt.emplace(__kmpc_global_thread_num(NULL),
+                                         loop_tripcount);
+  TblMapMtx.unlock();
+}
diff --git a/final/libomptarget/src/omptarget.cpp b/final/libomptarget/src/omptarget.cpp
new file mode 100644
index 0000000..c41bf31
--- /dev/null
+++ b/final/libomptarget/src/omptarget.cpp
@@ -0,0 +1,777 @@
+//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the interface to be used by Clang during the codegen of a
+// target region.
+//
+//===----------------------------------------------------------------------===//
+
+#include <omptarget.h>
+
+#include "device.h"
+#include "private.h"
+#include "rtl.h"
+
+#include <cassert>
+#include <vector>
+
+#ifdef OMPTARGET_DEBUG
+// Debug verbosity level; only present in OMPTARGET_DEBUG builds and 0 until
+// something assigns it.
+int DebugLevel = 0;
+#endif // OMPTARGET_DEBUG
+
+
+
+/* All begin addresses for partially mapped structs must be 8-aligned in order
+ * to ensure proper alignment of members. E.g.
+ *
+ * struct S {
+ *   int a;   // 4-aligned
+ *   int b;   // 4-aligned
+ *   int *p;  // 8-aligned
+ * } s1;
+ * ...
+ * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
+ * {
+ *   s1.b = 5;
+ *   for (int i...) s1.p[i] = ...;
+ * }
+ *
+ * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
+ * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
+ * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
+ * requirements for its type. Now, when we allocate memory on the device, in
+ * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
+ * This means that the chunk of the struct on the device will start at a
+ * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
+ * address of p will be a misaligned 0x204 (on the host there was no need to add
+ * padding between b and p, so p comes exactly 4 bytes after b). If the device
+ * kernel tries to access s1.p, a misaligned address error occurs (as reported
+ * by the CUDA plugin). By padding the begin address down to a multiple of 8 and
+ * extending the size of the allocated chunk accordingly, the chunk on the
+ * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
+ * &s1.p=0x208, as they should be to satisfy the alignment requirements.
+ */
+static const int64_t alignment = 8;
+
+/// Map global data and execute pending ctors
+///
+/// For every registered translation table that has no target table for this
+/// device yet, loads the device image, checks that host and device entry
+/// tables have the same length and records a host->device mapping for every
+/// entry with a non-zero size. Afterwards runs all pending constructors
+/// recorded for the device and clears Device.HasPendingGlobals.
+///
+/// Locking: Device.PendingGlobalsMtx is held for the whole call, TrlTblMtx
+/// while the translation tables are walked and Device.DataMapMtx while the
+/// entries of one table are mapped.
+///
+/// \returns OFFLOAD_SUCCESS, or OFFLOAD_FAIL if an image is missing, cannot be
+///          loaded, does not match the host table, or a constructor fails.
+static int InitLibrary(DeviceTy& Device) {
+  /*
+   * Map global data
+   */
+  int32_t device_id = Device.DeviceID;
+  int rc = OFFLOAD_SUCCESS;
+
+  Device.PendingGlobalsMtx.lock();
+  TrlTblMtx.lock();
+  for (HostEntriesBeginToTransTableTy::iterator
+      ii = HostEntriesBeginToTransTable.begin();
+      ii != HostEntriesBeginToTransTable.end(); ++ii) {
+    TranslationTable *TransTable = &ii->second;
+    if (TransTable->TargetsTable[device_id] != 0) {
+      // Library entries have already been processed
+      continue;
+    }
+
+    // 1) get image.
+    assert(TransTable->TargetsImages.size() > (size_t)device_id &&
+           "Not expecting a device ID outside the table's bounds!");
+    __tgt_device_image *img = TransTable->TargetsImages[device_id];
+    if (!img) {
+      DP("No image loaded for device id %d.\n", device_id);
+      rc = OFFLOAD_FAIL;
+      break;
+    }
+    // 2) load image into the target table.
+    __tgt_target_table *TargetTable =
+        TransTable->TargetsTable[device_id] = Device.load_binary(img);
+    // Unable to get table for this image: invalidate image and fail.
+    if (!TargetTable) {
+      DP("Unable to generate entries table for device id %d.\n", device_id);
+      TransTable->TargetsImages[device_id] = 0;
+      rc = OFFLOAD_FAIL;
+      break;
+    }
+
+    // Verify whether the two table sizes match.
+    size_t hsize =
+        TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
+    size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;
+
+    // Invalid image for these host entries!
+    if (hsize != tsize) {
+      DP("Host and Target tables mismatch for device id %d [%zx != %zx].\n",
+         device_id, hsize, tsize);
+      TransTable->TargetsImages[device_id] = 0;
+      TransTable->TargetsTable[device_id] = 0;
+      rc = OFFLOAD_FAIL;
+      break;
+    }
+
+    // process global data that needs to be mapped.
+    // Host and device entries are walked in lockstep; the size check above
+    // guarantees both tables have the same number of entries.
+    Device.DataMapMtx.lock();
+    __tgt_target_table *HostTable = &TransTable->HostTable;
+    for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
+                             *CurrHostEntry = HostTable->EntriesBegin,
+                             *EntryDeviceEnd = TargetTable->EntriesEnd;
+         CurrDeviceEntry != EntryDeviceEnd;
+         CurrDeviceEntry++, CurrHostEntry++) {
+      if (CurrDeviceEntry->size != 0) {
+        // has data.
+        assert(CurrDeviceEntry->size == CurrHostEntry->size &&
+               "data size mismatch");
+
+        // Fortran may use multiple weak declarations for the same symbol,
+        // therefore we must allow for multiple weak symbols to be loaded from
+        // the fat binary. Treat these mappings as any other "regular" mapping.
+        // Add entry to map.
+        // Skip host addresses that already have a device mapping.
+        if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size))
+          continue;
+        DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu"
+            "\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
+            CurrDeviceEntry->size);
+        // The new entry is created with reference count INF_REF_CNT.
+        Device.HostDataToTargetMap.push_front(HostDataToTargetTy(
+            (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
+            (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
+            (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/,
+            (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/,
+            INF_REF_CNT /*RefCount*/));
+      }
+    }
+    Device.DataMapMtx.unlock();
+  }
+  TrlTblMtx.unlock();
+
+  // A `break` above left rc at OFFLOAD_FAIL: release the remaining lock and
+  // give up before running any constructor.
+  if (rc != OFFLOAD_SUCCESS) {
+    Device.PendingGlobalsMtx.unlock();
+    return rc;
+  }
+
+  /*
+   * Run ctors for static objects
+   */
+  if (!Device.PendingCtorsDtors.empty()) {
+    // Call all ctors for all libraries registered so far
+    for (auto &lib : Device.PendingCtorsDtors) {
+      if (!lib.second.PendingCtors.empty()) {
+        DP("Has pending ctors... call now\n");
+        for (auto &entry : lib.second.PendingCtors) {
+          void *ctor = entry;
+          // Each ctor is launched through target() with no arguments, one
+          // team and a thread limit of one. Note that this `rc` shadows the
+          // function-level one.
+          int rc = target(device_id, ctor, 0, NULL, NULL, NULL,
+                          NULL, 1, 1, true /*team*/);
+          if (rc != OFFLOAD_SUCCESS) {
+            DP("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
+            Device.PendingGlobalsMtx.unlock();
+            return OFFLOAD_FAIL;
+          }
+        }
+        // Clear the list to indicate that this device has been used
+        lib.second.PendingCtors.clear();
+        DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
+      }
+    }
+  }
+  Device.HasPendingGlobals = false;
+  Device.PendingGlobalsMtx.unlock();
+
+  return OFFLOAD_SUCCESS;
+}
+
+// Make sure a device is usable: it must be ready, and any global data and
+// global ctors still pending for it are processed through InitLibrary().
+// Returns OFFLOAD_SUCCESS when the device can be used, OFFLOAD_FAIL otherwise.
+int CheckDeviceAndCtors(int64_t device_id) {
+  // Bail out early on a device that is not ready.
+  if (!device_is_ready(device_id)) {
+    DP("Device %" PRId64 " is not ready.\n", device_id);
+    return OFFLOAD_FAIL;
+  }
+
+  DeviceTy &Device = Devices[device_id];
+
+  // Take a snapshot of the pending-globals flag under its mutex.
+  Device.PendingGlobalsMtx.lock();
+  const bool GlobalsPending = Device.HasPendingGlobals;
+  Device.PendingGlobalsMtx.unlock();
+
+  // Nothing pending: the device is good to go.
+  if (!GlobalsPending)
+    return OFFLOAD_SUCCESS;
+
+  // Map globals and run ctors now.
+  if (InitLibrary(Device) == OFFLOAD_SUCCESS)
+    return OFFLOAD_SUCCESS;
+
+  DP("Failed to init globals on device %" PRId64 "\n", device_id);
+  return OFFLOAD_FAIL;
+}
+
+/// Decodes the MEMBER_OF field of a map-type value.
+///
+/// The bits selected by OMP_TGT_MAPTYPE_MEMBER_OF are shifted down by 48 and
+/// decremented by one. Callers compare the result with an argument index and
+/// treat a negative result as "not a member of any other entry".
+static int32_t member_of(int64_t type) {
+  const auto ParentField = (type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48;
+  return ParentField - 1;
+}
+
+/// Internal function to do the mapping and transfer the data to the device
+///
+/// Entries are processed in order. For each one that is neither LITERAL nor
+/// PRIVATE the function obtains (or allocates) device storage, optionally
+/// copies the host data to it, and for PTR_AND_OBJ entries also maps the
+/// pointer itself and records a shadow copy of it.
+///
+/// \param Device    device to map onto.
+/// \param arg_num   number of entries in each of the four arrays below.
+/// \param args_base per-entry base addresses; an entry flagged RETURN_PARAM is
+///                  overwritten with the corresponding device base pointer.
+/// \param args      per-entry begin addresses.
+/// \param arg_sizes per-entry sizes.
+/// \param arg_types per-entry map-type flags.
+/// \returns OFFLOAD_SUCCESS, or OFFLOAD_FAIL as soon as an allocation or a
+///          transfer fails.
+int target_data_begin(DeviceTy &Device, int32_t arg_num,
+    void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
+  // process each input.
+  for (int32_t i = 0; i < arg_num; ++i) {
+    // Ignore private variables and arrays - there is no mapping for them.
+    if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
+        (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
+      continue;
+
+    void *HstPtrBegin = args[i];
+    void *HstPtrBase = args_base[i];
+    int64_t data_size = arg_sizes[i];
+
+    // Adjust for proper alignment if this is a combined entry (for structs).
+    // Look at the next argument - if that is MEMBER_OF this one, then this one
+    // is a combined entry.
+    int64_t padding = 0;
+    const int next_i = i+1;
+    if (member_of(arg_types[i]) < 0 && next_i < arg_num &&
+        member_of(arg_types[next_i]) == i) {
+      padding = (int64_t)HstPtrBegin % alignment;
+      if (padding) {
+        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
+            "\n", padding, DPxPTR(HstPtrBegin));
+        HstPtrBegin = (char *) HstPtrBegin - padding;
+        data_size += padding;
+      }
+    }
+
+    // Address of pointer on the host and device, respectively.
+    void *Pointer_HstPtrBegin, *Pointer_TgtPtrBegin;
+    bool IsNew, Pointer_IsNew;
+    bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
+    // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
+    // have reached this point via __tgt_target_data_begin and not __tgt_target
+    // then no argument is marked as TARGET_PARAM ("omp target data map" is not
+    // associated with a target region, so there are no target parameters). This
+    // may be considered a hack, we could revise the scheme in the future.
+    bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF);
+    if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
+      DP("Has a pointer entry: \n");
+      // base is address of pointer.
+      Pointer_TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBase, HstPtrBase,
+          sizeof(void *), Pointer_IsNew, IsImplicit, UpdateRef);
+      if (!Pointer_TgtPtrBegin) {
+        DP("Call to getOrAllocTgtPtr returned null pointer (device failure or "
+            "illegal mapping).\n");
+        return OFFLOAD_FAIL;
+      }
+      DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
+          "\n", sizeof(void *), DPxPTR(Pointer_TgtPtrBegin),
+          (Pointer_IsNew ? "" : " not"));
+      Pointer_HstPtrBegin = HstPtrBase;
+      // modify current entry.
+      HstPtrBase = *(void **)HstPtrBase;
+      UpdateRef = true; // subsequently update ref count of pointee
+    }
+
+    void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase,
+        data_size, IsNew, IsImplicit, UpdateRef);
+    if (!TgtPtrBegin && data_size) {
+      // If data_size==0, then the argument could be a zero-length pointer to
+      // NULL, so getOrAlloc() returning NULL is not an error.
+      DP("Call to getOrAllocTgtPtr returned null pointer (device failure or "
+          "illegal mapping).\n");
+      // Fail here, exactly like the pointer entry above, instead of carrying
+      // on and handing a NULL device address to the transfers below.
+      return OFFLOAD_FAIL;
+    }
+    DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
+        " - is%s new\n", data_size, DPxPTR(TgtPtrBegin),
+        (IsNew ? "" : " not"));
+
+    if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
+      // Hand the device base pointer back to the caller through args_base.
+      uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
+      void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
+      DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
+      args_base[i] = TgtPtrBase;
+    }
+
+    if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
+      // Copy when the mapping is new, when ALWAYS is set, or when this is a
+      // struct member whose parent currently has a reference count of 1.
+      bool copy = false;
+      if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) {
+        copy = true;
+      } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) {
+        // Copy data only if the "parent" struct has RefCount==1.
+        int32_t parent_idx = member_of(arg_types[i]);
+        long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
+        assert(parent_rc > 0 && "parent struct not found");
+        if (parent_rc == 1) {
+          copy = true;
+        }
+      }
+
+      if (copy) {
+        DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
+            data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
+        int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size);
+        if (rt != OFFLOAD_SUCCESS) {
+          DP("Copying data to device failed.\n");
+          return OFFLOAD_FAIL;
+        }
+      }
+    }
+
+    if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
+      // Make the device copy of the pointer refer to the device pointee.
+      DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
+          DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin));
+      uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
+      void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
+      int rt = Device.data_submit(Pointer_TgtPtrBegin, &TgtPtrBase,
+          sizeof(void *));
+      if (rt != OFFLOAD_SUCCESS) {
+        DP("Copying data to device failed.\n");
+        return OFFLOAD_FAIL;
+      }
+      // create shadow pointers for this entry
+      Device.ShadowMtx.lock();
+      Device.ShadowPtrMap[Pointer_HstPtrBegin] = {HstPtrBase,
+          Pointer_TgtPtrBegin, TgtPtrBase};
+      Device.ShadowMtx.unlock();
+    }
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+
+/// Internal function to undo the mapping and retrieve the data from the device.
+///
+/// Entries are processed from last to first. For each one that is neither
+/// LITERAL nor PRIVATE the function looks up the device address, optionally
+/// copies the data back to the host, restores host pointer values from their
+/// shadow copies and, where appropriate, deallocates the device storage.
+///
+/// \param Device    device to unmap from.
+/// \param arg_num   number of entries in each of the arrays below.
+/// \param args_base per-entry base addresses (not read by this function).
+/// \param args      per-entry begin addresses.
+/// \param arg_sizes per-entry sizes.
+/// \param arg_types per-entry map-type flags.
+/// \returns OFFLOAD_SUCCESS, or OFFLOAD_FAIL as soon as a transfer or a
+///          deallocation fails.
+int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base,
+    void **args, int64_t *arg_sizes, int64_t *arg_types) {
+  // process each input.
+  for (int32_t i = arg_num - 1; i >= 0; --i) {
+    // Ignore private variables and arrays - there is no mapping for them.
+    // Also, ignore the use_device_ptr directive, it has no effect here.
+    if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
+        (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
+      continue;
+
+    void *HstPtrBegin = args[i];
+    int64_t data_size = arg_sizes[i];
+    // Adjust for proper alignment if this is a combined entry (for structs).
+    // Look at the next argument - if that is MEMBER_OF this one, then this one
+    // is a combined entry.
+    int64_t padding = 0;
+    const int next_i = i+1;
+    if (member_of(arg_types[i]) < 0 && next_i < arg_num &&
+        member_of(arg_types[next_i]) == i) {
+      padding = (int64_t)HstPtrBegin % alignment;
+      if (padding) {
+        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
+            "\n", padding, DPxPTR(HstPtrBegin));
+        HstPtrBegin = (char *) HstPtrBegin - padding;
+        data_size += padding;
+      }
+    }
+
+    bool IsLast;
+    // Plain struct members (MEMBER_OF without PTR_AND_OBJ) do not update the
+    // reference count.
+    bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
+        (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ);
+    bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE;
+
+    // If PTR_AND_OBJ, HstPtrBegin is address of pointee
+    void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, data_size, IsLast,
+        UpdateRef);
+    DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
+        " - is%s last\n", data_size, DPxPTR(TgtPtrBegin),
+        (IsLast ? "" : " not"));
+
+    bool DelEntry = IsLast || ForceDelete;
+
+    if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
+        !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
+      DelEntry = false; // protect parent struct from being deallocated
+    }
+
+    if ((arg_types[i] & OMP_TGT_MAPTYPE_FROM) || DelEntry) {
+      // Move data back to the host
+      if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
+        bool Always = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS;
+        bool CopyMember = false;
+        if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
+            !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
+          // Copy data only if the "parent" struct has RefCount==1.
+          int32_t parent_idx = member_of(arg_types[i]);
+          long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
+          assert(parent_rc > 0 && "parent struct not found");
+          if (parent_rc == 1) {
+            CopyMember = true;
+          }
+        }
+
+        // Copy back when the entry is going away, when ALWAYS is set, or for
+        // a member whose parent has a reference count of 1.
+        if (DelEntry || Always || CopyMember) {
+          DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
+              data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
+          int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size);
+          if (rt != OFFLOAD_SUCCESS) {
+            DP("Copying data from device failed.\n");
+            return OFFLOAD_FAIL;
+          }
+        }
+      }
+
+      // If we copied back to the host a struct/array containing pointers, we
+      // need to restore the original host pointer values from their shadow
+      // copies. If the struct is going to be deallocated, remove any remaining
+      // shadow pointer entries for this struct.
+      // [lb, ub) is the host address range covered by this entry.
+      uintptr_t lb = (uintptr_t) HstPtrBegin;
+      uintptr_t ub = (uintptr_t) HstPtrBegin + data_size;
+      Device.ShadowMtx.lock();
+      // No increment in the loop header: erase() below yields the next
+      // iterator, every other path advances explicitly.
+      for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
+           it != Device.ShadowPtrMap.end();) {
+        void **ShadowHstPtrAddr = (void**) it->first;
+
+        // An STL map is sorted on its keys; use this property
+        // to quickly determine when to break out of the loop.
+        if ((uintptr_t) ShadowHstPtrAddr < lb) {
+          ++it;
+          continue;
+        }
+        if ((uintptr_t) ShadowHstPtrAddr >= ub)
+          break;
+
+        // If we copied the struct to the host, we need to restore the pointer.
+        if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
+          DP("Restoring original host pointer value " DPxMOD " for host "
+              "pointer " DPxMOD "\n", DPxPTR(it->second.HstPtrVal),
+              DPxPTR(ShadowHstPtrAddr));
+          *ShadowHstPtrAddr = it->second.HstPtrVal;
+        }
+        // If the struct is to be deallocated, remove the shadow entry.
+        if (DelEntry) {
+          DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr));
+          it = Device.ShadowPtrMap.erase(it);
+        } else {
+          ++it;
+        }
+      }
+      Device.ShadowMtx.unlock();
+
+      // Deallocate map
+      if (DelEntry) {
+        int rt = Device.deallocTgtPtr(HstPtrBegin, data_size, ForceDelete);
+        if (rt != OFFLOAD_SUCCESS) {
+          DP("Deallocating data from device failed.\n");
+          return OFFLOAD_FAIL;
+        }
+      }
+    }
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+
+/// Internal function to pass data to/from the target.
+///
+/// For every entry that is neither LITERAL nor PRIVATE and that has a device
+/// mapping: FROM copies device -> host and then restores host pointer values
+/// from their shadow copies; TO copies host -> device and then rewrites the
+/// device pointer values from their shadow copies. Entries without a mapping
+/// are skipped.
+///
+/// \param Device    device to exchange data with.
+/// \param arg_num   number of entries in each of the arrays below.
+/// \param args_base per-entry base addresses (not read by this function).
+/// \param args      per-entry begin addresses.
+/// \param arg_sizes per-entry sizes.
+/// \param arg_types per-entry map-type flags.
+/// \returns OFFLOAD_SUCCESS, or OFFLOAD_FAIL as soon as a transfer fails.
+int target_data_update(DeviceTy &Device, int32_t arg_num,
+    void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
+  // process each input.
+  for (int32_t i = 0; i < arg_num; ++i) {
+    if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
+        (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
+      continue;
+
+    void *HstPtrBegin = args[i];
+    int64_t MapSize = arg_sizes[i];
+    bool IsLast;
+    // Look the mapping up without touching its reference count (the last
+    // argument, UpdateRef in target_data_end, is false here).
+    void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, MapSize, IsLast,
+        false);
+    if (!TgtPtrBegin) {
+      DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
+      continue;
+    }
+
+    if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
+      DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
+          arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
+      int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize);
+      if (rt != OFFLOAD_SUCCESS) {
+        DP("Copying data from device failed.\n");
+        return OFFLOAD_FAIL;
+      }
+
+      // The copy overwrote host pointers inside [lb, ub) with device values;
+      // put the original host values back from the shadow map.
+      uintptr_t lb = (uintptr_t) HstPtrBegin;
+      uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize;
+      Device.ShadowMtx.lock();
+      for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
+          it != Device.ShadowPtrMap.end(); ++it) {
+        void **ShadowHstPtrAddr = (void**) it->first;
+        if ((uintptr_t) ShadowHstPtrAddr < lb)
+          continue;
+        if ((uintptr_t) ShadowHstPtrAddr >= ub)
+          break;
+        DP("Restoring original host pointer value " DPxMOD " for host pointer "
+            DPxMOD "\n", DPxPTR(it->second.HstPtrVal),
+            DPxPTR(ShadowHstPtrAddr));
+        *ShadowHstPtrAddr = it->second.HstPtrVal;
+      }
+      Device.ShadowMtx.unlock();
+    }
+
+    if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
+      DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
+          arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
+      int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize);
+      if (rt != OFFLOAD_SUCCESS) {
+        DP("Copying data to device failed.\n");
+        return OFFLOAD_FAIL;
+      }
+      // The copy overwrote device pointers inside the range with host
+      // values; resubmit the recorded device values from the shadow map.
+      uintptr_t lb = (uintptr_t) HstPtrBegin;
+      uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize;
+      Device.ShadowMtx.lock();
+      for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
+          it != Device.ShadowPtrMap.end(); ++it) {
+        void **ShadowHstPtrAddr = (void**) it->first;
+        if ((uintptr_t) ShadowHstPtrAddr < lb)
+          continue;
+        if ((uintptr_t) ShadowHstPtrAddr >= ub)
+          break;
+        DP("Restoring original target pointer value " DPxMOD " for target "
+            "pointer " DPxMOD "\n", DPxPTR(it->second.TgtPtrVal),
+            DPxPTR(it->second.TgtPtrAddr));
+        rt = Device.data_submit(it->second.TgtPtrAddr,
+            &it->second.TgtPtrVal, sizeof(void *));
+        if (rt != OFFLOAD_SUCCESS) {
+          DP("Copying data to device failed.\n");
+          // Release the shadow-map lock before bailing out.
+          Device.ShadowMtx.unlock();
+          return OFFLOAD_FAIL;
+        }
+      }
+      Device.ShadowMtx.unlock();
+    }
+  }
+  return OFFLOAD_SUCCESS;
+}
+
+// Map-type bits that must all be set for an entry to count as a lambda mapping.
+static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
+    OMP_TGT_MAPTYPE_LITERAL | OMP_TGT_MAPTYPE_IMPLICIT;
+static bool isLambdaMapping(int64_t Mapping) {
+  return (~Mapping & LambdaMapping) == 0; // no required bit is missing.
+}
+
+/// Runs the offload entry registered for \p host_ptr on device \p device_id.
+/// Maps the arguments as target_data_begin does, builds the kernel argument
+/// and offset lists, launches the region (run_team_region when
+/// \p IsTeamConstruct is non-zero, run_region otherwise), frees the
+/// (first-)private arrays and finally calls target_data_end. Returns
+/// OFFLOAD_SUCCESS, or OFFLOAD_FAIL as soon as any step fails.
+int target(int64_t device_id, void *host_ptr, int32_t arg_num,
+    void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types,
+    int32_t team_num, int32_t thread_limit, int IsTeamConstruct) {
+  DeviceTy &Device = Devices[device_id];
+
+  // Find the table information in the map or look it up in the translation
+  // tables.
+  TableMap *TM = 0;
+  TblMapMtx.lock();
+  HostPtrToTableMapTy::iterator TableMapIt = HostPtrToTableMap.find(host_ptr);
+  if (TableMapIt == HostPtrToTableMap.end()) {
+    // We don't have a map. So search all the registered libraries.
+    TrlTblMtx.lock();
+    for (HostEntriesBeginToTransTableTy::iterator
+             ii = HostEntriesBeginToTransTable.begin(),
+             ie = HostEntriesBeginToTransTable.end();
+         !TM && ii != ie; ++ii) {
+      // get the translation table (which contains all the good info).
+      TranslationTable *TransTable = &ii->second;
+      // iterate over all the host table entries to see if we can locate the
+      // host_ptr.
+      __tgt_offload_entry *begin = TransTable->HostTable.EntriesBegin;
+      __tgt_offload_entry *end = TransTable->HostTable.EntriesEnd;
+      __tgt_offload_entry *cur = begin;
+      for (uint32_t i = 0; cur < end; ++cur, ++i) {
+        if (cur->addr != host_ptr)
+          continue;
+        // we got a match, now fill the HostPtrToTableMap so that we
+        // may avoid this search next time.
+        TM = &HostPtrToTableMap[host_ptr];
+        TM->Table = TransTable;
+        TM->Index = i;
+        break;
+      }
+    }
+    TrlTblMtx.unlock();
+  } else {
+    TM = &TableMapIt->second;
+  }
+  TblMapMtx.unlock();
+
+  // No map for this host pointer found!
+  if (!TM) {
+    DP("Host ptr " DPxMOD " does not have a matching target pointer.\n",
+       DPxPTR(host_ptr));
+    return OFFLOAD_FAIL;
+  }
+
+  // get target table.
+  TrlTblMtx.lock();
+  assert(TM->Table->TargetsTable.size() > (size_t)device_id &&
+         "Not expecting a device ID outside the table's bounds!");
+  __tgt_target_table *TargetTable = TM->Table->TargetsTable[device_id];
+  TrlTblMtx.unlock();
+  assert(TargetTable && "Global data has not been mapped\n");
+
+  // Move data to device.
+  int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes,
+      arg_types);
+  if (rc != OFFLOAD_SUCCESS) {
+    DP("Call to target_data_begin failed, abort target.\n");
+    return OFFLOAD_FAIL;
+  }
+
+  std::vector<void *> tgt_args;
+  std::vector<ptrdiff_t> tgt_offsets;
+
+  // List of (first-)private arrays allocated for this target region
+  std::vector<void *> fpArrays;
+  std::vector<int> tgtArgsPositions(arg_num, -1);
+
+  for (int32_t i = 0; i < arg_num; ++i) {
+    if (!(arg_types[i] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
+      // This is not a target parameter, do not push it into tgt_args.
+      // Check for lambda mapping.
+      if (isLambdaMapping(arg_types[i])) {
+        assert((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
+               "PTR_AND_OBJ must be also MEMBER_OF.");
+        unsigned idx = member_of(arg_types[i]);
+        int tgtIdx = tgtArgsPositions[idx];
+        assert(tgtIdx != -1 && "Base address must be translated already.");
+        // The parent lambda must be processed already and it must be the last
+        // in tgt_args and tgt_offsets arrays.
+        void *HstPtrVal = args[i];
+        void *HstPtrBegin = args_base[i];
+        void *HstPtrBase = args[idx];
+        bool IsLast; // unused.
+        void *TgtPtrBase =
+            (void *)((intptr_t)tgt_args[tgtIdx] + tgt_offsets[tgtIdx]);
+        DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
+        uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
+        void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
+        void *Pointer_TgtPtrBegin =
+            Device.getTgtPtrBegin(HstPtrVal, arg_sizes[i], IsLast, false);
+        if (!Pointer_TgtPtrBegin) {
+          DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
+             DPxPTR(HstPtrVal));
+          continue;
+        }
+        DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
+           DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin));
+        int rt = Device.data_submit(TgtPtrBegin, &Pointer_TgtPtrBegin,
+                                    sizeof(void *));
+        if (rt != OFFLOAD_SUCCESS) {
+          DP("Copying data to device failed.\n");
+          return OFFLOAD_FAIL;
+        }
+      }
+      continue;
+    }
+    void *HstPtrBegin = args[i];
+    void *HstPtrBase = args_base[i];
+    void *TgtPtrBegin;
+    ptrdiff_t TgtBaseOffset;
+    bool IsLast; // unused.
+    if (arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) {
+      DP("Forwarding first-private value " DPxMOD " to the target construct\n",
+          DPxPTR(HstPtrBase));
+      TgtPtrBegin = HstPtrBase;
+      TgtBaseOffset = 0;
+    } else if (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE) {
+      // Allocate memory for (first-)private array
+      TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID,
+          arg_sizes[i], HstPtrBegin);
+      if (!TgtPtrBegin) {
+        DP ("Data allocation for %sprivate array " DPxMOD " failed, "
+            "abort target.\n",
+            (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""),
+            DPxPTR(HstPtrBegin));
+        return OFFLOAD_FAIL;
+      }
+      fpArrays.push_back(TgtPtrBegin);
+      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
+#ifdef OMPTARGET_DEBUG
+      void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
+      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for "
+          "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n",
+          arg_sizes[i], DPxPTR(TgtPtrBegin),
+          (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""),
+          DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBase));
+#endif
+      // If first-private, copy data from host
+      if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
+        int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]);
+        if (rt != OFFLOAD_SUCCESS) {
+          DP ("Copying data to device failed.\n");
+          return OFFLOAD_FAIL;
+        }
+      }
+    } else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
+      TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *), IsLast,
+          false);
+      TgtBaseOffset = 0; // no offset for ptrs.
+      DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to "
+         "object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase),
+         DPxPTR(HstPtrBase));
+    } else {
+      TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast,
+          false);
+      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
+#ifdef OMPTARGET_DEBUG
+      void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
+      DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
+          DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
+#endif
+    }
+    tgtArgsPositions[i] = tgt_args.size();
+    tgt_args.push_back(TgtPtrBegin);
+    tgt_offsets.push_back(TgtBaseOffset);
+  }
+
+  assert(tgt_args.size() == tgt_offsets.size() &&
+      "Size mismatch in arguments and offsets");
+
+  // Pop loop trip count
+  uint64_t ltc = 0;
+  TblMapMtx.lock();
+  auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL));
+  if (I != Device.LoopTripCnt.end()) {
+    ltc = I->second;
+    Device.LoopTripCnt.erase(I);
+    DP("loop trip count is %" PRIu64 ".\n", ltc);
+  }
+  TblMapMtx.unlock();
+
+  // Launch device execution; data() stays well-defined with zero arguments.
+  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
+      TargetTable->EntriesBegin[TM->Index].name,
+      DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index);
+  if (IsTeamConstruct) {
+    rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr,
+        tgt_args.data(), tgt_offsets.data(), tgt_args.size(), team_num,
+        thread_limit, ltc);
+  } else {
+    rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr,
+        tgt_args.data(), tgt_offsets.data(), tgt_args.size());
+  }
+  if (rc != OFFLOAD_SUCCESS) {
+    DP ("Executing target region failed, abort target.\n");
+    return OFFLOAD_FAIL;
+  }
+
+  // Deallocate (first-)private arrays
+  for (auto it : fpArrays) {
+    int rt = Device.RTL->data_delete(Device.RTLDeviceID, it);
+    if (rt != OFFLOAD_SUCCESS) {
+      DP("Deallocation of (first-)private arrays failed.\n");
+      return OFFLOAD_FAIL;
+    }
+  }
+
+  // Move data from device.
+  int rt = target_data_end(Device, arg_num, args_base, args, arg_sizes,
+      arg_types);
+  if (rt != OFFLOAD_SUCCESS) {
+    DP("Call to target_data_end failed, abort target.\n");
+    return OFFLOAD_FAIL;
+  }
+
+  return OFFLOAD_SUCCESS;
+}
diff --git a/final/libomptarget/src/private.h b/final/libomptarget/src/private.h
new file mode 100644
index 0000000..b406909
--- /dev/null
+++ b/final/libomptarget/src/private.h
@@ -0,0 +1,87 @@
+//===---------- private.h - Target independent OpenMP target RTL ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Private function declarations and helper macros for debugging output.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OMPTARGET_PRIVATE_H
+#define _OMPTARGET_PRIVATE_H
+
+#include <omptarget.h>
+
+#include <cstdint>
+
+extern int target_data_begin(DeviceTy &Device, int32_t arg_num,
+    void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types);
+
+extern int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base,
+    void **args, int64_t *arg_sizes, int64_t *arg_types);
+
+extern int target_data_update(DeviceTy &Device, int32_t arg_num,
+    void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types);
+
+extern int target(int64_t device_id, void *host_ptr, int32_t arg_num,
+    void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types,
+    int32_t team_num, int32_t thread_limit, int IsTeamConstruct);
+
+extern int CheckDeviceAndCtors(int64_t device_id);
+
+// enum for OMP_TARGET_OFFLOAD; keep in sync with kmp.h definition
+enum kmp_target_offload_kind {
+  tgt_disabled = 0, // RTLsTy::LoadRTLs() returns without loading any plugin.
+  tgt_default = 1,
+  tgt_mandatory = 2
+};
+typedef enum kmp_target_offload_kind kmp_target_offload_kind_t;
+extern kmp_target_offload_kind_t TargetOffloadPolicy; // set in LoadRTLs().
+
+////////////////////////////////////////////////////////////////////////////////
+// implementation of fatal messages: print to stderr, then exit(1)
+////////////////////////////////////////////////////////////////////////////////
+
+#define FATAL_MESSAGE0(_num, _str)                                    \
+  do {                                                                \
+    fprintf(stderr, "Libomptarget fatal error %d: %s\n", _num, _str); \
+    exit(1);                                                          \
+  } while (0)
+
+#define FATAL_MESSAGE(_num, _str, ...)                              \
+  do {                                                              \
+    fprintf(stderr, "Libomptarget fatal error %d:" _str "\n", _num, \
+            __VA_ARGS__);                                           \
+    exit(1);                                                        \
+  } while (0)
+
+// Implemented in libomp, they are called from within __tgt_* functions.
+#ifdef __cplusplus
+extern "C" {
+#endif
+// functions that extract info from libomp; keep in sync
+int omp_get_default_device(void) __attribute__((weak));
+int32_t __kmpc_omp_taskwait(void *loc_ref, int32_t gtid) __attribute__((weak));
+int32_t __kmpc_global_thread_num(void *) __attribute__((weak));
+int __kmpc_get_target_offload(void) __attribute__((weak));
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef OMPTARGET_DEBUG
+extern int DebugLevel;
+// Debug print: both variants expand to one statement that needs a ';'.
+#define DP(...) \
+  do { \
+    if (DebugLevel > 0) { \
+      DEBUGP("Libomptarget", __VA_ARGS__); \
+    } \
+  } while (false)
+#else // OMPTARGET_DEBUG
+#define DP(...) do {} while (false)
+#endif // OMPTARGET_DEBUG
+
+#endif
diff --git a/final/libomptarget/src/rtl.cpp b/final/libomptarget/src/rtl.cpp
new file mode 100644
index 0000000..4eb7ab7
--- /dev/null
+++ b/final/libomptarget/src/rtl.cpp
@@ -0,0 +1,408 @@
+//===----------- rtl.cpp - Target independent OpenMP target RTL -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Functionality for handling RTL plugins.
+//
+//===----------------------------------------------------------------------===//
+
+#include "device.h"
+#include "private.h"
+#include "rtl.h"
+
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <dlfcn.h>
+#include <mutex>
+#include <string>
+
+// Plugins that can support offloading; LoadRTLs() dlopen()s them in this order.
+static const char *RTLNames[] = {
+    /* PowerPC target */ "libomptarget.rtl.ppc64.so",
+    /* x86_64 target  */ "libomptarget.rtl.x86_64.so",
+    /* CUDA target    */ "libomptarget.rtl.cuda.so",
+    /* AArch64 target */ "libomptarget.rtl.aarch64.so"};
+
+RTLsTy RTLs;
+std::mutex RTLsMtx;
+
+HostEntriesBeginToTransTableTy HostEntriesBeginToTransTable;
+std::mutex TrlTblMtx;
+
+HostPtrToTableMapTy HostPtrToTableMap;
+std::mutex TblMapMtx;
+
+/// Loads every plugin listed in RTLNames and records the usable ones in AllRTLs.
+void RTLsTy::LoadRTLs() {
+#ifdef OMPTARGET_DEBUG
+  if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
+    // strtol never throws; unlike std::stoi, a malformed value just yields 0.
+    DebugLevel = (int) strtol(envStr, NULL, 10);
+  }
+#endif // OMPTARGET_DEBUG
+
+  // Parse environment variable OMP_TARGET_OFFLOAD (if set)
+  TargetOffloadPolicy = (kmp_target_offload_kind_t) __kmpc_get_target_offload();
+  if (TargetOffloadPolicy == tgt_disabled) {
+    return;
+  }
+
+  DP("Loading RTLs...\n");
+
+  // Attempt to open all the plugins and, if they exist, check if the interface
+  // is correct and if they are supporting any devices.
+  for (auto *Name : RTLNames) {
+    DP("Loading library '%s'...\n", Name);
+    void *dynlib_handle = dlopen(Name, RTLD_NOW);
+
+    if (!dynlib_handle) {
+      // Library does not exist or cannot be found.
+      DP("Unable to load library '%s': %s!\n", Name, dlerror());
+      continue;
+    }
+
+    DP("Successfully loaded library '%s'!\n", Name);
+
+    // Retrieve the RTL information from the runtime library.
+    RTLInfoTy R;
+
+    R.LibraryHandler = dynlib_handle;
+    R.isUsed = false;
+
+#ifdef OMPTARGET_DEBUG
+    R.RTLName = Name;
+#endif
+
+    if (!(*((void**) &R.is_valid_binary) = dlsym(
+              dynlib_handle, "__tgt_rtl_is_valid_binary")))
+      continue;
+    if (!(*((void**) &R.number_of_devices) = dlsym(
+              dynlib_handle, "__tgt_rtl_number_of_devices")))
+      continue;
+    if (!(*((void**) &R.init_device) = dlsym(
+              dynlib_handle, "__tgt_rtl_init_device")))
+      continue;
+    if (!(*((void**) &R.load_binary) = dlsym(
+              dynlib_handle, "__tgt_rtl_load_binary")))
+      continue;
+    if (!(*((void**) &R.data_alloc) = dlsym(
+              dynlib_handle, "__tgt_rtl_data_alloc")))
+      continue;
+    if (!(*((void**) &R.data_submit) = dlsym(
+              dynlib_handle, "__tgt_rtl_data_submit")))
+      continue;
+    if (!(*((void**) &R.data_retrieve) = dlsym(
+              dynlib_handle, "__tgt_rtl_data_retrieve")))
+      continue;
+    if (!(*((void**) &R.data_delete) = dlsym(
+              dynlib_handle, "__tgt_rtl_data_delete")))
+      continue;
+    if (!(*((void**) &R.run_region) = dlsym(
+              dynlib_handle, "__tgt_rtl_run_target_region")))
+      continue;
+    if (!(*((void**) &R.run_team_region) = dlsym(
+              dynlib_handle, "__tgt_rtl_run_target_team_region")))
+      continue;
+
+    // Optional functions
+    *((void**) &R.init_requires) = dlsym(
+        dynlib_handle, "__tgt_rtl_init_requires");
+
+    // No devices are supported by this RTL?
+    if (!(R.NumberOfDevices = R.number_of_devices())) {
+      DP("No devices supported in this RTL\n");
+      continue;
+    }
+
+    DP("Registering RTL %s supporting %d devices!\n",
+        R.RTLName.c_str(), R.NumberOfDevices);
+
+    // The RTL is valid! Will save the information in the RTLs list.
+    AllRTLs.push_back(R);
+  }
+
+  DP("RTLs loaded!\n");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Functionality for registering libs
+
+static void RegisterImageIntoTranslationTable(TranslationTable &TT,
+    RTLInfoTy &RTL, __tgt_device_image *image) {
+  // Records image in slots [RTL.Idx, RTL.Idx + RTL.NumberOfDevices) of TT.
+  // Both vectors have the same size: whenever one grows, so does the other.
+  assert(TT.TargetsTable.size() == TT.TargetsImages.size() &&
+         "We should have as many images as we have tables!");
+
+  // Resize the Targets Table and Images to accommodate the new targets if
+  // required
+  unsigned TargetsTableMinimumSize = RTL.Idx + RTL.NumberOfDevices;
+
+  if (TT.TargetsTable.size() < TargetsTableMinimumSize) {
+    TT.TargetsImages.resize(TargetsTableMinimumSize, 0);
+    TT.TargetsTable.resize(TargetsTableMinimumSize, 0);
+  }
+
+  // Register the image in all devices for this target type.
+  for (int32_t i = 0; i < RTL.NumberOfDevices; ++i) {
+    // If we are changing the image we are also invalidating the target table.
+    if (TT.TargetsImages[RTL.Idx + i] != image) {
+      TT.TargetsImages[RTL.Idx + i] = image;
+      TT.TargetsTable[RTL.Idx + i] = 0; // lazy initialization of target table.
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Functionality for registering Ctors/Dtors
+
+static void RegisterGlobalCtorsDtorsForImage(__tgt_bin_desc *desc,
+    __tgt_device_image *img, RTLInfoTy *RTL) {
+  // Queue the image's ctor/dtor entries on every device managed by this RTL.
+  for (int32_t i = 0; i < RTL->NumberOfDevices; ++i) {
+    DeviceTy &Device = Devices[RTL->Idx + i];
+    Device.PendingGlobalsMtx.lock();
+    Device.HasPendingGlobals = true;
+    for (__tgt_offload_entry *entry = img->EntriesBegin;
+        entry != img->EntriesEnd; ++entry) {
+      if (entry->flags & OMP_DECLARE_TARGET_CTOR) {
+        DP("Adding ctor " DPxMOD " to the pending list.\n",
+            DPxPTR(entry->addr));
+        Device.PendingCtorsDtors[desc].PendingCtors.push_back(entry->addr);
+      } else if (entry->flags & OMP_DECLARE_TARGET_DTOR) {
+        // Dtors are pushed in reverse order so they are executed from end
+        // to beginning when unregistering the library!
+        DP("Adding dtor " DPxMOD " to the pending list.\n",
+            DPxPTR(entry->addr));
+        Device.PendingCtorsDtors[desc].PendingDtors.push_front(entry->addr);
+      }
+
+      if (entry->flags & OMP_DECLARE_TARGET_LINK) {
+        DP("The \"link\" attribute is not yet supported!\n");
+      }
+    }
+    Device.PendingGlobalsMtx.unlock();
+  }
+}
+
+void RTLsTy::RegisterRequires(int64_t flags) {
+  // TODO: add more elaborate check.
+  // Minimal check: only set requires flags if previous value
+  // is undefined. This ensures that only the first call to this
+  // function will set the requires flags. All subsequent calls
+  // will be checked for compatibility.
+  assert(flags != OMP_REQ_UNDEFINED &&
+         "illegal undefined flag for requires directive!");
+  if (RequiresFlags == OMP_REQ_UNDEFINED) {
+    RequiresFlags = flags;
+    return;
+  }
+
+  // If multiple compilation units are present enforce
+  // consistency across all of them for require clauses:
+  //  - reverse_offload
+  //  - unified_address
+  //  - unified_shared_memory
+  if ((RequiresFlags & OMP_REQ_REVERSE_OFFLOAD) !=
+      (flags & OMP_REQ_REVERSE_OFFLOAD)) {
+    FATAL_MESSAGE0(1,
+        "'#pragma omp requires reverse_offload' not used consistently!");
+  }
+  if ((RequiresFlags & OMP_REQ_UNIFIED_ADDRESS) !=
+          (flags & OMP_REQ_UNIFIED_ADDRESS)) {
+    FATAL_MESSAGE0(1,
+        "'#pragma omp requires unified_address' not used consistently!");
+  }
+  if ((RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) !=
+          (flags & OMP_REQ_UNIFIED_SHARED_MEMORY)) {
+    FATAL_MESSAGE0(1,
+        "'#pragma omp requires unified_shared_memory' not used consistently!");
+  }
+
+  // TODO: insert any other missing checks
+  // NOTE(review): flags is int64_t; "%ld" only matches where long is 64-bit.
+  DP("New requires flags %ld compatible with existing %ld!\n",
+     flags, RequiresFlags);
+}
+
+/// Registers each device image in \p desc with the first RTL that accepts it.
+void RTLsTy::RegisterLib(__tgt_bin_desc *desc) {
+  // Attempt to load all plugins available in the system.
+  std::call_once(initFlag, &RTLsTy::LoadRTLs, this);
+
+  RTLsMtx.lock();
+  // Register the images with the RTLs that understand them, if any.
+  for (int32_t i = 0; i < desc->NumDeviceImages; ++i) {
+    // Obtain the image.
+    __tgt_device_image *img = &desc->DeviceImages[i];
+
+    RTLInfoTy *FoundRTL = NULL;
+
+    // Scan the RTLs that have associated images until we find one that supports
+    // the current image.
+    for (auto &R : RTLs.AllRTLs) {
+      if (!R.is_valid_binary(img)) {
+        DP("Image " DPxMOD " is NOT compatible with RTL %s!\n",
+            DPxPTR(img->ImageStart), R.RTLName.c_str());
+        continue;
+      }
+
+      DP("Image " DPxMOD " is compatible with RTL %s!\n",
+          DPxPTR(img->ImageStart), R.RTLName.c_str());
+
+      // If this RTL is not already in use, initialize it.
+      if (!R.isUsed) {
+        // Initialize the device information for the RTL we are about to use.
+        DeviceTy device(&R);
+        size_t start = Devices.size();
+        Devices.resize(start + R.NumberOfDevices, device);
+        for (int32_t device_id = 0; device_id < R.NumberOfDevices;
+            device_id++) {
+          // global device ID
+          Devices[start + device_id].DeviceID = start + device_id;
+          // RTL local device ID
+          Devices[start + device_id].RTLDeviceID = device_id;
+          // RTL requires flags
+          Devices[start + device_id].RTLRequiresFlags = RequiresFlags;
+        }
+
+        // Initialize the index of this RTL and save it in the used RTLs.
+        R.Idx = (RTLs.UsedRTLs.empty())
+                    ? 0
+                    : RTLs.UsedRTLs.back()->Idx +
+                          RTLs.UsedRTLs.back()->NumberOfDevices;
+        assert((size_t) R.Idx == start &&
+            "RTL index should equal the number of devices used so far.");
+        R.isUsed = true;
+        RTLs.UsedRTLs.push_back(&R);
+
+        DP("RTL " DPxMOD " has index %d!\n", DPxPTR(R.LibraryHandler), R.Idx);
+      }
+
+      // Initialize (if necessary) translation table for this library.
+      TrlTblMtx.lock();
+      if(!HostEntriesBeginToTransTable.count(desc->HostEntriesBegin)){
+        TranslationTable &tt =
+            HostEntriesBeginToTransTable[desc->HostEntriesBegin];
+        tt.HostTable.EntriesBegin = desc->HostEntriesBegin;
+        tt.HostTable.EntriesEnd = desc->HostEntriesEnd;
+      }
+
+      // Retrieve translation table for this library.
+      TranslationTable &TransTable =
+          HostEntriesBeginToTransTable[desc->HostEntriesBegin];
+
+      DP("Registering image " DPxMOD " with RTL %s!\n",
+          DPxPTR(img->ImageStart), R.RTLName.c_str());
+      RegisterImageIntoTranslationTable(TransTable, R, img);
+      TrlTblMtx.unlock();
+      FoundRTL = &R;
+
+      // Load ctors/dtors for static objects
+      RegisterGlobalCtorsDtorsForImage(desc, img, FoundRTL);
+
+      // if an RTL was found we are done - proceed to register the next image
+      break;
+    }
+
+    if (!FoundRTL) {
+      DP("No RTL found for image " DPxMOD "!\n", DPxPTR(img->ImageStart));
+    }
+  }
+  RTLsMtx.unlock();
+
+  DP("Done registering entries!\n");
+}
+
+/// Runs pending device dtors for \p desc and drops its translation tables.
+void RTLsTy::UnregisterLib(__tgt_bin_desc *desc) {
+  DP("Unloading target library!\n");
+  RTLsMtx.lock();
+  // Find which RTL understands each image, if any.
+  for (int32_t i = 0; i < desc->NumDeviceImages; ++i) {
+    // Obtain the image.
+    __tgt_device_image *img = &desc->DeviceImages[i];
+
+    RTLInfoTy *FoundRTL = NULL;
+
+    // Scan the RTLs that have associated images until we find one that supports
+    // the current image. We only need to scan RTLs that are already being used.
+    for (auto *R : RTLs.UsedRTLs) {
+
+      assert(R->isUsed && "Expecting used RTLs.");
+
+      if (!R->is_valid_binary(img)) {
+        DP("Image " DPxMOD " is NOT compatible with RTL " DPxMOD "!\n",
+            DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler));
+        continue;
+      }
+
+      DP("Image " DPxMOD " is compatible with RTL " DPxMOD "!\n",
+          DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler));
+
+      FoundRTL = R;
+
+      // Execute dtors for static objects if the device has been used, i.e.
+      // if its PendingCtors list has been emptied.
+      for (int32_t d = 0; d < FoundRTL->NumberOfDevices; ++d) {
+        DeviceTy &Device = Devices[FoundRTL->Idx + d];
+        Device.PendingGlobalsMtx.lock();
+        if (Device.PendingCtorsDtors[desc].PendingCtors.empty()) {
+          for (auto &dtor : Device.PendingCtorsDtors[desc].PendingDtors) {
+            int rc = target(Device.DeviceID, dtor, 0, NULL, NULL, NULL, NULL, 1,
+                1, true /*team*/);
+            if (rc != OFFLOAD_SUCCESS) {
+              DP("Running destructor " DPxMOD " failed.\n", DPxPTR(dtor));
+            }
+          }
+          // Remove this library's entry from PendingCtorsDtors
+          Device.PendingCtorsDtors.erase(desc);
+        }
+        Device.PendingGlobalsMtx.unlock();
+      }
+
+      DP("Unregistered image " DPxMOD " from RTL " DPxMOD "!\n",
+          DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler));
+
+      break;
+    }
+
+    // if no RTL was found proceed to unregister the next image
+    if (!FoundRTL){
+      DP("No RTLs in use support the image " DPxMOD "!\n",
+          DPxPTR(img->ImageStart));
+    }
+  }
+  RTLsMtx.unlock();
+  DP("Done unregistering images!\n");
+
+  // Remove entries from HostPtrToTableMap
+  TblMapMtx.lock();
+  for (__tgt_offload_entry *cur = desc->HostEntriesBegin;
+      cur < desc->HostEntriesEnd; ++cur) {
+    HostPtrToTableMap.erase(cur->addr);
+  }
+
+  // Remove translation table for this descriptor.
+  auto tt = HostEntriesBeginToTransTable.find(desc->HostEntriesBegin);
+  if (tt != HostEntriesBeginToTransTable.end()) {
+    DP("Removing translation table for descriptor " DPxMOD "\n",
+        DPxPTR(desc->HostEntriesBegin));
+    HostEntriesBeginToTransTable.erase(tt);
+  } else {
+    DP("Translation table for descriptor " DPxMOD " cannot be found, probably "
+        "it has been already removed.\n", DPxPTR(desc->HostEntriesBegin));
+  }
+  // NOTE(review): other users of HostEntriesBeginToTransTable hold TrlTblMtx.
+  TblMapMtx.unlock();
+
+  // TODO: Remove RTL and the devices it manages if it's not used anymore?
+  // TODO: Write some RTL->unload_image(...) function?
+
+  DP("Done unregistering library!\n");
+}
diff --git a/final/libomptarget/src/rtl.h b/final/libomptarget/src/rtl.h
new file mode 100644
index 0000000..8148e81
--- /dev/null
+++ b/final/libomptarget/src/rtl.h
@@ -0,0 +1,171 @@
+//===------------ rtl.h - Target independent OpenMP target RTL ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Declarations for handling RTL plugins.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OMPTARGET_RTL_H
+#define _OMPTARGET_RTL_H
+
+#include <list>
+#include <map>
+#include <mutex>
+#include <string>
+#include <vector>
+
+// Forward declarations.
+struct DeviceTy;
+struct __tgt_bin_desc;
+
+// Description of one RTL plugin: its position in the device numbering, the
+// handle of the loaded library and the entry points found in it.
+struct RTLInfoTy {
+  // Signatures of the functions implemented in the RTL.
+  using is_valid_binary_ty = int32_t(void *);
+  using number_of_devices_ty = int32_t();
+  using init_device_ty = int32_t(int32_t);
+  using load_binary_ty = __tgt_target_table *(int32_t, void *);
+  using data_alloc_ty = void *(int32_t, int64_t, void *);
+  using data_submit_ty = int32_t(int32_t, void *, void *, int64_t);
+  using data_retrieve_ty = int32_t(int32_t, void *, void *, int64_t);
+  using data_delete_ty = int32_t(int32_t, void *);
+  using run_region_ty = int32_t(int32_t, void *, void **, ptrdiff_t *,
+                                int32_t);
+  using run_team_region_ty = int32_t(int32_t, void *, void **, ptrdiff_t *,
+                                     int32_t, int32_t, int32_t, uint64_t);
+  using init_requires_ty = int64_t(int64_t);
+
+  // RTL index: the number of devices of other RTLs that were registered
+  // before, i.e. the OpenMP index of the first device to be registered with
+  // this RTL.
+  int32_t Idx;
+  // Number of devices this RTL deals with.
+  int32_t NumberOfDevices;
+
+  void *LibraryHandler;
+
+#ifdef OMPTARGET_DEBUG
+  std::string RTLName;
+#endif
+
+  // Functions implemented in the RTL.
+  is_valid_binary_ty *is_valid_binary;
+  number_of_devices_ty *number_of_devices;
+  init_device_ty *init_device;
+  load_binary_ty *load_binary;
+  data_alloc_ty *data_alloc;
+  data_submit_ty *data_submit;
+  data_retrieve_ty *data_retrieve;
+  data_delete_ty *data_delete;
+  run_region_ty *run_region;
+  run_team_region_ty *run_team_region;
+  init_requires_ty *init_requires;
+
+  // Are there images associated with this RTL.
+  bool isUsed;
+
+  // Mutex for thread-safety when calling RTL interface functions.
+  // It is easier to enforce thread-safety at the libomptarget level,
+  // so that developers of new RTLs do not have to worry about it.
+  std::mutex Mtx;
+
+  // Default state: invalid index and device count, no library handle, no
+  // entry points, not in use.
+  RTLInfoTy()
+      : Idx(-1), NumberOfDevices(-1), LibraryHandler(nullptr),
+#ifdef OMPTARGET_DEBUG
+        RTLName(),
+#endif
+        is_valid_binary(nullptr), number_of_devices(nullptr),
+        init_device(nullptr), load_binary(nullptr), data_alloc(nullptr),
+        data_submit(nullptr), data_retrieve(nullptr), data_delete(nullptr),
+        run_region(nullptr), run_team_region(nullptr),
+        init_requires(nullptr), isUsed(false), Mtx() {}
+
+  // std::mutex cannot be copied, so the copy constructor is spelled out:
+  // every other member is copied and the new object gets a fresh mutex.
+  RTLInfoTy(const RTLInfoTy &r)
+      : Idx(r.Idx), NumberOfDevices(r.NumberOfDevices),
+        LibraryHandler(r.LibraryHandler),
+#ifdef OMPTARGET_DEBUG
+        RTLName(r.RTLName),
+#endif
+        is_valid_binary(r.is_valid_binary),
+        number_of_devices(r.number_of_devices), init_device(r.init_device),
+        load_binary(r.load_binary), data_alloc(r.data_alloc),
+        data_submit(r.data_submit), data_retrieve(r.data_retrieve),
+        data_delete(r.data_delete), run_region(r.run_region),
+        run_team_region(r.run_team_region), init_requires(r.init_requires),
+        isUsed(r.isUsed), Mtx() {}
+};
+
+/// RTLs identified in the system.
+class RTLsTy {
+private:
+  // Mutex-like object to guarantee thread-safety and unique initialization
+  // (i.e. the library attempts to load the RTLs (plugins) only once).
+  std::once_flag initFlag;
+  void LoadRTLs(); // not thread-safe
+
+public:
+  // List of the detected runtime libraries.
+  std::list<RTLInfoTy> AllRTLs;
+
+  // Array of pointers to the detected runtime libraries that have compatible
+  // binaries.
+  std::vector<RTLInfoTy *> UsedRTLs;
+
+  // Flags of the requires directive (presumably maintained by
+  // RegisterRequires() - confirm). Zeroed by the constructor so the member
+  // is never indeterminate, whatever the storage duration of the instance.
+  int64_t RequiresFlags;
+
+  // Starts with no RTLs and no requires flags recorded.
+  explicit RTLsTy() : RequiresFlags(0) {}
+
+  // Register the clauses of the requires directive.
+  void RegisterRequires(int64_t flags);
+
+  // Register a shared library with all (compatible) RTLs.
+  void RegisterLib(__tgt_bin_desc *desc);
+
+  // Unregister a shared library from all RTLs.
+  void UnregisterLib(__tgt_bin_desc *desc);
+};
+extern RTLsTy RTLs;
+extern std::mutex RTLsMtx;
+
+
+/// Map between the host entry begin and the translation table. Each
+/// registered library gets one TranslationTable. Use the map from
+/// __tgt_offload_entry so that we may quickly determine whether we
+/// are trying to (re)register an existing lib or really have a new one.
+// Per-library translation data: the host table plus, for every device ID,
+// the image assigned to that device and its table of entry points.
+struct TranslationTable {
+  // Host-side table. NOTE(review): presumably describes the host entries of
+  // the registered library - confirm against the registration code.
+  __tgt_target_table HostTable;
+
+  // Image assigned to a given device.
+  std::vector<__tgt_device_image *> TargetsImages; // One image per device ID.
+
+  // Table of entry points or NULL if it was not already computed.
+  std::vector<__tgt_target_table *> TargetsTable; // One table per device ID.
+};
+typedef std::map<__tgt_offload_entry *, TranslationTable>
+    HostEntriesBeginToTransTableTy;
+extern HostEntriesBeginToTransTableTy HostEntriesBeginToTransTable;
+extern std::mutex TrlTblMtx;
+
+/// Map between the host ptr and a table index
+struct TableMap {
+  // Table associated with the host ptr.
+  TranslationTable *Table;
+  // Index in which the host ptr translated entry is found.
+  uint32_t Index;
+
+  // Empty mapping: no table, index 0.
+  TableMap() : Table(nullptr), Index(0) {}
+
+  // Mapping to entry `index` of `table`.
+  TableMap(TranslationTable *table, uint32_t index)
+      : Table(table), Index(index) {}
+};
+typedef std::map<void *, TableMap> HostPtrToTableMapTy;
+extern HostPtrToTableMapTy HostPtrToTableMap;
+extern std::mutex TblMapMtx;
+
+#endif
diff --git a/final/libomptarget/test/CMakeLists.txt b/final/libomptarget/test/CMakeLists.txt
new file mode 100644
index 0000000..607801e
--- /dev/null
+++ b/final/libomptarget/test/CMakeLists.txt
@@ -0,0 +1,19 @@
+# CMakeLists.txt file for unit testing OpenMP offloading runtime library.
+# The test suite is only set up for Clang 6.0.0 or later; for any other
+# compiler, report it and leave this directory without defining the target.
+if(OPENMP_TEST_COMPILER_VERSION VERSION_LESS 6.0.0 OR
+   NOT OPENMP_TEST_COMPILER_ID STREQUAL "Clang")
+  libomptarget_say("Can only test with Clang compiler in version 6.0.0 or later.")
+  libomptarget_warning_say("The check-libomptarget target will not be available!")
+  return()
+endif()
+
+# Python boolean literal that is substituted into lit.site.cfg.
+set(LIBOMPTARGET_DEBUG False)
+if(LIBOMPTARGET_ENABLE_DEBUG)
+  set(LIBOMPTARGET_DEBUG True)
+endif()
+
+add_openmp_testsuite(check-libomptarget
+  "Running libomptarget tests"
+  ${CMAKE_CURRENT_BINARY_DIR}
+  DEPENDS omptarget omp)
+
+# Configure the lit.site.cfg.in file
+set(AUTO_GEN_COMMENT "## Autogenerated by libomptarget configuration.\n# Do not edit!")
+configure_file(
+  "${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.in"
+  "${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg"
+  @ONLY)
diff --git a/final/libomptarget/test/api/omp_get_num_devices.c b/final/libomptarget/test/api/omp_get_num_devices.c
new file mode 100644
index 0000000..d0e84db
--- /dev/null
+++ b/final/libomptarget/test/api/omp_get_num_devices.c
@@ -0,0 +1,36 @@
+// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu
+
+#include <stdio.h>
+#include <omp.h>
+
+int test_omp_get_num_devices()
+{
+  /* Passes (returns 1) only when omp_get_num_devices() reports at least
+     one device; the reported count is printed as well. */
+  const int device_count = omp_get_num_devices();
+  printf("num_devices = %d\n", device_count);
+
+  /* Empty target region. */
+  #pragma omp target
+  {}
+
+  return device_count > 0 ? 1 : 0;
+}
+
+/* Runs the device-count check, prints PASS or FAIL and returns the number of
+   failed checks as the exit status. */
+int main()
+{
+  int failed=0;
+
+  if (!test_omp_get_num_devices()) {
+    failed++;
+  }
+  if (failed)
+    printf("FAIL\n");
+  else
+    printf("PASS\n");
+  return failed;
+}
+
+// CHECK: PASS
diff --git a/final/libomptarget/test/env/omp_target_debug.c b/final/libomptarget/test/env/omp_target_debug.c
new file mode 100644
index 0000000..ce84c98
--- /dev/null
+++ b/final/libomptarget/test/env/omp_target_debug.c
@@ -0,0 +1,20 @@
+// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=DEBUG
+// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=NDEBUG
+// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=DEBUG
+// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=NDEBUG
+// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=DEBUG
+// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=NDEBUG
+// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG
+// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=NDEBUG
+// REQUIRES: libomptarget-debug
+
+// Executes one empty target region. The program prints nothing itself, so
+// whatever the RUN lines pipe into FileCheck comes from the runtime.
+int main(void) {
+#pragma omp target
+  {}
+  return 0;
+}
+
+// DEBUG: Libomptarget
+// NDEBUG-NOT: Libomptarget
+// NDEBUG-NOT: Target
+
diff --git a/final/libomptarget/test/lit.cfg b/final/libomptarget/test/lit.cfg
new file mode 100644
index 0000000..4311605
--- /dev/null
+++ b/final/libomptarget/test/lit.cfg
@@ -0,0 +1,142 @@
+# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79:
+# Configuration file for the 'lit' test runner.
+
+import os
+import lit.formats
+
+# Tell pylint that we know config and lit_config exist somewhere.
+if 'PYLINT_IMPORT' in os.environ:
+    config = object()
+    lit_config = object()
+
+def append_dynamic_library_path(name, value, sep):
+    """Put `value` in front of the search path `name` of the test environment.
+
+    name:  environment variable to update in config.environment.
+    value: directory to add; an empty value is ignored, because it would
+           otherwise create an empty entry in the search path.
+    sep:   separator placed between `value` and the previous content.
+    """
+    if not value:
+        return
+    if name in config.environment:
+        config.environment[name] = value + sep + config.environment[name]
+    else:
+        config.environment[name] = value
+
+# Name of this test suite.
+config.name = 'libomptarget'
+
+# File extensions that are treated as test files.
+config.suffixes = ['.c', '.cpp', '.cc']
+
+# Root path where the tests are located.
+config.test_source_root = os.path.dirname(__file__)
+
+# Root object directory where output is placed.
+config.test_exec_root = config.libomptarget_obj_root
+
+# Tests are shell-style lit tests.
+config.test_format = lit.formats.ShTest()
+
+# Compiler flags: header and library search paths first, then the optional
+# host RTL directory, then the extra flags from the site configuration.
+config.test_flags = " -I " + config.test_source_root
+config.test_flags += " -I " + config.omp_header_directory
+config.test_flags += " -L " + config.library_dir
+
+if config.omp_host_rtl_directory:
+    config.test_flags += " -L " + config.omp_host_rtl_directory
+
+config.test_flags += " " + config.test_extra_flags
+
+# Allow REQUIRES / UNSUPPORTED / XFAIL to work
+config.target_triple = []
+config.available_features.update(config.test_compiler_features)
+
+if config.libomptarget_debug:
+    config.available_features.add('libomptarget-debug')
+
+# Setup environment to find dynamic library at runtime
+if config.operating_system == 'Windows':
+    append_dynamic_library_path('PATH', config.library_dir, ";")
+    append_dynamic_library_path('PATH', config.omp_host_rtl_directory, ";")
+elif config.operating_system == 'Darwin':
+    # DYLD_LIBRARY_PATH is a colon-separated list, so both directories are
+    # joined with ":".
+    append_dynamic_library_path('DYLD_LIBRARY_PATH', config.library_dir, ":")
+    append_dynamic_library_path('DYLD_LIBRARY_PATH', \
+        config.omp_host_rtl_directory, ":")
+    config.test_flags += " -Wl,-rpath," + config.library_dir
+    config.test_flags += " -Wl,-rpath," + config.omp_host_rtl_directory
+else: # Unices
+    append_dynamic_library_path('LD_LIBRARY_PATH', config.library_dir, ":")
+    append_dynamic_library_path('LD_LIBRARY_PATH', \
+        config.omp_host_rtl_directory, ":")
+
+# substitutions
+# - for targets that exist in the system create the actual command.
+# - for valid targets that do not exist in the system, substitute a harmless
+#   "echo ignored-command" (which succeeds), so that the same test can be used
+#   for different targets.
+
+# Scan all the valid targets.
+# NOTE(review): within each branch the patterns are appended from the most
+# specific one ("-compile-run-and-check-") down to its prefixes
+# ("-compile-"); this presumably matters because lit applies substitutions in
+# list order - confirm before reordering anything here.
+for libomptarget_target in config.libomptarget_all_targets:
+    # Is this target in the current system? If so create a compile, run and test
+    # command. Otherwise substitute a placeholder echo command.
+    if libomptarget_target in config.libomptarget_system_targets:
+        # Composite commands expand into the simpler substitutions below.
+        config.substitutions.append(("%libomptarget-compilexx-run-and-check-" + \
+            libomptarget_target, \
+            "%libomptarget-compilexx-and-run-" + libomptarget_target + \
+            " | " + config.libomptarget_filecheck + " %s"))
+        config.substitutions.append(("%libomptarget-compile-run-and-check-" + \
+            libomptarget_target, \
+            "%libomptarget-compile-and-run-" + libomptarget_target + \
+            " | " + config.libomptarget_filecheck + " %s"))
+        config.substitutions.append(("%libomptarget-compilexx-and-run-" + \
+            libomptarget_target, \
+            "%libomptarget-compilexx-" + libomptarget_target + " && " + \
+            "%libomptarget-run-" + libomptarget_target))
+        config.substitutions.append(("%libomptarget-compile-and-run-" + \
+            libomptarget_target, \
+            "%libomptarget-compile-" + libomptarget_target + " && " + \
+            "%libomptarget-run-" + libomptarget_target))
+        # Compile the test source into a per-target executable "%t-<target>".
+        config.substitutions.append(("%libomptarget-compilexx-" + \
+            libomptarget_target, \
+            "%clangxx-" + libomptarget_target + " %s -o %t-" + \
+            libomptarget_target))
+        config.substitutions.append(("%libomptarget-compile-" + \
+            libomptarget_target, \
+            "%clang-" + libomptarget_target + " %s -o %t-" + \
+            libomptarget_target))
+        # Run the per-target executable.
+        config.substitutions.append(("%libomptarget-run-" + \
+            libomptarget_target, \
+            "%t-" + libomptarget_target))
+        # Compiler invocations with the offloading target selected.
+        config.substitutions.append(("%clangxx-" + libomptarget_target, \
+            "%clangxx %openmp_flags %flags -fopenmp-targets=" + libomptarget_target))
+        config.substitutions.append(("%clang-" + libomptarget_target, \
+            "%clang %openmp_flags %flags -fopenmp-targets=" + libomptarget_target))
+        config.substitutions.append(("%fcheck-" + libomptarget_target, \
+            config.libomptarget_filecheck + " %s"))
+    else:
+        # Target not available on this system: every pattern expands to a
+        # plain echo, so the corresponding RUN line does nothing useful.
+        config.substitutions.append(("%libomptarget-compile-run-and-check-" + \
+            libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%libomptarget-compilexx-run-and-check-" + \
+            libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%libomptarget-compile-and-run-" + \
+            libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%libomptarget-compilexx-and-run-" + \
+            libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%libomptarget-compilexx-" + \
+            libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%libomptarget-compile-" + \
+            libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%libomptarget-run-" + \
+            libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%clang-" + libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%clangxx-" + libomptarget_target, \
+            "echo ignored-command"))
+        config.substitutions.append(("%fcheck-" + libomptarget_target, \
+            "echo ignored-command"))
+
+# Generic substitutions used by the per-target ones above.
+# NOTE(review): "%clangxx" is registered before its prefix "%clang",
+# presumably for the same ordering reason - confirm.
+config.substitutions.append(("%clangxx", config.test_cxx_compiler))
+config.substitutions.append(("%clang", config.test_c_compiler))
+config.substitutions.append(("%openmp_flags", config.test_openmp_flags))
+config.substitutions.append(("%flags", config.test_flags))
diff --git a/final/libomptarget/test/lit.site.cfg.in b/final/libomptarget/test/lit.site.cfg.in
new file mode 100644
index 0000000..26ef492
--- /dev/null
+++ b/final/libomptarget/test/lit.site.cfg.in
@@ -0,0 +1,19 @@
+@AUTO_GEN_COMMENT@
+
+# Template expanded by configure_file(): every name wrapped in at-signs is
+# replaced with the value of the CMake variable of the same name.
+config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@"
+config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@"
+# Substituted without quotes, so the value must be a valid Python expression.
+config.test_compiler_features = @OPENMP_TEST_COMPILER_FEATURES@
+config.test_openmp_flags = "@OPENMP_TEST_OPENMP_FLAGS@"
+config.test_extra_flags = "@OPENMP_TEST_FLAGS@"
+config.libomptarget_obj_root = "@CMAKE_CURRENT_BINARY_DIR@"
+config.library_dir = "@LIBOMPTARGET_LIBRARY_DIR@"
+config.omp_header_directory = "@LIBOMPTARGET_OPENMP_HEADER_FOLDER@"
+config.omp_host_rtl_directory = "@LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER@"
+config.operating_system = "@CMAKE_SYSTEM_NAME@"
+# Whitespace-separated target lists, split into Python lists.
+config.libomptarget_all_targets = "@LIBOMPTARGET_ALL_TARGETS@".split()
+config.libomptarget_system_targets = "@LIBOMPTARGET_SYSTEM_TARGETS@".split()
+config.libomptarget_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@"
+# Substituted without quotes as well; expected to be a Python boolean literal.
+config.libomptarget_debug = @LIBOMPTARGET_DEBUG@
+
+# Let the main config do the real work.
+lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg")
diff --git a/final/libomptarget/test/mapping/pr38704.c b/final/libomptarget/test/mapping/pr38704.c
new file mode 100644
index 0000000..3e7135e
--- /dev/null
+++ b/final/libomptarget/test/mapping/pr38704.c
@@ -0,0 +1,47 @@
+// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu
+
+// Clang 6.0 doesn't use the new map interface, undefined behavior when
+// the compiler emits "old" interface code for structures.
+// UNSUPPORTED: clang-6
+
+#include <stdio.h>
+#include <stdlib.h>
+
+typedef struct {
+  int *ptr1;
+  int *ptr2;
+} StructWithPtrs;
+
+// Maps pointer members of two structs through enter data, target and exit
+// data constructs, then prints the values the target region stored via s.
+int main(int argc, char *argv[]) {
+  StructWithPtrs s, s2;
+  s.ptr1 = malloc(sizeof(int));
+  s.ptr2 = malloc(2 * sizeof(int));
+  s2.ptr1 = malloc(sizeof(int));
+  s2.ptr2 = malloc(2 * sizeof(int));
+
+  // Only s2.ptr2 is mapped by the enter data construct; the exit data
+  // construct further down names both s2.ptr1 and s2.ptr2.
+  // NOTE(review): this asymmetry is presumably the scenario from PR38704 -
+  // keep the three constructs exactly as they are.
+#pragma omp target enter data map(to: s2.ptr2[0:1])
+#pragma omp target map(s.ptr1[0:1], s.ptr2[0:2])
+  {
+    s.ptr1[0] = 1;
+    s.ptr2[0] = 2;
+    s.ptr2[1] = 3;
+  }
+#pragma omp target exit data map(from: s2.ptr1[0:1], s2.ptr2[0:1])
+
+  // CHECK: s.ptr1[0] = 1
+  // CHECK: s.ptr2[0] = 2
+  // CHECK: s.ptr2[1] = 3
+  printf("s.ptr1[0] = %d\n", s.ptr1[0]);
+  printf("s.ptr2[0] = %d\n", s.ptr2[0]);
+  printf("s.ptr2[1] = %d\n", s.ptr2[1]);
+
+  free(s.ptr1);
+  free(s.ptr2);
+  free(s2.ptr1);
+  free(s2.ptr2);
+
+  return 0;
+}
diff --git a/final/libomptarget/test/offloading/looptripcnt.c b/final/libomptarget/test/offloading/looptripcnt.c
new file mode 100644
index 0000000..025231b
--- /dev/null
+++ b/final/libomptarget/test/offloading/looptripcnt.c
@@ -0,0 +1,36 @@
+// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=DEBUG
+// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=DEBUG
+// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=DEBUG
+// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG
+// REQUIRES: libomptarget-debug
+
+/*
+  Test for looptripcount being popped from runtime stack.
+*/
+#include <stdio.h>
+#include <omp.h>
+// Runs two offloaded loops of N iterations, the first one with
+// thread_limit(4), and prints the thread and team counts recorded by
+// iteration 0. After each loop a FileCheck line expects the runtime's debug
+// output to report a trip count of 128.
+int main()
+{
+  int N = 128;
+  int NN = 1024;
+  // Sized NN, although only the first N elements are written below.
+  int num_teams[NN];
+  int num_threads[NN];
+
+  printf("#pragma omp target teams distribute parallel for thread_limit(4)\n");
+#pragma omp target teams distribute parallel for thread_limit(4)
+  for (int j = 0; j< N; j++) {
+    num_threads[j] = omp_get_num_threads();
+    num_teams[j] = omp_get_num_teams();
+  }
+  printf("num_threads %d num_teams %d\n", num_threads[0], num_teams[0]);
+// DEBUG: loop trip count is 128
+  printf("#pragma omp target teams distribute parallel for\n");
+#pragma omp target teams distribute parallel for
+  for (int j = 0; j< N; j++) {
+    num_threads[j] = omp_get_num_threads();
+    num_teams[j] = omp_get_num_teams();
+  }
+  printf("num_threads %d num_teams %d\n", num_threads[0], num_teams[0]);
+// DEBUG: loop trip count is 128
+  return 0;
+}
diff --git a/final/libomptarget/test/offloading/offloading_success.c b/final/libomptarget/test/offloading/offloading_success.c
new file mode 100644
index 0000000..12e78fa
--- /dev/null
+++ b/final/libomptarget/test/offloading/offloading_success.c
@@ -0,0 +1,23 @@
+// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu
+
+#include <stdio.h>
+#include <omp.h>
+
+/* Reports where the target region ran; the exit status is the value of
+   omp_is_initial_device() seen inside the region (0 when offloaded). */
+int main(void) {
+  /* Stays negative unless the target region writes a value back. */
+  int isHost = -1;
+
+#pragma omp target map(from: isHost)
+  { isHost = omp_is_initial_device(); }
+
+  if (isHost < 0)
+    printf("Runtime error, isHost=%d\n", isHost);
+
+  const char *where = isHost ? "host" : "device";
+  // CHECK: Target region executed on the device
+  printf("Target region executed on the %s\n", where);
+
+  return isHost;
+}
diff --git a/final/libomptarget/test/offloading/offloading_success.cpp b/final/libomptarget/test/offloading/offloading_success.cpp
new file mode 100644
index 0000000..eecd97a
--- /dev/null
+++ b/final/libomptarget/test/offloading/offloading_success.cpp
@@ -0,0 +1,23 @@
+// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu
+
+#include <stdio.h>
+#include <omp.h>
+
+// Reports where the target region ran; the exit status is the value of
+// omp_is_initial_device() seen inside the region (0 when offloaded).
+int main(void) {
+  // Start from a negative sentinel, as the C variant of this test does, so
+  // that the error branch below can fire when nothing is written back.
+  int isHost = -1;
+
+#pragma omp target map(from: isHost)
+  { isHost = omp_is_initial_device(); }
+
+  if (isHost < 0) {
+    printf("Runtime error, isHost=%d\n", isHost);
+  }
+
+  // CHECK: Target region executed on the device
+  printf("Target region executed on the %s\n", isHost ? "host" : "device");
+
+  return isHost;
+}
diff --git a/final/libomptarget/test/offloading/requires.c b/final/libomptarget/test/offloading/requires.c
new file mode 100644
index 0000000..7f014d3
--- /dev/null
+++ b/final/libomptarget/test/offloading/requires.c
@@ -0,0 +1,46 @@
+// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=DEBUG
+// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=DEBUG
+// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=DEBUG
+// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG
+// REQUIRES: libomptarget-debug
+
+/*
+  Test for the 'requires' clause check.
+  When a target region is used, the requires flags are set in the
+  runtime for the entire compilation unit. If the flags are set again,
+  (for whatever reason) the set must be consistent with previously
+  set values.
+*/
+#include <stdio.h>
+#include <omp.h>
+
+// ---------------------------------------------------------------------------
+// Various definitions copied from OpenMP RTL
+
+extern void __tgt_register_requires(int64_t);
+
+// End of definitions copied from OpenMP RTL.
+// ---------------------------------------------------------------------------
+
+// Registers the requires flags twice with the same value (1); the FileCheck
+// line at the end expects the runtime to report the flags as compatible.
+void run_reg_requires() {
+  // Before the target region is registered, the requires registers the status
+  // of the requires clauses. Since there are no requires clauses in this file
+  // the flags state can only be OMP_REQ_NONE i.e. 1.
+
+  // The second call repeats the flags of the first one, so it should print
+  // the debug info belonging to the check.
+  // NOTE(review): presumably a registration generated by the compiler has
+  // already happened before this function runs - confirm.
+  __tgt_register_requires(1);
+  __tgt_register_requires(1);
+  // DEBUG: New requires flags 1 compatible with existing 1!
+}
+
+// ---------------------------------------------------------------------------
+// Calls the explicit registration helper first and then executes an empty
+// target region.
+int main() {
+  run_reg_requires();
+
+// This also runs reg requires for the first time.
+// NOTE(review): run_reg_requires() above has already called
+// __tgt_register_requires() twice, so "first time" presumably refers to the
+// registration emitted by the compiler for this file - confirm.
+#pragma omp target
+  {}
+
+  return 0;
+}
\ No newline at end of file
diff --git a/final/libomptarget/test/offloading/target_depend_nowait.cpp b/final/libomptarget/test/offloading/target_depend_nowait.cpp
new file mode 100644
index 0000000..2c1c7e7
--- /dev/null
+++ b/final/libomptarget/test/offloading/target_depend_nowait.cpp
@@ -0,0 +1,62 @@
+// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu
+
+#include <omp.h>
+#include <stdio.h>
+
+#define N 1024
+
+int A[N];
+int B[N];
+int C[N];
+// Thread 1 of a two-thread parallel region issues a sequence of nowait
+// target constructs that all carry depend(out : A[0]). Each of the two
+// target regions increments every element of A and B once; the host then
+// sums both arrays and compares the result with the expected value.
+int main() {
+  // A[i] == B[i] == i before any target construct runs.
+  for (int i = 0; i < N; i++)
+    A[i] = B[i] = i;
+
+#pragma omp parallel num_threads(2)
+  {
+    if (omp_get_thread_num() == 1) {
+// map data A & B and move to
+#pragma omp target enter data map(to : A, B) depend(out : A[0]) nowait
+
+// no data move since already mapped
+#pragma omp target map(A, B) depend(out : A[0]) nowait
+      {
+        for (int i = 0; i < N; i++)
+          ++A[i];
+        for (int i = 0; i < N; i++)
+          ++B[i];
+      }
+
+// no data move since already mapped
+#pragma omp target teams num_teams(1) map(A, B) depend(out : A[0]) nowait
+      {
+        for (int i = 0; i < N; i++)
+          ++A[i];
+        for (int i = 0; i < N; i++)
+          ++B[i];
+      }
+
+// A updated via update
+#pragma omp target update from(A) depend(out : A[0]) nowait
+
+// B updated via exit, A just released
+#pragma omp target exit data map(release                                       \
+                                 : A) map(from                                 \
+                                          : B) depend(out                      \
+                                                      : A[0]) nowait
+    } // if
+  }   // parallel
+
+  // Both arrays were incremented twice, so A[i] == B[i] == i + 2 is expected.
+  int Sum = 0;
+  for (int i = 0; i < N; i++)
+    Sum += A[i] + B[i];
+  // Sum is 2 * N * (2 + N - 1 + 2) / 2
+  // CHECK: Sum = 1051648.
+  printf("Sum = %d.\n", Sum);
+
+  // Exit status 0 only when the sum matches the closed form above.
+  return Sum != 2 * N * (2 + N - 1 + 2) / 2;
+}
+
diff --git a/final/runtime/.clang-format b/final/runtime/.clang-format
new file mode 100644
index 0000000..590e1e2
--- /dev/null
+++ b/final/runtime/.clang-format
@@ -0,0 +1,5 @@
+---
+BasedOnStyle:  LLVM
+AlignTrailingComments: false
+SortIncludes:    false
+...
diff --git a/final/runtime/CMakeLists.txt b/final/runtime/CMakeLists.txt
new file mode 100644
index 0000000..8087b23
--- /dev/null
+++ b/final/runtime/CMakeLists.txt
@@ -0,0 +1,379 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
+  message(FATAL_ERROR "Direct configuration not supported, please use parent directory!")
+endif()
+
+# Put this directory's cmake/ folder first on the module path searched by include()
+set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})
+
+# Set libomp version (major and minor components)
+set(LIBOMP_VERSION_MAJOR 5)
+set(LIBOMP_VERSION_MINOR 0)
+
+# Helper modules; these include files are in the cmake/ subdirectory
+include(LibompUtils)
+include(LibompGetArchitecture)
+include(LibompHandleFlags)
+include(LibompDefinitions)
+
+# Determine the target architecture and store it in LIBOMP_ARCH
+if(${OPENMP_STANDALONE_BUILD})
+  # If adding a new architecture, take a look at cmake/LibompGetArchitecture.cmake
+  libomp_get_architecture(LIBOMP_DETECTED_ARCH)
+  set(LIBOMP_ARCH ${LIBOMP_DETECTED_ARCH} CACHE STRING
+    "The architecture to build for (x86_64/i386/arm/ppc64/ppc64le/aarch64/mic/mips/mips64).")
+  # Should assertions be enabled?  They are on by default.
+  set(LIBOMP_ENABLE_ASSERTIONS TRUE CACHE BOOL
+    "enable assertions?")
+else() # Part of LLVM build
+  # Native architecture from LLVM; "host" is resolved from the (quoted, possibly empty) host triple.
+  string(TOLOWER "${LLVM_TARGET_ARCH}" LIBOMP_NATIVE_ARCH)
+  if( LIBOMP_NATIVE_ARCH STREQUAL "host" )
+    string(REGEX MATCH "^[^-]*" LIBOMP_NATIVE_ARCH "${LLVM_HOST_TRIPLE}")
+  endif ()
+  if(LIBOMP_NATIVE_ARCH MATCHES "i[2-6]86")
+    set(LIBOMP_ARCH i386)
+  elseif(LIBOMP_NATIVE_ARCH STREQUAL "x86")
+    set(LIBOMP_ARCH i386)
+  elseif(LIBOMP_NATIVE_ARCH STREQUAL "amd64")
+    set(LIBOMP_ARCH x86_64)
+  elseif(LIBOMP_NATIVE_ARCH STREQUAL "x86_64")
+    set(LIBOMP_ARCH x86_64)
+  elseif(LIBOMP_NATIVE_ARCH MATCHES "powerpc64le")
+    set(LIBOMP_ARCH ppc64le)
+  elseif(LIBOMP_NATIVE_ARCH MATCHES "powerpc")
+    set(LIBOMP_ARCH ppc64)
+  elseif(LIBOMP_NATIVE_ARCH MATCHES "aarch64")
+    set(LIBOMP_ARCH aarch64)
+  elseif(LIBOMP_NATIVE_ARCH MATCHES "arm64")
+    set(LIBOMP_ARCH aarch64)
+  elseif(LIBOMP_NATIVE_ARCH MATCHES "arm")
+    set(LIBOMP_ARCH arm)
+  else()
+    # last ditch effort: fall back to compiler-based detection for any other name
+    libomp_get_architecture(LIBOMP_ARCH)
+  endif ()
+  set(LIBOMP_ENABLE_ASSERTIONS ${LLVM_ENABLE_ASSERTIONS})
+endif()
+libomp_check_variable(LIBOMP_ARCH 32e x86_64 32 i386 arm ppc64 ppc64le aarch64 mic mips mips64)
+
+set(LIBOMP_LIB_TYPE normal CACHE STRING
+  "Performance,Profiling,Stubs library (normal/profile/stubs)")
+libomp_check_variable(LIBOMP_LIB_TYPE normal profile stubs)
+# Set the OpenMP Year and Month associated with version
+set(LIBOMP_OMP_YEAR_MONTH 201611)
+set(LIBOMP_MIC_ARCH knc CACHE STRING
+  "Intel(R) Many Integrated Core Architecture (Intel(R) MIC Architecture) (knf/knc).  Ignored if not Intel(R) MIC Architecture build.")
+if("${LIBOMP_ARCH}" STREQUAL "mic")
+  libomp_check_variable(LIBOMP_MIC_ARCH knf knc)
+endif()
+set(LIBOMP_FORTRAN_MODULES FALSE CACHE BOOL
+  "Create Fortran module files? (requires fortran compiler)")
+
+# - Support for universal fat binary builds on Mac
+# - Having this extra variable allows people to build this library as a universal library
+#   without forcing a universal build of the llvm/clang compiler.
+set(LIBOMP_OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}" CACHE STRING
+  "For Mac builds, semicolon separated list of architectures to build for universal fat binary.")
+set(CMAKE_OSX_ARCHITECTURES ${LIBOMP_OSX_ARCHITECTURES})
+
+# Should @rpath be used for dynamic libraries on Mac?
+# The if(NOT DEFINED) is there to guard a cached value of the variable if one
+# exists so there is no interference with what the user wants.  Also, no cache entry
+# is created so there are no inadvertent effects on other parts of LLVM.
+if(NOT DEFINED CMAKE_MACOSX_RPATH)
+  set(CMAKE_MACOSX_RPATH TRUE)
+endif()
+
+# User specified flags.  These are appended to the configured flags.
+set(LIBOMP_CFLAGS "" CACHE STRING
+  "Appended user specified C compiler flags.")
+set(LIBOMP_CXXFLAGS "" CACHE STRING
+  "Appended user specified C++ compiler flags.")
+set(LIBOMP_CPPFLAGS "" CACHE STRING
+  "Appended user specified C preprocessor flags.")
+set(LIBOMP_ASMFLAGS "" CACHE STRING
+  "Appended user specified assembler flags.")
+set(LIBOMP_LDFLAGS "" CACHE STRING
+  "Appended user specified linker flags.")
+set(LIBOMP_LIBFLAGS "" CACHE STRING
+  "Appended user specified linked libs flags. (e.g., -lm)")
+set(LIBOMP_FFLAGS "" CACHE STRING
+  "Appended user specified Fortran compiler flags.  These are only used if LIBOMP_FORTRAN_MODULES==TRUE.")
+
+# Should the libomp library and generated headers be copied into the original source exports/ directory
+# Turning this to FALSE aids parallel builds to not interfere with each other.
+# Currently, the testsuite module expects the just built OpenMP library to be located inside the exports/
+# directory.  TODO: have testsuite run under llvm-lit directly.  We can then get rid of copying to exports/
+set(LIBOMP_COPY_EXPORTS TRUE CACHE BOOL
+  "Should exports be copied into source exports/ directory?")
+
+# HWLOC-support: on/off switch plus the prefix under which hwloc is installed
+set(LIBOMP_USE_HWLOC FALSE CACHE BOOL
+  "Use Hwloc (http://www.open-mpi.org/projects/hwloc/) library for affinity?")
+set(LIBOMP_HWLOC_INSTALL_DIR /usr/local CACHE PATH
+  "Install path for hwloc library")
+
+# Build number (read from kmp_version.cpp), then split at its last four digits
+libomp_get_build_number("${CMAKE_CURRENT_SOURCE_DIR}" LIBOMP_VERSION_BUILD)
+math(EXPR LIBOMP_VERSION_BUILD_YEAR "${LIBOMP_VERSION_BUILD}/10000")
+math(EXPR LIBOMP_VERSION_BUILD_MONTH_DAY "${LIBOMP_VERSION_BUILD}%10000")
+
+# No timestamp is recorded for the build at the moment
+set(LIBOMP_BUILD_DATE "No_Timestamp")
+
+# One boolean per architecture family; all start FALSE, LIBOMP_ARCH turns on the matching ones
+set(IA32 FALSE)
+set(INTEL64 FALSE)
+set(MIC FALSE)
+set(ARM FALSE)
+set(AARCH64 FALSE)
+set(PPC64 FALSE)
+set(PPC64BE FALSE)
+set(PPC64LE FALSE)
+set(MIPS FALSE)
+set(MIPS64 FALSE)
+if("${LIBOMP_ARCH}" STREQUAL "i386" OR "${LIBOMP_ARCH}" STREQUAL "32") # IA-32
+  set(IA32 TRUE)
+elseif("${LIBOMP_ARCH}" STREQUAL "x86_64" OR "${LIBOMP_ARCH}" STREQUAL "32e") # Intel(R) 64
+  set(INTEL64 TRUE)
+elseif("${LIBOMP_ARCH}" STREQUAL "arm") # ARM
+  set(ARM TRUE)
+elseif("${LIBOMP_ARCH}" STREQUAL "ppc64") # PPC64, big endian
+  set(PPC64 TRUE)
+  set(PPC64BE TRUE)
+elseif("${LIBOMP_ARCH}" STREQUAL "ppc64le") # PPC64, little endian
+  set(PPC64 TRUE)
+  set(PPC64LE TRUE)
+elseif("${LIBOMP_ARCH}" STREQUAL "aarch64") # AArch64
+  set(AARCH64 TRUE)
+elseif("${LIBOMP_ARCH}" STREQUAL "mic") # Intel(R) Many Integrated Core Architecture
+  set(MIC TRUE)
+elseif("${LIBOMP_ARCH}" STREQUAL "mips") # MIPS
+  set(MIPS TRUE)
+elseif("${LIBOMP_ARCH}" STREQUAL "mips64") # MIPS64
+  set(MIPS64 TRUE)
+endif()
+
+# Set at most one *_BUILD flag from CMAKE_BUILD_TYPE (compared case-insensitively); any other value sets none
+set(RELEASE_BUILD FALSE)
+set(DEBUG_BUILD FALSE)
+set(RELWITHDEBINFO_BUILD FALSE)
+set(MINSIZEREL_BUILD FALSE)
+string(TOLOWER "${CMAKE_BUILD_TYPE}" libomp_build_type_lowercase)
+if("${libomp_build_type_lowercase}" STREQUAL "release")
+  set(RELEASE_BUILD TRUE)
+elseif("${libomp_build_type_lowercase}" STREQUAL "debug")
+  set(DEBUG_BUILD TRUE)
+elseif("${libomp_build_type_lowercase}" STREQUAL "relwithdebinfo")
+  set(RELWITHDEBINFO_BUILD TRUE)
+elseif("${libomp_build_type_lowercase}" STREQUAL "minsizerel")
+  set(MINSIZEREL_BUILD TRUE)
+endif()
+
+# Include itt notify interface?  (may be forced off later for static libraries)
+set(LIBOMP_USE_ITT_NOTIFY TRUE CACHE BOOL
+  "Enable ITT notify?")
+
+# Translate LIBOMP_LIB_TYPE (normal, profile, stubs) into one boolean per library flavour.
+set(NORMAL_LIBRARY FALSE)
+set(STUBS_LIBRARY FALSE)
+set(PROFILE_LIBRARY FALSE)
+if("${LIBOMP_LIB_TYPE}" STREQUAL "normal")
+  set(NORMAL_LIBRARY TRUE)
+elseif("${LIBOMP_LIB_TYPE}" STREQUAL "profile")
+  set(PROFILE_LIBRARY TRUE)
+elseif("${LIBOMP_LIB_TYPE}" STREQUAL "stubs")
+  set(STUBS_LIBRARY TRUE)
+endif()
+
+# Directory names, all derived from this directory's source and binary locations
+set(LIBOMP_BASE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+set(LIBOMP_SRC_DIR ${LIBOMP_BASE_DIR}/src)
+set(LIBOMP_TOOLS_DIR ${LIBOMP_BASE_DIR}/tools)
+set(LIBOMP_INC_DIR ${LIBOMP_SRC_DIR}/include)
+set(LIBOMP_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
+
+# Enabling Fortran if it is needed (only when Fortran module files were requested)
+if(${LIBOMP_FORTRAN_MODULES})
+  enable_language(Fortran)
+endif()
+# Enable MASM Compiler if it is needed (Windows only)
+if(WIN32)
+  enable_language(ASM_MASM)
+endif()
+
+# Getting legal type/arch
+libomp_get_legal_type(LIBOMP_LEGAL_TYPE)
+libomp_get_legal_arch(LIBOMP_LEGAL_ARCH)
+
+# Compiler flag checks, library checks, threading check, etc.; the LIBOMP_HAVE_* tests below come after this
+include(config-ix)
+
+# Is there a quad precision data type available?  The option defaults to LIBOMP_HAVE_QUAD_PRECISION.
+# TODO: Make this a real feature check
+set(LIBOMP_USE_QUAD_PRECISION "${LIBOMP_HAVE_QUAD_PRECISION}" CACHE BOOL
+  "Should 128-bit precision entry points be built?")
+if(LIBOMP_USE_QUAD_PRECISION AND (NOT LIBOMP_HAVE_QUAD_PRECISION))
+  libomp_error_say("128-bit quad precision functionality requested but not available")
+endif()
+
+# libgomp drop-in compatibility requires versioned symbols
+set(LIBOMP_USE_VERSION_SYMBOLS "${LIBOMP_HAVE_VERSION_SYMBOLS}" CACHE BOOL
+  "Should version symbols be used? These provide binary compatibility with libgomp.")
+if(LIBOMP_USE_VERSION_SYMBOLS AND (NOT LIBOMP_HAVE_VERSION_SYMBOLS))
+  libomp_error_say("Version symbols functionality requested but not available")
+endif()
+
+# On multinode systems, larger alignment is desired to avoid false sharing
+set(LIBOMP_USE_INTERNODE_ALIGNMENT FALSE CACHE BOOL
+  "Should larger alignment (4096 bytes) be used for some locks and data structures?")
+
+# Build code that allows the OpenMP library to conveniently interface with debuggers
+set(LIBOMP_USE_DEBUGGER FALSE CACHE BOOL
+  "Enable debugger interface code?")
+
+# Should we link to C++ library?  (overridden to TRUE below when stats are enabled)
+set(LIBOMP_USE_STDCPPLIB FALSE CACHE BOOL
+  "Should we link to C++ library?")
+
+# Intel(R) Transactional Synchronization Extensions (Intel(R) TSX) based locks have
+# __asm code which can be troublesome for some compilers.  This feature is also x86 specific.
+# TODO: Make this a real feature check
+set(LIBOMP_USE_ADAPTIVE_LOCKS "${LIBOMP_HAVE_ADAPTIVE_LOCKS}" CACHE BOOL
+  "Should Intel(R) TSX lock be compiled (adaptive lock in kmp_lock.cpp).  These are x86 specific.")
+if(LIBOMP_USE_ADAPTIVE_LOCKS AND (NOT LIBOMP_HAVE_ADAPTIVE_LOCKS))
+  libomp_error_say("Adaptive locks (Intel(R) TSX) functionality is only supported on x86 Architecture")
+endif()
+
+# - stats-gathering enables OpenMP stats where things like the number of
+# parallel regions, clock ticks spent in particular openmp regions are recorded.
+set(LIBOMP_STATS FALSE CACHE BOOL
+  "Stats-Gathering functionality?")
+if(LIBOMP_STATS AND (NOT LIBOMP_HAVE_STATS))
+  libomp_error_say("Stats-gathering functionality requested but not available")
+endif()
+# The stats functionality requires the std c++ library; this sets a normal (non-cache) variable
+if(LIBOMP_STATS)
+  set(LIBOMP_USE_STDCPPLIB TRUE)
+endif()
+
+# Shared library can be switched to a static library
+set(LIBOMP_ENABLE_SHARED TRUE CACHE BOOL
+  "Shared library instead of static library?")
+
+if(WIN32 AND NOT LIBOMP_ENABLE_SHARED)
+  libomp_error_say("Static libraries requested but not available on Windows")
+endif()
+
+if(LIBOMP_USE_ITT_NOTIFY AND NOT LIBOMP_ENABLE_SHARED)
+  message(STATUS "ITT Notify not supported for static libraries - forcing ITT Notify off")
+  set(LIBOMP_USE_ITT_NOTIFY FALSE)
+endif()
+
+if(LIBOMP_USE_VERSION_SYMBOLS AND (NOT LIBOMP_ENABLE_SHARED) )
+  message(STATUS "Version symbols not supported for static libraries - forcing Version symbols functionality off")
+  set (LIBOMP_USE_VERSION_SYMBOLS FALSE)
+endif()
+
+# OMPT-support defaults to ON when the requirements in cmake/config-ix.cmake are
+# fulfilled (LIBOMP_HAVE_OMPT_SUPPORT) and the build is not for Windows; otherwise OFF.
+set(OMPT_DEFAULT FALSE)
+if ((LIBOMP_HAVE_OMPT_SUPPORT) AND (NOT WIN32))
+  set(OMPT_DEFAULT TRUE)
+endif()
+set(LIBOMP_OMPT_SUPPORT ${OMPT_DEFAULT} CACHE BOOL
+  "OMPT-support?")
+
+set(LIBOMP_OMPT_DEBUG FALSE CACHE BOOL
+  "Trace OMPT initialization?")
+set(LIBOMP_OMPT_OPTIONAL TRUE CACHE BOOL
+  "OMPT-optional?")
+if(LIBOMP_OMPT_SUPPORT AND (NOT LIBOMP_HAVE_OMPT_SUPPORT))
+  libomp_error_say("OpenMP Tools Interface requested but not available in this implementation")
+endif()
+
+# TSAN-support (off by default; an error is reported if requested but unavailable)
+set(LIBOMP_TSAN_SUPPORT FALSE CACHE BOOL
+  "TSAN-support?")
+if(LIBOMP_TSAN_SUPPORT AND (NOT LIBOMP_HAVE_TSAN_SUPPORT))
+  libomp_error_say("TSAN functionality requested but not available")
+endif()
+
+# Error check hwloc support after config-ix has run
+if(LIBOMP_USE_HWLOC AND (NOT LIBOMP_HAVE_HWLOC))
+  libomp_error_say("Hwloc requested but not available")
+endif()
+
+# Hierarchical scheduling support
+set(LIBOMP_USE_HIER_SCHED FALSE CACHE BOOL
+  "Hierarchical scheduling support?")
+
+# Setting final library name: libomp, libompprof or libompstubs unless LIBOMP_LIB_NAME is overridden
+set(LIBOMP_DEFAULT_LIB_NAME libomp)
+if(${PROFILE_LIBRARY})
+  set(LIBOMP_DEFAULT_LIB_NAME ${LIBOMP_DEFAULT_LIB_NAME}prof)
+endif()
+if(${STUBS_LIBRARY})
+  set(LIBOMP_DEFAULT_LIB_NAME ${LIBOMP_DEFAULT_LIB_NAME}stubs)
+endif()
+set(LIBOMP_LIB_NAME ${LIBOMP_DEFAULT_LIB_NAME} CACHE STRING "Base OMP library name")
+
+if(${LIBOMP_ENABLE_SHARED})
+  set(LIBOMP_LIBRARY_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX})
+  set(LIBOMP_LIBRARY_KIND SHARED)
+  set(LIBOMP_INSTALL_KIND LIBRARY)
+else()
+  set(LIBOMP_LIBRARY_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX})
+  set(LIBOMP_LIBRARY_KIND STATIC)
+  set(LIBOMP_INSTALL_KIND ARCHIVE)
+endif()
+
+set(LIBOMP_LIB_FILE ${LIBOMP_LIB_NAME}${LIBOMP_LIBRARY_SUFFIX})
+
+# Optional backwards compatibility aliases (installed by default).
+set(LIBOMP_INSTALL_ALIASES TRUE CACHE BOOL
+  "Install libgomp and libiomp5 library aliases for backwards compatibility")
+
+# Print configuration after all variables are set (standalone builds only).
+if(${OPENMP_STANDALONE_BUILD})
+  libomp_say("Operating System     -- ${CMAKE_SYSTEM_NAME}")
+  libomp_say("Target Architecture  -- ${LIBOMP_ARCH}")
+  if(${MIC})
+    libomp_say("Intel(R) MIC Architecture    -- ${LIBOMP_MIC_ARCH}")
+  endif()
+  libomp_say("Build Type           -- ${CMAKE_BUILD_TYPE}")
+  libomp_say("Library Kind         -- ${LIBOMP_LIBRARY_KIND}")
+  libomp_say("Library Type         -- ${LIBOMP_LIB_TYPE}")
+  libomp_say("Fortran Modules      -- ${LIBOMP_FORTRAN_MODULES}")
+  # will say development if all zeros; quoted so an empty build number cannot break the if()
+  if("${LIBOMP_VERSION_BUILD}" STREQUAL "00000000")
+    set(LIBOMP_BUILD Development)
+  else()
+    set(LIBOMP_BUILD ${LIBOMP_VERSION_BUILD})
+  endif()
+  libomp_say("Build                -- ${LIBOMP_BUILD}")
+  libomp_say("Use Stats-gathering  -- ${LIBOMP_STATS}")
+  libomp_say("Use Debugger-support -- ${LIBOMP_USE_DEBUGGER}")
+  libomp_say("Use ITT notify       -- ${LIBOMP_USE_ITT_NOTIFY}")
+  libomp_say("Use OMPT-support     -- ${LIBOMP_OMPT_SUPPORT}")
+  if(${LIBOMP_OMPT_SUPPORT})
+    libomp_say("Use OMPT-optional  -- ${LIBOMP_OMPT_OPTIONAL}")
+  endif()
+  libomp_say("Use Adaptive locks   -- ${LIBOMP_USE_ADAPTIVE_LOCKS}")
+  libomp_say("Use quad precision   -- ${LIBOMP_USE_QUAD_PRECISION}")
+  libomp_say("Use TSAN-support     -- ${LIBOMP_TSAN_SUPPORT}")
+  libomp_say("Use Hwloc library    -- ${LIBOMP_USE_HWLOC}")
+endif()
+
+add_subdirectory(src)
+add_subdirectory(test)
diff --git a/final/runtime/README.txt b/final/runtime/README.txt
new file mode 100644
index 0000000..ab19634
--- /dev/null
+++ b/final/runtime/README.txt
@@ -0,0 +1,116 @@
+
+               README for the LLVM* OpenMP* Runtime Library
+               ============================================
+
+How to Build Documentation
+==========================
+
+The main documentation is in Doxygen* format, and this distribution
+should come with pre-built PDF documentation in doc/Reference.pdf.
+However, an HTML version can be built by executing:
+
+% doxygen doc/doxygen/config
+
+in the runtime directory.
+
+That will produce HTML documentation in the doc/doxygen/generated
+directory, which can be accessed by pointing a web browser at the
+index.html file there.
+
+If you don't have Doxygen installed, you can download it from
+www.doxygen.org.
+
+
+How to Build the LLVM* OpenMP* Runtime Library
+==============================================
+In-tree build:
+
+$ cd where-you-want-to-live
+Check out openmp into llvm/projects
+$ cd where-you-want-to-build
+$ mkdir build && cd build
+$ cmake path/to/llvm -DCMAKE_C_COMPILER=<C compiler> -DCMAKE_CXX_COMPILER=<C++ compiler>
+$ make omp
+
+Out-of-tree build:
+
+$ cd where-you-want-to-live
+Check out openmp
+$ cd where-you-want-to-live/openmp/runtime
+$ mkdir build && cd build
+$ cmake path/to/openmp -DCMAKE_C_COMPILER=<C compiler> -DCMAKE_CXX_COMPILER=<C++ compiler>
+$ make
+
+For details about building, please look at README.rst in the parent directory.
+
+Architectures Supported
+=======================
+* IA-32 architecture
+* Intel(R) 64 architecture
+* Intel(R) Many Integrated Core Architecture
+* ARM* architecture
+* AArch64 (64-bit ARM) architecture
+* IBM(R) Power architecture (big endian)
+* IBM(R) Power architecture (little endian)
+* MIPS and MIPS64 architecture
+
+Supported RTL Build Configurations
+==================================
+
+Supported Architectures: IA-32 architecture, Intel(R) 64, and
+Intel(R) Many Integrated Core Architecture
+
+              ----------------------------------------------
+              |   icc/icl     |    gcc      |   clang      |
+--------------|---------------|-------------|--------------|
+| Linux* OS   |   Yes(1,5)    |  Yes(2,4)   | Yes(4,6,7)   |
+| FreeBSD*    |   No          |  No         | Yes(4,6,7,8) |
+| OS X*       |   Yes(1,3,4)  |  No         | Yes(4,6,7)   |
+| Windows* OS |   Yes(1,4)    |  No         | No           |
+------------------------------------------------------------
+
+(1) On IA-32 architecture and Intel(R) 64, icc/icl versions 12.x are
+    supported (12.1 is recommended).
+(2) GCC* version 4.7 is supported.
+(3) For icc on OS X*, OS X* version 10.5.8 is supported.
+(4) Intel(R) Many Integrated Core Architecture not supported.
+(5) On Intel(R) Many Integrated Core Architecture, icc/icl versions 13.0
+    or later are required.
+(6) Clang* version 3.3 is supported.
+(7) Clang* currently does not offer a software-implemented 128 bit extended
+    precision type.  Thus, all entry points reliant on this type are removed
+    from the library and cannot be called in the user program.  The following
+    functions are not available:
+    __kmpc_atomic_cmplx16_*
+    __kmpc_atomic_float16_*
+    __kmpc_atomic_*_fp
+(8) Community contribution provided AS IS, not tested by Intel.
+
+Supported Architectures: IBM(R) Power 7 and Power 8
+
+              -----------------------------
+              |   gcc      |   clang      |
+--------------|------------|--------------|
+| Linux* OS   |  Yes(1,2)  | Yes(3,4)     |
+-------------------------------------------
+
+(1) On Power 7, gcc version 4.8.2 is supported.
+(2) On Power 8, gcc version 4.8.2 is supported.
+(3) On Power 7, clang version 3.7 is supported.
+(4) On Power 8, clang version 3.7 is supported.
+
+
+Front-end Compilers that work with this RTL
+===========================================
+
+The following compilers are known to do compatible code generation for
+this RTL: clang (from the OpenMP development branch at
+http://clang-omp.github.io/ ), Intel compilers, GCC.  See the documentation
+for more details.
+
+-----------------------------------------------------------------------
+
+Notices
+=======
+
+*Other names and brands may be claimed as the property of others.
diff --git a/final/runtime/cmake/LibompCheckFortranFlag.cmake b/final/runtime/cmake/LibompCheckFortranFlag.cmake
new file mode 100644
index 0000000..21837ef
--- /dev/null
+++ b/final/runtime/cmake/LibompCheckFortranFlag.cmake
@@ -0,0 +1,72 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# libomp_check_fortran_flag(<flag> <boolean>)
+# Compiles a trivial Fortran program with <flag>; the variable named by <boolean>
+# is true if that succeeds. Nothing is done when <boolean> is already defined.
+function(libomp_check_fortran_flag flag boolean)
+  if(NOT DEFINED "${boolean}")
+    set(retval TRUE)
+    set(fortran_source
+"      program hello
+           print *, \"Hello World!\"
+      end program hello")
+    set(failed_regexes "[Ee]rror;[Uu]nknown;[Ss]kipping")
+    if(CMAKE_VERSION VERSION_GREATER 3.1 OR CMAKE_VERSION VERSION_EQUAL 3.1)
+      include(CheckFortranSourceCompiles)
+      set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${flag}") # the flag under test
+      check_fortran_source_compiles("${fortran_source}" ${boolean} FAIL_REGEX "${failed_regexes}")
+      set(${boolean} ${${boolean}} PARENT_SCOPE)
+      return()
+    else()
+      # Our manual check for cmake versions that don't have CheckFortranSourceCompiles
+      set(base_dir ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/fortran_flag_check)
+      file(MAKE_DIRECTORY ${base_dir})
+      file(WRITE ${base_dir}/fortran_source.f "${fortran_source}")
+
+      message(STATUS "Performing Test ${boolean}")
+      execute_process(
+        COMMAND ${CMAKE_Fortran_COMPILER} "${flag}" ${base_dir}/fortran_source.f
+        WORKING_DIRECTORY ${base_dir}
+        RESULT_VARIABLE exit_code
+        OUTPUT_VARIABLE OUTPUT
+        ERROR_VARIABLE OUTPUT
+      )
+      # exit_code may be an error string rather than a number; quoting keeps the if() well-formed
+      if("${exit_code}" EQUAL 0)
+        foreach(regex IN LISTS failed_regexes)
+          if("${OUTPUT}" MATCHES "${regex}")
+            set(retval FALSE)
+          endif()
+        endforeach()
+      else()
+        set(retval FALSE)
+      endif()
+      # Cache the result and append the compiler output to the matching CMake log
+      if(${retval})
+        set(${boolean} 1 CACHE INTERNAL "Test ${boolean}")
+        message(STATUS "Performing Test ${boolean} - Success")
+        file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log
+          "Performing Fortran Compiler Flag test ${boolean} succeeded with the following output:\n"
+          "${OUTPUT}\n"
+          "Source file was:\n${fortran_source}\n")
+      else()
+        set(${boolean} "" CACHE INTERNAL "Test ${boolean}")
+        message(STATUS "Performing Test ${boolean} - Failed")
+        file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log
+          "Performing Fortran Compiler Flag test ${boolean} failed with the following output:\n"
+          "${OUTPUT}\n"
+          "Source file was:\n${fortran_source}\n")
+      endif()
+    endif()
+
+    set(${boolean} ${retval} PARENT_SCOPE)
+  endif()
+endfunction()
diff --git a/final/runtime/cmake/LibompCheckLinkerFlag.cmake b/final/runtime/cmake/LibompCheckLinkerFlag.cmake
new file mode 100644
index 0000000..81ce9b0
--- /dev/null
+++ b/final/runtime/cmake/LibompCheckLinkerFlag.cmake
@@ -0,0 +1,67 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# libomp_check_linker_flag(<flag> <boolean>)
+# Builds a tiny shared library with <flag> in CMAKE_SHARED_LINKER_FLAGS via try_compile.
+# The variable named by <boolean> is set to TRUE if that works cleanly, FALSE otherwise.
+function(libomp_check_linker_flag flag boolean)
+  # An answer that is already defined is left untouched.
+  if(DEFINED "${boolean}")
+    return()
+  endif()
+
+  # Scratch project: one C source plus a CMakeLists.txt that links it with the flag.
+  set(probe_source "int foo(int a) { return a*a; }")
+  set(probe_project
+    "cmake_minimum_required(VERSION 2.8)
+     project(foo C)
+     set(CMAKE_SHARED_LINKER_FLAGS \"${flag}\")
+     add_library(foo SHARED src_to_link.c)")
+  set(reject_patterns "[Ee]rror;[Uu]nknown;[Ss]kipping;LINK : warning")
+  set(probe_dir ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/link_flag_check_${boolean})
+  file(MAKE_DIRECTORY ${probe_dir} ${probe_dir}/build)
+  file(WRITE ${probe_dir}/src_to_link.c "${probe_source}")
+  file(WRITE ${probe_dir}/CMakeLists.txt "${probe_project}")
+
+  message(STATUS "Performing Test ${boolean}")
+  try_compile(try_compile_result ${probe_dir}/build ${probe_dir} foo
+    OUTPUT_VARIABLE build_log)
+
+  # Success needs a passing build whose output matches none of the reject patterns.
+  set(flag_ok FALSE)
+  if(try_compile_result)
+    set(flag_ok TRUE)
+    foreach(pattern IN LISTS reject_patterns)
+      if("${build_log}" MATCHES ${pattern})
+        set(flag_ok FALSE)
+      endif()
+    endforeach()
+  endif()
+
+  # Cache the outcome and append the build output to the matching CMake log.
+  if(flag_ok)
+    set(${boolean} 1 CACHE INTERNAL "Test ${boolean}")
+    message(STATUS "Performing Test ${boolean} - Success")
+    file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log
+      "Performing C Linker Flag test ${boolean} succeeded with the following output:\n"
+      "${build_log}\n"
+      "Source file was:\n${probe_source}\n")
+  else()
+    set(${boolean} "" CACHE INTERNAL "Test ${boolean}")
+    message(STATUS "Performing Test ${boolean} - Failed")
+    file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log
+      "Performing C Linker Flag test ${boolean} failed with the following output:\n"
+      "${build_log}\n"
+      "Source file was:\n${probe_source}\n")
+  endif()
+
+  # Also hand TRUE/FALSE to the caller's scope.
+  set(${boolean} ${flag_ok} PARENT_SCOPE)
+endfunction()
diff --git a/final/runtime/cmake/LibompDefinitions.cmake b/final/runtime/cmake/LibompDefinitions.cmake
new file mode 100644
index 0000000..46beec7
--- /dev/null
+++ b/final/runtime/cmake/LibompDefinitions.cmake
@@ -0,0 +1,30 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+function(libomp_get_definitions_flags cppflags)
+  # Returns, in the variable named by <cppflags>, the platform -D flags followed by
+  # LIBOMP_CPPFLAGS and the CPPFLAGS environment variable (CMake ignores the latter).
+  set(definition_flags)
+  if(NOT WIN32)
+    libomp_append(definition_flags "-D _GNU_SOURCE")
+    libomp_append(definition_flags "-D _REENTRANT")
+  else()
+    libomp_append(definition_flags "-D _CRT_SECURE_NO_WARNINGS")
+    libomp_append(definition_flags "-D _CRT_SECURE_NO_DEPRECATE")
+    libomp_append(definition_flags "-D _WINDOWS")
+    libomp_append(definition_flags "-D _WINNT")
+    libomp_append(definition_flags "-D _WIN32_WINNT=0x0501")
+    libomp_append(definition_flags "-D _USRDLL")
+    # NOTE(review): unlike the calls above, these two pass "IF_TRUE <var>" -- confirm libomp_append supports it.
+    libomp_append(definition_flags "-D _ITERATOR_DEBUG_LEVEL=0" IF_TRUE DEBUG_BUILD)
+    libomp_append(definition_flags "-D _DEBUG" IF_TRUE DEBUG_BUILD)
+  endif()
+  set(${cppflags} ${definition_flags} ${LIBOMP_CPPFLAGS} $ENV{CPPFLAGS} PARENT_SCOPE)
+endfunction()
diff --git a/final/runtime/cmake/LibompExports.cmake b/final/runtime/cmake/LibompExports.cmake
new file mode 100644
index 0000000..f98de26
--- /dev/null
+++ b/final/runtime/cmake/LibompExports.cmake
@@ -0,0 +1,94 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# LibompExports.cmake
+#   Copy library and header files into the exports/ subdirectory after library build
+
+# Export directory suffix
+# - One piece is appended per non-default setting listed below
+# - Example suffix: .deb.s1
+#   which gives the export directory exports/lin_32e.deb.s1/lib
+# - That example stands for a Debug, Stats-Gathering version of the library
+set(libomp_suffix)
+libomp_append(libomp_suffix .deb DEBUG_BUILD)
+libomp_append(libomp_suffix .dia RELWITHDEBINFO_BUILD)
+libomp_append(libomp_suffix .min MINSIZEREL_BUILD)
+libomp_append(libomp_suffix .s1 LIBOMP_STATS)
+libomp_append(libomp_suffix .ompt LIBOMP_OMPT_SUPPORT)
+if(${LIBOMP_OMPT_SUPPORT})
+  libomp_append(libomp_suffix .optional LIBOMP_OMPT_OPTIONAL)
+endif()
+string(REPLACE ";" "" libomp_suffix "${libomp_suffix}")
+
+# Platform part of the export path: <os>_<arch>, where
+# - MIC builds use LIBOMP_MIC_ARCH, e.g., lin_knf, lin_knc
+# - IA-32 and Intel(R) 64 use 32 and 32e; others use LIBOMP_ARCH, e.g., lin_arm, lin_ppc64
+if(${MIC})
+  set(libomp_platform "${LIBOMP_PERL_SCRIPT_OS}_${LIBOMP_MIC_ARCH}")
+elseif(${IA32})
+  set(libomp_platform "${LIBOMP_PERL_SCRIPT_OS}_32")
+elseif(${INTEL64})
+  set(libomp_platform "${LIBOMP_PERL_SCRIPT_OS}_32e")
+else()
+  set(libomp_platform "${LIBOMP_PERL_SCRIPT_OS}_${LIBOMP_ARCH}")
+endif()
+set(LIBOMP_EXPORTS_DIR "${LIBOMP_BASE_DIR}/exports")
+set(LIBOMP_EXPORTS_CMN_DIR "${LIBOMP_EXPORTS_DIR}/common${libomp_suffix}/include")
+set(LIBOMP_EXPORTS_PLATFORM_DIR "${LIBOMP_EXPORTS_DIR}/${libomp_platform}${libomp_suffix}")
+set(LIBOMP_EXPORTS_INC_DIR "${LIBOMP_EXPORTS_PLATFORM_DIR}/include")
+set(LIBOMP_EXPORTS_MOD_DIR "${LIBOMP_EXPORTS_PLATFORM_DIR}/include_compat")
+set(LIBOMP_EXPORTS_LIB_DIR "${LIBOMP_EXPORTS_PLATFORM_DIR}/lib")
+
+# Put headers in exports/ directory post build (omp.h is a relative path, resolved where the command runs)
+add_custom_command(TARGET omp POST_BUILD
+  COMMAND ${CMAKE_COMMAND} -E make_directory ${LIBOMP_EXPORTS_CMN_DIR}
+  COMMAND ${CMAKE_COMMAND} -E copy omp.h ${LIBOMP_EXPORTS_CMN_DIR}
+)
+if(${LIBOMP_OMPT_SUPPORT})
+  add_custom_command(TARGET omp POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy omp-tools.h ${LIBOMP_EXPORTS_CMN_DIR}
+  )
+endif()
+if(${LIBOMP_FORTRAN_MODULES})
+  add_custom_command(TARGET libomp-mod POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${LIBOMP_EXPORTS_MOD_DIR}
+    COMMAND ${CMAKE_COMMAND} -E copy omp_lib.mod ${LIBOMP_EXPORTS_MOD_DIR}
+    COMMAND ${CMAKE_COMMAND} -E copy omp_lib_kinds.mod ${LIBOMP_EXPORTS_MOD_DIR}
+  )
+  add_custom_command(TARGET omp POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy omp_lib.h ${LIBOMP_EXPORTS_CMN_DIR}
+  )
+endif()
+
+# Copy OpenMP library into exports/ directory post build; falls back to the current binary dir if no output dir is set
+if(WIN32)
+  get_target_property(LIBOMP_OUTPUT_DIRECTORY omp RUNTIME_OUTPUT_DIRECTORY)
+else()
+  get_target_property(LIBOMP_OUTPUT_DIRECTORY omp LIBRARY_OUTPUT_DIRECTORY)
+endif()
+if(NOT LIBOMP_OUTPUT_DIRECTORY)
+  set(LIBOMP_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+endif()
+add_custom_command(TARGET omp POST_BUILD
+  COMMAND ${CMAKE_COMMAND} -E make_directory ${LIBOMP_EXPORTS_LIB_DIR}
+  COMMAND ${CMAKE_COMMAND} -E copy ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE} ${LIBOMP_EXPORTS_LIB_DIR}
+)
+
+# Copy Windows import library (target ompimp) into exports/ directory post build
+if(WIN32)
+  get_target_property(LIBOMPIMP_OUTPUT_DIRECTORY ompimp ARCHIVE_OUTPUT_DIRECTORY)
+  if(NOT LIBOMPIMP_OUTPUT_DIRECTORY)
+    set(LIBOMPIMP_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+  endif()
+  add_custom_command(TARGET ompimp POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${LIBOMP_EXPORTS_LIB_DIR}
+    COMMAND ${CMAKE_COMMAND} -E copy ${LIBOMPIMP_OUTPUT_DIRECTORY}/${LIBOMP_IMP_LIB_FILE} ${LIBOMP_EXPORTS_LIB_DIR}
+  )
+endif()
diff --git a/final/runtime/cmake/LibompGetArchitecture.cmake b/final/runtime/cmake/LibompGetArchitecture.cmake
new file mode 100644
index 0000000..e65cd30
--- /dev/null
+++ b/final/runtime/cmake/LibompGetArchitecture.cmake
@@ -0,0 +1,69 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# Determine the architecture from predefined compiler macros
+# The architecture name can only contain alphanumeric characters and underscores (i.e., C identifier)
+
+# void libomp_get_architecture(string* return_arch)
+# - Writes a small C probe whose preprocessor branches all end in an #error
+#   directive of the form "ARCHITECTURE=<name>", compiles it, and pulls <name>
+#   out of the compiler output.
+# - <name> (e.g., x86_64, i386, arm, aarch64, ppc64le, mips64) is stored in the
+#   caller's variable named by return_arch; it is empty when no ARCHITECTURE=
+#   line is found in the compiler output.
+function(libomp_get_architecture return_arch)
+  # Location of the temporary probe source inside the current binary directory
+  set(probe_file "${CMAKE_CURRENT_BINARY_DIR}/libomp_detect_arch.c")
+  set(probe_source "
+    #if defined(__KNC__)
+      #error ARCHITECTURE=mic
+    #elif defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
+      #error ARCHITECTURE=x86_64
+    #elif defined(__i386) || defined(__i386__) || defined(__IA32__) || defined(_M_I86) || defined(_M_IX86) || defined(__X86__) || defined(_X86_)
+      #error ARCHITECTURE=i386
+    #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7R__) ||  defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7M__)  || defined(__ARM_ARCH_7S__)
+      #error ARCHITECTURE=arm
+    #elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__)  || defined(__ARM_ARCH_6Z__)  || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6ZK__)
+      #error ARCHITECTURE=arm
+    #elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5E__)  || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__)
+      #error ARCHITECTURE=arm
+    #elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__)
+      #error ARCHITECTURE=arm
+    #elif defined(__ARM_ARCH_3__) || defined(__ARM_ARCH_3M__)
+      #error ARCHITECTURE=arm
+    #elif defined(__ARM_ARCH_2__)
+      #error ARCHITECTURE=arm
+    #elif defined(__arm__) || defined(_M_ARM) || defined(_ARM)
+      #error ARCHITECTURE=arm
+    #elif defined(__aarch64__)
+      #error ARCHITECTURE=aarch64
+    #elif defined(__powerpc64__) && defined(__LITTLE_ENDIAN__)
+      #error ARCHITECTURE=ppc64le
+    #elif defined(__powerpc64__)
+      #error ARCHITECTURE=ppc64
+    #elif defined(__mips__) && defined(__mips64)
+      #error ARCHITECTURE=mips64
+    #elif defined(__mips__) && !defined(__mips64)
+      #error ARCHITECTURE=mips
+    #else
+      #error ARCHITECTURE=UnknownArchitecture
+    #endif
+  ")
+  file(WRITE "${probe_file}" "${probe_source}")
+
+  # The probe can never compile (every branch is an #error); only the compiler
+  # diagnostics captured in ${compile_log} are of interest.
+  try_run(run_dummy compile_dummy "${CMAKE_CURRENT_BINARY_DIR}" "${probe_file}" COMPILE_OUTPUT_VARIABLE compile_log)
+
+  # Isolate the "ARCHITECTURE=<name>" token from the diagnostics ...
+  string(REGEX MATCH "ARCHITECTURE=([a-zA-Z0-9_]+)" arch_token "${compile_log}")
+
+  # ... and drop the "ARCHITECTURE=" prefix so only <name> is left
+  string(REPLACE "ARCHITECTURE=" "" arch_name "${arch_token}")
+
+  # Hand the detected name back to the caller
+  set(${return_arch} "${arch_name}" PARENT_SCOPE)
+
+  # Clean up the temporary probe source
+  file(REMOVE "${probe_file}")
+endfunction()
diff --git a/final/runtime/cmake/LibompHandleFlags.cmake b/final/runtime/cmake/LibompHandleFlags.cmake
new file mode 100644
index 0000000..030e6f0
--- /dev/null
+++ b/final/runtime/cmake/LibompHandleFlags.cmake
@@ -0,0 +1,205 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# Setup the flags correctly for cmake (convert to string)
+# Pretty them up (STRIP any beginning and trailing whitespace,
+# remove duplicates, remove empty entries)
+# - <flags> is the NAME of a list variable; it is rewritten in place as a
+#   single space-separated string.
+# - This is a macro, so the temporary flags_local it uses is set in the
+#   scope of whoever invokes it.
+macro(libomp_setup_flags flags)
+  if(NOT "${${flags}}" STREQUAL "") # if flags are empty, don't do anything
+    set(flags_local)
+    list(REMOVE_DUPLICATES ${flags}) # remove duplicates
+    list(REMOVE_ITEM ${flags} "") # remove empty items
+    libomp_list_to_string("${${flags}}" flags_local) # join the list with spaces
+    string(STRIP "${flags_local}" flags_local)
+    set(${flags} "${flags_local}")
+  endif()
+endmacro()
+
+# Collects the compiler flags shared by the C and C++ compilations.
+# - <flags> names the caller's variable that receives the resulting list.
+# - Every flag is added through libomp_append(), i.e. only when the variable
+#   given as its third argument evaluates to true.
+function(libomp_get_c_and_cxxflags_common flags)
+  set(common_flags)
+  libomp_append(common_flags -fno-exceptions LIBOMP_HAVE_FNO_EXCEPTIONS_FLAG)
+  libomp_append(common_flags -fno-rtti LIBOMP_HAVE_FNO_RTTI_FLAG)
+  # Warning flags that are only added for standalone builds
+  if(${OPENMP_STANDALONE_BUILD})
+    libomp_append(common_flags -Wsign-compare LIBOMP_HAVE_WNO_SIGN_COMPARE_FLAG)
+    libomp_append(common_flags -Wunused-function LIBOMP_HAVE_WNO_UNUSED_FUNCTION_FLAG)
+    libomp_append(common_flags -Wunused-local-typedef LIBOMP_HAVE_WNO_UNUSED_LOCAL_TYPEDEF_FLAG)
+    libomp_append(common_flags -Wunused-value LIBOMP_HAVE_WNO_UNUSED_VALUE_FLAG)
+    libomp_append(common_flags -Wunused-variable LIBOMP_HAVE_WNO_UNUSED_VARIABLE_FLAG)
+    libomp_append(common_flags -Wdeprecated-register LIBOMP_HAVE_WNO_DEPRECATED_REGISTER_FLAG)
+    libomp_append(common_flags -Wunknown-pragmas LIBOMP_HAVE_WNO_UNKNOWN_PRAGMAS_FLAG)
+    libomp_append(common_flags -Wcomment LIBOMP_HAVE_WNO_COMMENT_FLAG)
+    libomp_append(common_flags -Wself-assign LIBOMP_HAVE_WNO_SELF_ASSIGN_FLAG)
+    libomp_append(common_flags -Wformat-pedantic LIBOMP_HAVE_WNO_FORMAT_PEDANTIC_FLAG)
+  endif()
+  # Warning suppressions added for every kind of build
+  libomp_append(common_flags -Wno-switch LIBOMP_HAVE_WNO_SWITCH_FLAG)
+  libomp_append(common_flags -Wno-covered-switch-default LIBOMP_HAVE_WNO_COVERED_SWITCH_DEFAULT_FLAG)
+  libomp_append(common_flags -Wno-gnu-anonymous-struct LIBOMP_HAVE_WNO_GNU_ANONYMOUS_STRUCT_FLAG)
+  libomp_append(common_flags -Wno-missing-field-initializers LIBOMP_HAVE_WNO_MISSING_FIELD_INITIALIZERS_FLAG)
+  libomp_append(common_flags -Wno-missing-braces LIBOMP_HAVE_WNO_MISSING_BRACES_FLAG)
+  libomp_append(common_flags -Wno-vla-extension LIBOMP_HAVE_WNO_VLA_EXTENSION_FLAG)
+  libomp_append(common_flags -Wstringop-overflow=0 LIBOMP_HAVE_WSTRINGOP_OVERFLOW_FLAG)
+  libomp_append(common_flags /GS LIBOMP_HAVE_GS_FLAG)
+  libomp_append(common_flags /EHsc LIBOMP_HAVE_EHSC_FLAG)
+  libomp_append(common_flags /Oy- LIBOMP_HAVE_OY__FLAG)
+  libomp_append(common_flags -mrtm LIBOMP_HAVE_MRTM_FLAG)
+  # Intel(R) C Compiler flags
+  libomp_append(common_flags /Qsafeseh LIBOMP_HAVE_QSAFESEH_FLAG)
+  libomp_append(common_flags -Qoption,cpp,--extended_float_types LIBOMP_HAVE_EXTENDED_FLOAT_TYPES_FLAG)
+  libomp_append(common_flags -Qlong_double LIBOMP_HAVE_LONG_DOUBLE_FLAG)
+  libomp_append(common_flags -Qdiag-disable:177 LIBOMP_HAVE_DIAG_DISABLE_177_FLAG)
+  if(${RELEASE_BUILD} OR ${RELWITHDEBINFO_BUILD})
+    libomp_append(common_flags -Qinline-min-size=1 LIBOMP_HAVE_INLINE_MIN_SIZE_FLAG)
+  endif()
+  # Flags that depend on the target architecture
+  if(${IA32})
+    # -m32 is only considered when pointers are 8 bytes wide
+    if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+      libomp_append(common_flags -m32 LIBOMP_HAVE_M32_FLAG)
+    endif()
+    libomp_append(common_flags /arch:SSE2 LIBOMP_HAVE_ARCH_SSE2_FLAG)
+    libomp_append(common_flags -msse2 LIBOMP_HAVE_MSSE2_FLAG)
+    libomp_append(common_flags -falign-stack=maintain-16-byte LIBOMP_HAVE_FALIGN_STACK_FLAG)
+  elseif(${MIC})
+    libomp_append(common_flags -mmic LIBOMP_HAVE_MMIC_FLAG)
+    libomp_append(common_flags -ftls-model=initial-exec LIBOMP_HAVE_FTLS_MODEL_FLAG)
+    libomp_append(common_flags "-opt-streaming-stores never" LIBOMP_HAVE_OPT_STREAMING_STORES_FLAG)
+  endif()
+  set(${flags} ${common_flags} PARENT_SCOPE)
+endfunction()
+
+# Builds the C compiler flag string.
+# - <cflags> names the caller's variable that receives the result.
+# - Starts from the flags shared with C++, adds the C-only flags, appends the
+#   user supplied LIBOMP_CFLAGS and normalizes everything with
+#   libomp_setup_flags().
+function(libomp_get_cflags cflags)
+  set(c_flags)
+  libomp_get_c_and_cxxflags_common(c_flags)
+  # C-only additions
+  libomp_append(c_flags /TP LIBOMP_HAVE_TP_FLAG)
+  libomp_append(c_flags "-x c++" LIBOMP_HAVE_X_CPP_FLAG)
+  # User supplied flags go last
+  set(c_flags ${c_flags} ${LIBOMP_CFLAGS})
+  libomp_setup_flags(c_flags)
+  set(${cflags} ${c_flags} PARENT_SCOPE)
+endfunction()
+
+# Builds the C++ compiler flag string.
+# - <cxxflags> names the caller's variable that receives the result.
+# - Starts from the flags shared with C, adds -Wcast-qual for standalone
+#   builds when available, appends the user supplied LIBOMP_CXXFLAGS and
+#   normalizes everything with libomp_setup_flags().
+function(libomp_get_cxxflags cxxflags)
+  set(cxx_flags)
+  libomp_get_c_and_cxxflags_common(cxx_flags)
+  if(${OPENMP_STANDALONE_BUILD})
+    libomp_append(cxx_flags -Wcast-qual LIBOMP_HAVE_WCAST_QUAL_FLAG)
+  endif()
+  # User supplied flags go last
+  set(cxx_flags ${cxx_flags} ${LIBOMP_CXXFLAGS})
+  libomp_setup_flags(cxx_flags)
+  set(${cxxflags} ${cxx_flags} PARENT_SCOPE)
+endfunction()
+
+# Builds the assembler flag string.
+# - <asmflags> names the caller's variable that receives the result.
+# - Architecture specific flags are added for IA32 or MIC, then the user
+#   supplied LIBOMP_ASMFLAGS are appended and the list is normalized with
+#   libomp_setup_flags().
+function(libomp_get_asmflags asmflags)
+  set(asm_flags)
+  libomp_append(asm_flags "-x assembler-with-cpp" LIBOMP_HAVE_X_ASSEMBLER_WITH_CPP_FLAG)
+  # Flags that depend on the target architecture
+  if(${IA32})
+    # -m32 is only considered when pointers are 8 bytes wide
+    if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+      libomp_append(asm_flags -m32 LIBOMP_HAVE_M32_FLAG)
+    endif()
+    libomp_append(asm_flags /safeseh LIBOMP_HAVE_SAFESEH_MASM_FLAG)
+    libomp_append(asm_flags /coff LIBOMP_HAVE_COFF_MASM_FLAG)
+  elseif(${MIC})
+    libomp_append(asm_flags -mmic LIBOMP_HAVE_MMIC_FLAG)
+  endif()
+  # User supplied flags go last
+  set(asm_flags ${asm_flags} ${LIBOMP_ASMFLAGS})
+  libomp_setup_flags(asm_flags)
+  set(${asmflags} ${asm_flags} PARENT_SCOPE)
+endfunction()
+
+# Builds the linker flag string.
+# - <ldflags> names the caller's variable that receives the result.
+# - The first three entries are only added when the corresponding CMAKE_*
+#   flag variable is defined; the rest are gated on configure check results.
+# - The user supplied LIBOMP_LDFLAGS are appended last and the list is
+#   normalized with libomp_setup_flags().
+function(libomp_get_ldflags ldflags)
+  set(link_flags)
+  libomp_append(link_flags "${CMAKE_LINK_DEF_FILE_FLAG}${CMAKE_CURRENT_BINARY_DIR}/${LIBOMP_LIB_NAME}.def"
+    IF_DEFINED CMAKE_LINK_DEF_FILE_FLAG)
+  libomp_append(link_flags "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}${LIBOMP_VERSION_MAJOR}.${LIBOMP_VERSION_MINOR}"
+    IF_DEFINED CMAKE_C_OSX_CURRENT_VERSION_FLAG)
+  libomp_append(link_flags "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}${LIBOMP_VERSION_MAJOR}.${LIBOMP_VERSION_MINOR}"
+    IF_DEFINED CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG)
+  libomp_append(link_flags -Wl,--warn-shared-textrel LIBOMP_HAVE_WARN_SHARED_TEXTREL_FLAG)
+  libomp_append(link_flags -Wl,--as-needed LIBOMP_HAVE_AS_NEEDED_FLAG)
+  libomp_append(link_flags "-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt" LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
+  libomp_append(link_flags -static-libgcc LIBOMP_HAVE_STATIC_LIBGCC_FLAG)
+  libomp_append(link_flags -Wl,-z,noexecstack LIBOMP_HAVE_Z_NOEXECSTACK_FLAG)
+  libomp_append(link_flags -Wl,-fini=__kmp_internal_end_fini LIBOMP_HAVE_FINI_FLAG)
+  libomp_append(link_flags -no-intel-extensions LIBOMP_HAVE_NO_INTEL_EXTENSIONS_FLAG)
+  libomp_append(link_flags -static-intel LIBOMP_HAVE_STATIC_INTEL_FLAG)
+  libomp_append(link_flags /SAFESEH LIBOMP_HAVE_SAFESEH_FLAG)
+  # Flags that depend on the target architecture
+  if(${IA32})
+    # -m32 is only considered when pointers are 8 bytes wide
+    if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+      libomp_append(link_flags -m32 LIBOMP_HAVE_M32_FLAG)
+    endif()
+    libomp_append(link_flags -msse2 LIBOMP_HAVE_MSSE2_FLAG)
+  elseif(${MIC})
+    libomp_append(link_flags -mmic LIBOMP_HAVE_MMIC_FLAG)
+    libomp_append(link_flags -Wl,-x LIBOMP_HAVE_X_FLAG)
+  endif()
+  # User supplied flags go last
+  set(link_flags ${link_flags} ${LIBOMP_LDFLAGS})
+  libomp_setup_flags(link_flags)
+  set(${ldflags} ${link_flags} PARENT_SCOPE)
+endfunction()
+
+# Library flags
+# - <libflags> names the caller's variable that receives the result.
+# - Collects the libraries to link against: the thread library, hwloc (when
+#   LIBOMP_USE_HWLOC is true), -lirc_pic on IA32 when available, and -lm on
+#   the BSD family; the user supplied LIBOMP_LIBFLAGS are appended last and
+#   the list is normalized with libomp_setup_flags().
+function(libomp_get_libflags libflags)
+  set(libflags_local)
+  libomp_append(libflags_local "${CMAKE_THREAD_LIBS_INIT}")
+  libomp_append(libflags_local "${LIBOMP_HWLOC_LIBRARY}" LIBOMP_USE_HWLOC)
+  if(${IA32})
+    libomp_append(libflags_local -lirc_pic LIBOMP_HAVE_IRC_PIC_LIBRARY)
+  endif()
+  # Test the variable by name (as the test-deps logic does) instead of
+  # expanding it: an expanded value is a bare if() argument that could be
+  # dereferenced again, and an empty value would leave if() malformed.
+  if(CMAKE_SYSTEM_NAME MATCHES "DragonFly")
+    # -lm is wrapped in --no-as-needed/--as-needed when the linker supports it
+    libomp_append(libflags_local "-Wl,--no-as-needed" LIBOMP_HAVE_AS_NEEDED_FLAG)
+    libomp_append(libflags_local "-lm")
+    libomp_append(libflags_local "-Wl,--as-needed" LIBOMP_HAVE_AS_NEEDED_FLAG)
+  elseif(CMAKE_SYSTEM_NAME MATCHES "(Free|Net)BSD")
+    libomp_append(libflags_local -lm)
+  endif()
+  set(libflags_local ${libflags_local} ${LIBOMP_LIBFLAGS})
+  libomp_setup_flags(libflags_local)
+  set(${libflags} ${libflags_local} PARENT_SCOPE)
+endfunction()
+
+# Builds the Fortran compiler flag string.
+# - <fflags> names the caller's variable that receives the result.
+# - Adds -m32 on IA32 when the Fortran compiler accepts it, appends the user
+#   supplied LIBOMP_FFLAGS and normalizes the list with libomp_setup_flags().
+function(libomp_get_fflags fflags)
+  set(fortran_flags)
+  if(${IA32})
+    libomp_append(fortran_flags -m32 LIBOMP_HAVE_M32_FORTRAN_FLAG)
+  endif()
+  # User supplied flags go last
+  set(fortran_flags ${fortran_flags} ${LIBOMP_FFLAGS})
+  libomp_setup_flags(fortran_flags)
+  set(${fflags} ${fortran_flags} PARENT_SCOPE)
+endfunction()
+
+# Perl generate-defs.pl flags (For Windows only)
+# - <gdflags> names the caller's variable that receives the list of
+#   "-D <name>" definitions.
+function(libomp_get_gdflags gdflags)
+  # Pick the architecture definition: arch_32 for IA32, arch_32e for INTEL64,
+  # arch_${LIBOMP_ARCH} for everything else
+  if(${IA32})
+    set(arch_define arch_32)
+  elseif(${INTEL64})
+    set(arch_define arch_32e)
+  else()
+    set(arch_define arch_${LIBOMP_ARCH})
+  endif()
+  # Definitions that are always present
+  set(defs_local)
+  list(APPEND defs_local "-D ${arch_define}")
+  list(APPEND defs_local "-D msvc_compat")
+  # Definitions that depend on the library type and enabled features
+  libomp_append(defs_local "-D norm" NORMAL_LIBRARY)
+  libomp_append(defs_local "-D prof" PROFILE_LIBRARY)
+  libomp_append(defs_local "-D stub" STUBS_LIBRARY)
+  libomp_append(defs_local "-D HAVE_QUAD" LIBOMP_USE_QUAD_PRECISION)
+  libomp_append(defs_local "-D USE_DEBUGGER" LIBOMP_USE_DEBUGGER)
+  if(${DEBUG_BUILD} OR ${RELWITHDEBINFO_BUILD})
+    list(APPEND defs_local "-D KMP_DEBUG")
+  endif()
+  set(${gdflags} ${defs_local} PARENT_SCOPE)
+endfunction()
diff --git a/final/runtime/cmake/LibompMicroTests.cmake b/final/runtime/cmake/LibompMicroTests.cmake
new file mode 100644
index 0000000..2fde724
--- /dev/null
+++ b/final/runtime/cmake/LibompMicroTests.cmake
@@ -0,0 +1,230 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# The following micro-tests are small tests to perform on the library just created.
+# There are currently five micro-tests:
+# (1) test-touch
+#  - Compile and run a small program using newly created libomp library
+#  - Fails if test-touch.c does not compile or if test-touch.c does not run after compilation
+#  - Program dependencies: gcc or g++, grep, bourne shell
+#  - Available for all Unix,Mac,Windows builds.  Not available on Intel(R) MIC Architecture builds.
+# (2) test-relo
+#  - Tests dynamic libraries for position-dependent code (can not have any position dependent code)
+#  - Fails if TEXTREL is in output of readelf -d libomp.so command
+#  - Program dependencies: readelf, grep, bourne shell
+#  - Available for Unix, Intel(R) MIC Architecture dynamic library builds. Not available otherwise.
+# (3) test-execstack
+#  - Tests if stack is executable
+#  - Fails if stack is executable. Should only be readable and writable. Not executable.
+#  - Program dependencies: perl, readelf
+#  - Available for Unix dynamic library builds. Not available otherwise.
+# (4) test-instr (Intel(R) MIC Architecture only)
+#  - Tests Intel(R) MIC Architecture libraries for valid instruction set
+#  - Fails if finds invalid instruction for Intel(R) MIC Architecture (wasn't compiled with correct flags)
+#  - Program dependencies: perl, objdump
+#  - Available for Intel(R) MIC Architecture and i386 builds. Not available otherwise.
+# (5) test-deps
+#  - Tests newly created libomp for library dependencies
+#  - Fails if sees a dependence not listed in td_exp variable below
+#  - Program dependencies: perl, (unix)readelf, (mac)otool[64], (windows)link.exe
+#  - Available for Unix,Mac,Windows, Intel(R) MIC Architecture dynamic builds and Windows
+#    static builds. Not available otherwise.
+
+# get library location
+# - LIBOMP_OUTPUT_DIRECTORY: directory of the built library, read from the omp
+#   target (RUNTIME_OUTPUT_DIRECTORY on Windows, LIBRARY_OUTPUT_DIRECTORY
+#   elsewhere)
+# - LIBOMPIMP_OUTPUT_DIRECTORY (Windows only): directory of the import library,
+#   read from the ompimp target's ARCHIVE_OUTPUT_DIRECTORY
+# - either one falls back to the current binary directory when the property
+#   lookup yields a false value
+if(WIN32)
+  get_target_property(LIBOMP_OUTPUT_DIRECTORY omp RUNTIME_OUTPUT_DIRECTORY)
+  get_target_property(LIBOMPIMP_OUTPUT_DIRECTORY ompimp ARCHIVE_OUTPUT_DIRECTORY)
+  if(NOT LIBOMPIMP_OUTPUT_DIRECTORY)
+    set(LIBOMPIMP_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+  endif()
+else()
+  get_target_property(LIBOMP_OUTPUT_DIRECTORY omp LIBRARY_OUTPUT_DIRECTORY)
+endif()
+if(NOT LIBOMP_OUTPUT_DIRECTORY)
+  set(LIBOMP_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+endif()
+
+# test-touch
+# This section prepares the variables consumed by libomp_test_touch_recipe():
+#   libomp_test_touch_targets  - .success stamps the libomp-test-touch target
+#                                depends on (left unset when no sh is found)
+#   libomp_test_touch_compiler - compiler used to build test-touch.c
+#   libomp_test_touch_cflags / _ldflags / _libs - its compile/link arguments
+#   libomp_test_touch_env      - environment assignments placed in front of
+#                                the test executable (non-Windows only here)
+find_program(LIBOMP_SHELL sh)
+if(WIN32)
+  if(LIBOMP_SHELL)
+    # Two variants are built on Windows: test-touch-md and test-touch-mt
+    set(libomp_test_touch_targets test-touch-md/.success test-touch-mt/.success)
+  endif()
+  # pick test-touch compiler
+  set(libomp_test_touch_compiler ${CMAKE_C_COMPILER})
+  # test-touch compilation flags
+  libomp_append(libomp_test_touch_cflags /nologo)
+  # link against the import library
+  libomp_append(libomp_test_touch_libs ${LIBOMPIMP_OUTPUT_DIRECTORY}/${LIBOMP_IMP_LIB_FILE})
+  if(${IA32})
+    libomp_append(libomp_test_touch_ldflags /safeseh)
+  endif()
+else() # (Unix based systems, Intel(R) MIC Architecture, and Mac)
+  if(LIBOMP_SHELL)
+    set(libomp_test_touch_targets test-touch-rt/.success)
+  endif()
+  # pick test-touch compiler
+  if(${LIBOMP_USE_STDCPPLIB})
+    set(libomp_test_touch_compiler ${CMAKE_CXX_COMPILER})
+  else()
+    set(libomp_test_touch_compiler ${CMAKE_C_COMPILER})
+  endif()
+  # test-touch compilation flags
+  libomp_append(libomp_test_touch_libs "${CMAKE_THREAD_LIBS_INIT}")
+  if(${IA32})
+    libomp_append(libomp_test_touch_cflags -m32 LIBOMP_HAVE_M32_FLAG)
+  endif()
+  # link directly against the freshly built library file
+  libomp_append(libomp_test_touch_libs ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE})
+  libomp_append(libomp_test_touch_libs "${LIBOMP_HWLOC_LIBRARY}" LIBOMP_USE_HWLOC)
+  # Put the current directory and the library output directory on the dynamic
+  # loader search path. Note that $ENV{...} is expanded when CMake configures,
+  # so the value captured here is the configure-time environment.
+  if(APPLE)
+    set(libomp_test_touch_env "DYLD_LIBRARY_PATH=.:${LIBOMP_OUTPUT_DIRECTORY}:$ENV{DYLD_LIBRARY_PATH}")
+    libomp_append(libomp_test_touch_ldflags "-Wl,-rpath,${LIBOMP_HWLOC_LIBRARY_DIR}" LIBOMP_USE_HWLOC)
+  else()
+    set(libomp_test_touch_env "LD_LIBRARY_PATH=.:${LIBOMP_OUTPUT_DIRECTORY}:$ENV{LD_LIBRARY_PATH}")
+    libomp_append(libomp_test_touch_ldflags "-Wl,-rpath=${LIBOMP_HWLOC_LIBRARY_DIR}" LIBOMP_USE_HWLOC)
+  endif()
+endif()
+# libomp_test_touch_recipe(<test_touch_dir>)
+# - Defines the custom command that produces <test_touch_dir>/.success:
+#   it (re)creates the directory, compiles test-touch.c against the built
+#   library, runs the resulting executable through ${LIBOMP_SHELL} with
+#   ${libomp_test_touch_env} in front of it, and finally touches the stamp.
+# - On Windows the C runtime flag is derived from the directory name:
+#   "test-touch-mt" selects /MT (or /MTd), anything else /MD (or /MDd).
+# - This is a macro, so every variable it sets is visible to the caller.
+macro(libomp_test_touch_recipe test_touch_dir)
+  set(libomp_test_touch_dependencies ${LIBOMP_SRC_DIR}/test-touch.c omp)
+  set(libomp_test_touch_exe ${test_touch_dir}/test-touch${CMAKE_EXECUTABLE_SUFFIX})
+  set(libomp_test_touch_obj ${test_touch_dir}/test-touch${CMAKE_C_OUTPUT_EXTENSION})
+  # Work on a per-recipe copy of the shared compile flags. Appending the C
+  # runtime flag to libomp_test_touch_cflags itself would carry it over into
+  # the next recipe, which would then receive both /MT and /MD.
+  set(libomp_test_touch_recipe_cflags ${libomp_test_touch_cflags})
+  if(WIN32)
+    if(${RELEASE_BUILD} OR ${RELWITHDEBINFO_BUILD})
+      if(${test_touch_dir} MATCHES "test-touch-mt")
+        libomp_append(libomp_test_touch_recipe_cflags /MT)
+      else()
+        libomp_append(libomp_test_touch_recipe_cflags /MD)
+      endif()
+    else()
+      if(${test_touch_dir} MATCHES "test-touch-mt")
+        libomp_append(libomp_test_touch_recipe_cflags /MTd)
+      else()
+        libomp_append(libomp_test_touch_recipe_cflags /MDd)
+      endif()
+    endif()
+    set(libomp_test_touch_out_flags -Fe${libomp_test_touch_exe} -Fo${libomp_test_touch_obj})
+    # the Windows build links against the import library, so depend on it too
+    list(APPEND libomp_test_touch_dependencies ompimp)
+  else()
+    set(libomp_test_touch_out_flags -o ${libomp_test_touch_exe})
+  endif()
+  add_custom_command(
+    OUTPUT  ${test_touch_dir}/.success ${libomp_test_touch_exe} ${libomp_test_touch_obj}
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/${test_touch_dir}
+    COMMAND ${CMAKE_COMMAND} -E remove -f ${test_touch_dir}/*
+    COMMAND ${libomp_test_touch_compiler} ${libomp_test_touch_out_flags} ${libomp_test_touch_recipe_cflags}
+      ${LIBOMP_SRC_DIR}/test-touch.c ${libomp_test_touch_ldflags} ${libomp_test_touch_libs}
+    COMMAND ${LIBOMP_SHELL} -c \"${libomp_test_touch_env} ${libomp_test_touch_exe}\"
+    COMMAND ${CMAKE_COMMAND} -E touch ${test_touch_dir}/.success
+    DEPENDS ${libomp_test_touch_dependencies}
+  )
+endmacro()
+# Run the test executable with KMP_VERSION=1 added to its environment
+# assignments.
+libomp_append(libomp_test_touch_env "KMP_VERSION=1")
+# libomp-test-touch builds whichever .success stamps were selected above
+add_custom_target(libomp-test-touch DEPENDS ${libomp_test_touch_targets})
+# Instantiate one recipe per test directory: two variants on Windows
+# (test-touch-mt and test-touch-md), a single one elsewhere.
+if(WIN32)
+  libomp_test_touch_recipe(test-touch-mt)
+  libomp_test_touch_recipe(test-touch-md)
+else()
+  libomp_test_touch_recipe(test-touch-rt)
+endif()
+
+# test-relo
+# Writes the `readelf -d` output for the built library to
+# test-relo/readelf.log and then greps that log for TEXTREL.
+# The grep line is followed by `test $? -eq 1`, so the step succeeds only when
+# grep exits with status 1 (grep's "no line matched" status).
+# `\;` passes a literal `;` through to the shell; `$$?` is presumably the
+# Makefile spelling of `$?` - NOTE(review): confirm for non-Makefile generators.
+add_custom_target(libomp-test-relo DEPENDS test-relo/.success)
+add_custom_command(
+  OUTPUT  test-relo/.success test-relo/readelf.log
+  COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/test-relo
+  COMMAND readelf -d ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE} > test-relo/readelf.log
+  COMMAND grep -e TEXTREL test-relo/readelf.log \; test $$? -eq 1
+  COMMAND ${CMAKE_COMMAND} -E touch test-relo/.success
+  DEPENDS omp
+)
+
+# test-execstack
+# Runs check-execstack.pl on the built library and touches
+# test-execstack/.success afterwards.
+# The checker script is listed in DEPENDS (as test-instr and test-deps do for
+# their scripts) so that editing it re-runs the test.
+add_custom_target(libomp-test-execstack DEPENDS test-execstack/.success)
+add_custom_command(
+  OUTPUT  test-execstack/.success
+  COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/test-execstack
+  COMMAND ${PERL_EXECUTABLE} ${LIBOMP_TOOLS_DIR}/check-execstack.pl
+    --arch=${LIBOMP_PERL_SCRIPT_ARCH} ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE}
+  COMMAND ${CMAKE_COMMAND} -E touch test-execstack/.success
+  DEPENDS omp ${LIBOMP_TOOLS_DIR}/check-execstack.pl
+)
+
+# test-instr
+# Runs check-instruction-set.pl on the built library, passing the OS,
+# architecture and MIC architecture, and touches test-instr/.success
+# afterwards. The rule depends on both the omp target and the checker script.
+add_custom_target(libomp-test-instr DEPENDS test-instr/.success)
+add_custom_command(
+  OUTPUT  test-instr/.success
+  COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/test-instr
+  COMMAND ${PERL_EXECUTABLE} ${LIBOMP_TOOLS_DIR}/check-instruction-set.pl --os=${LIBOMP_PERL_SCRIPT_OS}
+    --arch=${LIBOMP_PERL_SCRIPT_ARCH} --show --mic-arch=${LIBOMP_MIC_ARCH} ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE}
+  COMMAND ${CMAKE_COMMAND} -E touch test-instr/.success
+  DEPENDS omp ${LIBOMP_TOOLS_DIR}/check-instruction-set.pl
+)
+
+# test-deps
+# Builds the list of shared libraries the runtime is allowed to depend on for
+# the current platform and hands it to check-depends.pl together with the
+# built library.
+add_custom_target(libomp-test-deps DEPENDS test-deps/.success)
+set(libomp_expected_library_deps)
+if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
+  set(libomp_expected_library_deps libc.so.7 libthr.so.3 libm.so.5)
+  libomp_append(libomp_expected_library_deps libhwloc.so.5 LIBOMP_USE_HWLOC)
+elseif(CMAKE_SYSTEM_NAME MATCHES "NetBSD")
+  set(libomp_expected_library_deps libc.so.12 libpthread.so.1 libm.so.0)
+  libomp_append(libomp_expected_library_deps libhwloc.so.5 LIBOMP_USE_HWLOC)
+elseif(CMAKE_SYSTEM_NAME MATCHES "DragonFly")
+  set(libomp_expected_library_deps libc.so.8 libpthread.so.0 libm.so.4)
+  libomp_append(libomp_expected_library_deps libhwloc.so.5 LIBOMP_USE_HWLOC)
+elseif(APPLE)
+  set(libomp_expected_library_deps /usr/lib/libSystem.B.dylib)
+elseif(WIN32)
+  set(libomp_expected_library_deps kernel32.dll)
+  libomp_append(libomp_expected_library_deps psapi.dll LIBOMP_OMPT_SUPPORT)
+else()
+  # Remaining platforms
+  if(${MIC})
+    set(libomp_expected_library_deps libc.so.6 libpthread.so.0 libdl.so.2)
+    # the dynamic loader differs per MIC generation
+    if("${LIBOMP_MIC_ARCH}" STREQUAL "knf")
+      list(APPEND libomp_expected_library_deps ld-linux-l1om.so.2 libgcc_s.so.1)
+    elseif("${LIBOMP_MIC_ARCH}" STREQUAL "knc")
+      list(APPEND libomp_expected_library_deps ld-linux-k1om.so.2)
+    endif()
+  else()
+    set(libomp_expected_library_deps libdl.so.2 libgcc_s.so.1)
+    # libc plus the architecture specific entries
+    if(${IA32})
+      list(APPEND libomp_expected_library_deps libc.so.6 ld-linux.so.2)
+    elseif(${INTEL64})
+      list(APPEND libomp_expected_library_deps libc.so.6 ld-linux-x86-64.so.2)
+    elseif(${ARM})
+      list(APPEND libomp_expected_library_deps libc.so.6 libffi.so.6 libffi.so.5 ld-linux-armhf.so.3)
+    elseif(${PPC64})
+      list(APPEND libomp_expected_library_deps libc.so.6 ld64.so.1)
+    elseif(${MIPS} OR ${MIPS64})
+      list(APPEND libomp_expected_library_deps libc.so.6 ld.so.1)
+    endif()
+    # libpthread is expected unless the stubs library is being built
+    libomp_append(libomp_expected_library_deps libpthread.so.0 IF_FALSE STUBS_LIBRARY)
+    libomp_append(libomp_expected_library_deps libhwloc.so.5 LIBOMP_USE_HWLOC)
+  endif()
+  libomp_append(libomp_expected_library_deps libstdc++.so.6 LIBOMP_USE_STDCPPLIB)
+  libomp_append(libomp_expected_library_deps libm.so.6 LIBOMP_STATS)
+endif()
+# check-depends.pl takes the expected libraries as one comma separated string
+string(REPLACE ";" "," libomp_expected_library_deps "${libomp_expected_library_deps}")
+add_custom_command(
+  OUTPUT  test-deps/.success
+  COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/test-deps
+  COMMAND ${PERL_EXECUTABLE} ${LIBOMP_TOOLS_DIR}/check-depends.pl --os=${LIBOMP_PERL_SCRIPT_OS}
+    --arch=${LIBOMP_PERL_SCRIPT_ARCH} --expected="${libomp_expected_library_deps}" ${LIBOMP_OUTPUT_DIRECTORY}/${LIBOMP_LIB_FILE}
+  COMMAND ${CMAKE_COMMAND} -E touch test-deps/.success
+  DEPENDS omp ${LIBOMP_TOOLS_DIR}/check-depends.pl
+)
diff --git a/final/runtime/cmake/LibompUtils.cmake b/final/runtime/cmake/LibompUtils.cmake
new file mode 100644
index 0000000..179c8d0
--- /dev/null
+++ b/final/runtime/cmake/LibompUtils.cmake
@@ -0,0 +1,194 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# void libomp_say(string message_to_user);
+# - prints out message_to_user as a STATUS message prefixed with "LIBOMP: "
+macro(libomp_say message_to_user)
+  message(STATUS "LIBOMP: ${message_to_user}")
+endmacro()
+
+# void libomp_warning_say(string message_to_user);
+# - prints out message_to_user as a WARNING message prefixed with "LIBOMP: "
+macro(libomp_warning_say message_to_user)
+  message(WARNING "LIBOMP: ${message_to_user}")
+endmacro()
+
+# void libomp_error_say(string message_to_user);
+# - prints out message_to_user as a FATAL_ERROR message prefixed with
+#   "LIBOMP: ", which makes cmake stop processing
+macro(libomp_error_say message_to_user)
+  message(FATAL_ERROR "LIBOMP: ${message_to_user}")
+endmacro()
+
+# libomp_append(<flags_list> <flag> [(IF_TRUE | IF_FALSE | IF_DEFINED | IF_TRUE_1_0 ) BOOLEAN])
+#
+# <flags_list> is the NAME of the list variable to extend and comes first;
+# <flag> is the value appended to it.
+#
+# libomp_append(<flags_list> <flag>)
+#   - unconditionally appends <flag> to the list of definitions
+#
+# libomp_append(<flags_list> <flag> <BOOLEAN>)
+#   - appends <flag> to the list of definitions if BOOLEAN is true
+#
+# libomp_append(<flags_list> <flag> IF_TRUE <BOOLEAN>)
+#   - appends <flag> to the list of definitions if BOOLEAN is true
+#
+# libomp_append(<flags_list> <flag> IF_FALSE <BOOLEAN>)
+#   - appends <flag> to the list of definitions if BOOLEAN is false
+#
+# libomp_append(<flags_list> <flag> IF_DEFINED <VARIABLE>)
+#   - appends <flag> to the list of definitions if VARIABLE is defined
+#
+# libomp_append(<flags_list> <flag> IF_TRUE_1_0 <BOOLEAN>)
+#   - appends <flag>=1 to the list of definitions if <BOOLEAN> is true, <flag>=0 otherwise
+# e.g., libomp_append(<flags_list> "-D USE_FEATURE" IF_TRUE_1_0 HAVE_FEATURE)
+#     appends "-D USE_FEATURE=1" if HAVE_FEATURE is true
+#     or "-D USE_FEATURE=0" if HAVE_FEATURE is false
+#
+# Any other argument count, or an unknown keyword in the four-argument form,
+# is reported through libomp_error_say().
+macro(libomp_append flags flag)
+  if(NOT (${ARGC} EQUAL 2 OR ${ARGC} EQUAL 3 OR ${ARGC} EQUAL 4))
+    libomp_error_say("libomp_append: takes 2, 3, or 4 arguments")
+  endif()
+  if(${ARGC} EQUAL 2)
+    # two arguments: unconditional append
+    list(APPEND ${flags} "${flag}")
+  elseif(${ARGC} EQUAL 3)
+    # three arguments: the third one is the condition
+    if(${ARGV2})
+      list(APPEND ${flags} "${flag}")
+    endif()
+  else()
+    # four arguments: the third one is a keyword, the fourth its operand
+    if(${ARGV2} STREQUAL "IF_TRUE")
+      if(${ARGV3})
+        list(APPEND ${flags} "${flag}")
+      endif()
+    elseif(${ARGV2} STREQUAL "IF_FALSE")
+      if(NOT ${ARGV3})
+        list(APPEND ${flags} "${flag}")
+      endif()
+    elseif(${ARGV2} STREQUAL "IF_DEFINED")
+      if(DEFINED ${ARGV3})
+        list(APPEND ${flags} "${flag}")
+      endif()
+    elseif(${ARGV2} STREQUAL "IF_TRUE_1_0")
+      if(${ARGV3})
+        list(APPEND ${flags} "${flag}=1")
+      else()
+        list(APPEND ${flags} "${flag}=0")
+      endif()
+    else()
+      libomp_error_say("libomp_append: third argument must be one of IF_TRUE, IF_FALSE, IF_DEFINED, IF_TRUE_1_0")
+    endif()
+  endif()
+endmacro()
+
+# void libomp_get_legal_arch(string* return_arch_string);
+# - stores the formal name of the target architecture in the caller's
+#   variable named by return_arch_string
+# - when none of the known architecture variables is true, LIBOMP_ARCH is
+#   returned as-is and a warning is issued
+function(libomp_get_legal_arch return_arch_string)
+  if(${IA32})
+    set(formal_name "IA-32")
+  elseif(${INTEL64})
+    set(formal_name "Intel(R) 64")
+  elseif(${MIC})
+    set(formal_name "Intel(R) Many Integrated Core Architecture")
+  elseif(${ARM})
+    set(formal_name "ARM")
+  elseif(${PPC64BE})
+    set(formal_name "PPC64BE")
+  elseif(${PPC64LE})
+    set(formal_name "PPC64LE")
+  elseif(${AARCH64})
+    set(formal_name "AARCH64")
+  elseif(${MIPS})
+    set(formal_name "MIPS")
+  elseif(${MIPS64})
+    set(formal_name "MIPS64")
+  else()
+    # unknown architecture: fall back to the raw LIBOMP_ARCH value
+    set(formal_name "${LIBOMP_ARCH}")
+    libomp_warning_say("libomp_get_legal_arch(): Warning: Unknown architecture: Using ${LIBOMP_ARCH}")
+  endif()
+  set(${return_arch_string} "${formal_name}" PARENT_SCOPE)
+endfunction()
+
+# void libomp_check_variable(string var, ...);
+# - var is the NAME of the variable to validate; the remaining arguments are
+#   the values it is allowed to have
+# - uppercase and lowercase do not matter
+# - if ${var} equals one of the allowed values, the function returns silently
+# - if it matches none of them, then error out through libomp_error_say()
+function(libomp_check_variable var)
+  set(valid_flag 0)
+  string(TOLOWER "${${var}}" var_lower)
+  foreach(value IN LISTS ARGN)
+    string(TOLOWER "${value}" value_lower)
+    if("${var_lower}" STREQUAL "${value_lower}")
+      set(valid_flag 1)
+      break() # one match is enough, no need to look at the remaining values
+    endif()
+  endforeach()
+  if(NOT valid_flag)
+    libomp_error_say("libomp_check_variable(): ${var} = ${${var}} is unknown")
+  endif()
+endfunction()
+
+# void libomp_get_build_number(string src_dir, string* return_build_number);
+# - reads ${src_dir}/src/kmp_version.cpp and returns the digits that follow
+#   "#define KMP_VERSION_BUILD" through return_build_number
+# - if several lines match, the last one wins
+# - if no line matches, an empty string is returned
+function(libomp_get_build_number src_dir return_build_number)
+  # sets file_lines_list to a list of all lines in kmp_version.cpp
+  file(STRINGS "${src_dir}/src/kmp_version.cpp" file_lines_list)
+
+  # start from a known value so that a variable called build_number in the
+  # calling scope can never leak into the result
+  set(build_number "")
+
+  # runs through each line in kmp_version.cpp
+  foreach(line IN LISTS file_lines_list)
+    # if the line begins with "#define KMP_VERSION_BUILD <digits>" then we take
+    # note of the build number; capturing the digits directly means that any
+    # text following the number is not returned
+    string(REGEX MATCH "^[ \t]*#define[ \t]+KMP_VERSION_BUILD[ \t]+([0-9]+)" define_match "${line}")
+    if(NOT "${define_match}" STREQUAL "")
+      set(build_number "${CMAKE_MATCH_1}")
+    endif()
+  endforeach()
+  set(${return_build_number} "${build_number}" PARENT_SCOPE) # return build number
+endfunction()
+
+# void libomp_get_legal_type(string* return_legal_type);
+# - stores the formal library type name (Performance, Profiling or Stub) in
+#   the caller's variable named by return_legal_type
+# - the caller's variable is left untouched when none of NORMAL_LIBRARY,
+#   PROFILE_LIBRARY and STUBS_LIBRARY is true
+function(libomp_get_legal_type return_legal_type)
+  if(${NORMAL_LIBRARY})
+    set(type_name "Performance")
+  elseif(${PROFILE_LIBRARY})
+    set(type_name "Profiling")
+  elseif(${STUBS_LIBRARY})
+    set(type_name "Stub")
+  else()
+    # no known library type selected: nothing to report
+    return()
+  endif()
+  set(${return_legal_type} "${type_name}" PARENT_SCOPE)
+endfunction()
+
+# void libomp_add_suffix(string suffix, list<string>* list_of_items);
+# - list_of_items is the NAME of a list variable
+# - every non-empty item gets suffix appended; empty items are dropped
+# - the caller's list is replaced by the result
+function(libomp_add_suffix suffix list_of_items)
+  set(suffixed_items "")
+  foreach(entry IN LISTS "${list_of_items}")
+    if("${entry}" STREQUAL "")
+      # empty entries are not carried over
+    else()
+      list(APPEND suffixed_items "${entry}${suffix}")
+    endif()
+  endforeach()
+  set(${list_of_items} "${suffixed_items}" PARENT_SCOPE)
+endfunction()
+
+# void libomp_list_to_string(list<string> list_of_things, string* return_string);
+# - takes the list by VALUE and replaces every semicolon with a single space
+# - the resulting string is stored in the caller's variable named by
+#   return_string
+function(libomp_list_to_string list_of_things return_string)
+  string(REPLACE ";" " " space_separated "${list_of_things}")
+  set(${return_string} "${space_separated}" PARENT_SCOPE)
+endfunction()
+
+# void libomp_string_to_list(string str, list<string>* return_list);
+# - turns a whitespace separated string into a cmake list
+# - every run of spaces and/or tabs is replaced by one semicolon
+# - in cmake, a list is strings separated by semicolons: i.e., list of four items, list = "item1;item2;item3;item4"
+# - the result is stored in the caller's variable named by return_list
+function(libomp_string_to_list str return_list)
+  string(REGEX REPLACE "[ \t]+" ";" semicolon_separated "${str}")
+  set(${return_list} "${semicolon_separated}" PARENT_SCOPE)
+endfunction()
+
diff --git a/final/runtime/cmake/config-ix.cmake b/final/runtime/cmake/config-ix.cmake
new file mode 100644
index 0000000..5404715
--- /dev/null
+++ b/final/runtime/cmake/config-ix.cmake
@@ -0,0 +1,284 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+include(CheckCCompilerFlag)
+include(CheckCSourceCompiles)
+include(CheckCXXCompilerFlag)
+include(CheckIncludeFile)
+include(CheckLibraryExists)
+include(CheckIncludeFiles)
+include(LibompCheckLinkerFlag)
+include(LibompCheckFortranFlag)
+
+# Check for versioned symbols
+# void libomp_check_version_symbols(bool* retval);
+# - compiles and links a small C program that binds two functions to versioned
+#   names with .symver directives, passing a temporary linker version script
+# - the outcome of check_c_source_compiles() is re-exported to the caller in
+#   the variable named by retval
+function(libomp_check_version_symbols retval)
+  # Temporary version script declaring the VER1/VER2 nodes used below.
+  set(version_script_file ${CMAKE_CURRENT_BINARY_DIR}/__version_script.txt)
+  file(WRITE ${version_script_file} "VER1 { }; VER2 { } VER1;")
+  # Function-local: only the check in this function sees this flag.
+  set(CMAKE_REQUIRED_FLAGS -Wl,--version-script=${version_script_file})
+  set(symver_test_program
+    "#include <stdio.h>
+    void func1() { printf(\"Hello\"); }
+    void func2() { printf(\"World\"); }
+    __asm__(\".symver func1, func@VER1\");
+    __asm__(\".symver func2, func@VER2\");
+    int main() {
+      func1();
+      func2();
+      return 0;
+    }")
+  check_c_source_compiles("${symver_test_program}" ${retval})
+  # The script is only needed while the check runs.
+  file(REMOVE ${version_script_file})
+  set(${retval} ${${retval}} PARENT_SCOPE)
+endfunction()
+
+# Includes the architecture flag in both compile and link phase
+# void libomp_check_architecture_flag(string flag, bool* retval);
+# - flag:   the architecture flag to probe
+# - retval: name of the caller's variable that receives the check result
+function(libomp_check_architecture_flag flag retval)
+  # Besides being the flag under test, the flag is also placed in
+  # CMAKE_REQUIRED_FLAGS for the check; this assignment is local to the function.
+  set(CMAKE_REQUIRED_FLAGS "${flag}")
+  check_c_compiler_flag("${flag}" ${retval})
+  # Re-export whatever value the check left in ${retval} to the caller.
+  set(${retval} ${${retval}} PARENT_SCOPE)
+endfunction()
+
+# Checking C, CXX, Linker Flags
+# Each probe stores its outcome in the LIBOMP_HAVE_* variable named as the
+# second argument.
+# NOTE(review): the LIBOMP_HAVE_WNO_* results are obtained by probing the
+# positive -W<name> spelling of the warning; presumably because a compiler may
+# accept an unknown -Wno-<name> option silently -- confirm before changing.
+check_cxx_compiler_flag(-fno-exceptions LIBOMP_HAVE_FNO_EXCEPTIONS_FLAG)
+check_cxx_compiler_flag(-fno-rtti LIBOMP_HAVE_FNO_RTTI_FLAG)
+check_c_compiler_flag("-x c++" LIBOMP_HAVE_X_CPP_FLAG)
+check_cxx_compiler_flag(-Wcast-qual LIBOMP_HAVE_WCAST_QUAL_FLAG)
+check_c_compiler_flag(-Wunused-function LIBOMP_HAVE_WNO_UNUSED_FUNCTION_FLAG)
+check_c_compiler_flag(-Wunused-local-typedef LIBOMP_HAVE_WNO_UNUSED_LOCAL_TYPEDEF_FLAG)
+check_c_compiler_flag(-Wunused-value LIBOMP_HAVE_WNO_UNUSED_VALUE_FLAG)
+check_c_compiler_flag(-Wunused-variable LIBOMP_HAVE_WNO_UNUSED_VARIABLE_FLAG)
+check_c_compiler_flag(-Wswitch LIBOMP_HAVE_WNO_SWITCH_FLAG)
+check_c_compiler_flag(-Wcovered-switch-default LIBOMP_HAVE_WNO_COVERED_SWITCH_DEFAULT_FLAG)
+check_c_compiler_flag(-Wdeprecated-register LIBOMP_HAVE_WNO_DEPRECATED_REGISTER_FLAG)
+check_c_compiler_flag(-Wsign-compare LIBOMP_HAVE_WNO_SIGN_COMPARE_FLAG)
+check_c_compiler_flag(-Wgnu-anonymous-struct LIBOMP_HAVE_WNO_GNU_ANONYMOUS_STRUCT_FLAG)
+check_c_compiler_flag(-Wunknown-pragmas LIBOMP_HAVE_WNO_UNKNOWN_PRAGMAS_FLAG)
+check_c_compiler_flag(-Wmissing-field-initializers LIBOMP_HAVE_WNO_MISSING_FIELD_INITIALIZERS_FLAG)
+check_c_compiler_flag(-Wmissing-braces LIBOMP_HAVE_WNO_MISSING_BRACES_FLAG)
+check_c_compiler_flag(-Wcomment LIBOMP_HAVE_WNO_COMMENT_FLAG)
+check_c_compiler_flag(-Wself-assign LIBOMP_HAVE_WNO_SELF_ASSIGN_FLAG)
+check_c_compiler_flag(-Wvla-extension LIBOMP_HAVE_WNO_VLA_EXTENSION_FLAG)
+check_c_compiler_flag(-Wformat-pedantic LIBOMP_HAVE_WNO_FORMAT_PEDANTIC_FLAG)
+check_c_compiler_flag(-Wstringop-overflow=0 LIBOMP_HAVE_WSTRINGOP_OVERFLOW_FLAG)
+check_c_compiler_flag(-msse2 LIBOMP_HAVE_MSSE2_FLAG)
+check_c_compiler_flag(-ftls-model=initial-exec LIBOMP_HAVE_FTLS_MODEL_FLAG)
+# Architecture flags are probed with the flag applied to the link step as well.
+libomp_check_architecture_flag(-mmic LIBOMP_HAVE_MMIC_FLAG)
+libomp_check_architecture_flag(-m32 LIBOMP_HAVE_M32_FLAG)
+# Platform dependent flag handling: MSVC-style probes and the static runtime
+# switch on Windows, assumed assembler support everywhere else.
+if(WIN32)
+  if(MSVC)
+    # Check Windows MSVC style flags.
+    check_c_compiler_flag(/TP LIBOMP_HAVE_TP_FLAG)
+    check_cxx_compiler_flag(/EHsc LIBOMP_HAVE_EHSC_FLAG)
+    check_cxx_compiler_flag(/GS LIBOMP_HAVE_GS_FLAG)
+    check_cxx_compiler_flag(/Oy- LIBOMP_HAVE_Oy__FLAG)
+    check_cxx_compiler_flag(/arch:SSE2 LIBOMP_HAVE_ARCH_SSE2_FLAG)
+    check_cxx_compiler_flag(/Qsafeseh LIBOMP_HAVE_QSAFESEH_FLAG)
+  endif()
+  check_c_compiler_flag(-mrtm LIBOMP_HAVE_MRTM_FLAG)
+  # Probing the MASM assembler would require a dummy masm assembly file, which
+  # is difficult to create, so these flags are simply assumed to exist and
+  # work on Windows.
+  set(LIBOMP_HAVE_SAFESEH_MASM_FLAG TRUE)
+  set(LIBOMP_HAVE_COFF_MASM_FLAG TRUE)
+  # Change Windows flags /MDx to /MTx in every per-configuration C and CXX
+  # flag variable.
+  foreach(libomp_btype IN ITEMS DEBUG RELWITHDEBINFO RELEASE MINSIZEREL)
+    foreach(libomp_lang IN ITEMS C CXX)
+      string(REPLACE "/MD" "/MT"
+        CMAKE_${libomp_lang}_FLAGS_${libomp_btype}
+        "${CMAKE_${libomp_lang}_FLAGS_${libomp_btype}}"
+      )
+    endforeach()
+  endforeach()
+else()
+  # It is difficult to create a dummy assembly file that compiles into an
+  # executable for every architecture and then check the C compiler to
+  # see if -x assembler-with-cpp exists and works, so we assume it does for non-Windows.
+  set(LIBOMP_HAVE_X_ASSEMBLER_WITH_CPP_FLAG TRUE)
+endif()
+# The Fortran -m32 probe is only needed when the Fortran modules are built.
+if(${LIBOMP_FORTRAN_MODULES})
+  libomp_check_fortran_flag(-m32 LIBOMP_HAVE_M32_FORTRAN_FLAG)
+endif()
+
+# Check linker flags
+# Windows probes /SAFESEH only, Apple platforms probe nothing, and every other
+# platform probes the -Wl,... style options below.
+if(WIN32)
+  libomp_check_linker_flag(/SAFESEH LIBOMP_HAVE_SAFESEH_FLAG)
+elseif(APPLE)
+  # No linker flags are probed on Apple platforms.
+else()
+  libomp_check_linker_flag(-Wl,-x LIBOMP_HAVE_X_FLAG)
+  libomp_check_linker_flag(-Wl,--warn-shared-textrel LIBOMP_HAVE_WARN_SHARED_TEXTREL_FLAG)
+  libomp_check_linker_flag(-Wl,--as-needed LIBOMP_HAVE_AS_NEEDED_FLAG)
+  libomp_check_linker_flag("-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt" LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
+  libomp_check_linker_flag(-static-libgcc LIBOMP_HAVE_STATIC_LIBGCC_FLAG)
+  libomp_check_linker_flag(-Wl,-z,noexecstack LIBOMP_HAVE_Z_NOEXECSTACK_FLAG)
+  libomp_check_linker_flag(-Wl,-fini=__kmp_internal_end_fini LIBOMP_HAVE_FINI_FLAG)
+endif()
+
+# Check Intel(R) C Compiler specific flags
+# These probes only run when the C compiler identifies itself as "Intel".
+# NOTE(review): both /Q... and -... spellings are probed regardless of the
+# host platform; presumably the /Q forms are the Windows spellings -- confirm.
+if(CMAKE_C_COMPILER_ID STREQUAL "Intel")
+  check_cxx_compiler_flag(/Qlong_double LIBOMP_HAVE_LONG_DOUBLE_FLAG)
+  check_cxx_compiler_flag(/Qdiag-disable:177 LIBOMP_HAVE_DIAG_DISABLE_177_FLAG)
+  check_cxx_compiler_flag(/Qinline-min-size=1 LIBOMP_HAVE_INLINE_MIN_SIZE_FLAG)
+  check_cxx_compiler_flag(-Qoption,cpp,--extended_float_types LIBOMP_HAVE_EXTENDED_FLOAT_TYPES_FLAG)
+  check_cxx_compiler_flag(-falign-stack=maintain-16-byte LIBOMP_HAVE_FALIGN_STACK_FLAG)
+  check_cxx_compiler_flag("-opt-streaming-stores never" LIBOMP_HAVE_OPT_STREAMING_STORES_FLAG)
+  libomp_check_linker_flag(-static-intel LIBOMP_HAVE_STATIC_INTEL_FLAG)
+  libomp_check_linker_flag(-no-intel-extensions LIBOMP_HAVE_NO_INTEL_EXTENSIONS_FLAG)
+  # Look for the _intel_fast_memcpy symbol in the irc_pic library.
+  check_library_exists(irc_pic _intel_fast_memcpy "" LIBOMP_HAVE_IRC_PIC_LIBRARY)
+endif()
+
+# Checking Threading requirements
+# Windows builds need the Win32 thread interface, all other builds need
+# pthreads; libomp_error_say() is called when the required one was not found.
+find_package(Threads REQUIRED)
+if(WIN32 AND NOT CMAKE_USE_WIN32_THREADS_INIT)
+  libomp_error_say("Need Win32 thread interface on Windows.")
+elseif(NOT WIN32 AND NOT CMAKE_USE_PTHREADS_INIT)
+  libomp_error_say("Need pthread interface on Unix-like systems.")
+endif()
+
+# Find perl executable
+# Perl is used to create omp.h (and other headers) along with kmp_i18n_id.inc and kmp_i18n_default.inc
+find_package(Perl REQUIRED)
+# The perl scripts take the --os=/--arch= flags which expect a certain format for operating systems and arch's.
+# All the Perl scripts check the operating system and fail if it isn't "valid", so until they are removed
+# the most portable approach is to report every operating system that is neither Windows nor Mac
+# (most Unix flavors) as lin.  This temporary solution avoids enumerating all possible OS values
+# inside the Perl modules.
+#
+# Both values start from their fallback and are then overridden; a later
+# override wins, so the precedence is WIN32 over APPLE for the OS and
+# IA32 over MIC over INTEL64 for the architecture.
+set(LIBOMP_PERL_SCRIPT_OS lin)
+if(APPLE)
+  set(LIBOMP_PERL_SCRIPT_OS mac)
+endif()
+if(WIN32)
+  set(LIBOMP_PERL_SCRIPT_OS win)
+endif()
+set(LIBOMP_PERL_SCRIPT_ARCH ${LIBOMP_ARCH})
+if(INTEL64)
+  set(LIBOMP_PERL_SCRIPT_ARCH 32e)
+endif()
+if(MIC)
+  set(LIBOMP_PERL_SCRIPT_ARCH mic)
+endif()
+if(IA32)
+  set(LIBOMP_PERL_SCRIPT_ARCH 32)
+endif()
+
+# Checking features
+# Check if version symbol assembler directives are supported
+libomp_check_version_symbols(LIBOMP_HAVE_VERSION_SYMBOLS)
+
+# Check if quad precision types are available
+# GNU and Intel C compilers are treated as providing them, every other
+# compiler as not providing them.
+# NOTE(review): for Intel the result is TRUE whether or not
+# LIBOMP_HAVE_EXTENDED_FLOAT_TYPES_FLAG is set -- confirm that is intended.
+if(CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Intel")
+  set(LIBOMP_HAVE_QUAD_PRECISION TRUE)
+else()
+  set(LIBOMP_HAVE_QUAD_PRECISION FALSE)
+endif()
+
+# Check if adaptive locks are available
+# They are enabled when IA32 or INTEL64 is true and the compiler is not MSVC.
+# IA32/INTEL64 are tested by variable name rather than through ${...}
+# expansion, so an unset or empty variable evaluates to false instead of
+# leaving a malformed condition behind.
+if((IA32 OR INTEL64) AND NOT MSVC)
+  set(LIBOMP_HAVE_ADAPTIVE_LOCKS TRUE)
+else()
+  set(LIBOMP_HAVE_ADAPTIVE_LOCKS FALSE)
+endif()
+
+# Check if stats-gathering is available
+# Only evaluated when LIBOMP_STATS is enabled; LIBOMP_HAVE_STATS is not
+# assigned by this block otherwise.  Stats need the __thread keyword plus one
+# cycle counter source: __builtin_readcyclecounter() or __rdtsc().
+if(${LIBOMP_STATS})
+  # Requirement 1: a __thread variable must compile.
+  check_c_source_compiles(
+     "__thread int x;
+     int main(int argc, char** argv)
+     { x = argc; return x; }"
+     LIBOMP_HAVE___THREAD)
+  # Requirement 2, first choice: the cycle counter builtin.
+  check_c_source_compiles(
+     "int main(int argc, char** argv)
+     { unsigned long long t = __builtin_readcyclecounter(); return 0; }"
+     LIBOMP_HAVE___BUILTIN_READCYCLECOUNTER)
+  # Requirement 2, fallback: __rdtsc(), only tried when IA32, INTEL64 or MIC is true.
+  if(NOT LIBOMP_HAVE___BUILTIN_READCYCLECOUNTER)
+    if(${IA32} OR ${INTEL64} OR ${MIC})
+      check_include_file(x86intrin.h LIBOMP_HAVE_X86INTRIN_H)
+      # NOTE(review): libomp_append is defined elsewhere; presumably it adds the
+      # definition only when LIBOMP_HAVE_X86INTRIN_H is true -- confirm.
+      libomp_append(CMAKE_REQUIRED_DEFINITIONS -DLIBOMP_HAVE_X86INTRIN_H LIBOMP_HAVE_X86INTRIN_H)
+      check_c_source_compiles(
+        "#ifdef LIBOMP_HAVE_X86INTRIN_H
+         # include <x86intrin.h>
+         #endif
+         int main(int argc, char** argv) { unsigned long long t = __rdtsc(); return 0; }" LIBOMP_HAVE___RDTSC)
+      # Unset CMAKE_REQUIRED_DEFINITIONS again; any value it held before this
+      # block is discarded as well.
+      set(CMAKE_REQUIRED_DEFINITIONS)
+    endif()
+  endif()
+  if(LIBOMP_HAVE___THREAD AND (LIBOMP_HAVE___RDTSC OR LIBOMP_HAVE___BUILTIN_READCYCLECOUNTER))
+    set(LIBOMP_HAVE_STATS TRUE)
+  else()
+    set(LIBOMP_HAVE_STATS FALSE)
+  endif()
+endif()
+
+# Check if OMPT support is available
+# Currently, __builtin_frame_address() is required for OMPT
+# Weak attribute is required for Unices (except Darwin), LIBPSAPI is used for Windows
+check_c_source_compiles("int main(int argc, char** argv) {
+  void* p = __builtin_frame_address(0);
+  return 0;}" LIBOMP_HAVE___BUILTIN_FRAME_ADDRESS)
+check_c_source_compiles("__attribute__ ((weak)) int foo(int a) { return a*a; }
+  int main(int argc, char** argv) {
+  return foo(argc);}" LIBOMP_HAVE_WEAK_ATTRIBUTE)
+check_include_files("windows.h;psapi.h" LIBOMP_HAVE_PSAPI_H)
+check_library_exists(psapi EnumProcessModules "" LIBOMP_HAVE_LIBPSAPI)
+# PSAPI counts as available only when both its header and its library were found.
+if(LIBOMP_HAVE_PSAPI_H AND LIBOMP_HAVE_LIBPSAPI)
+  set(LIBOMP_HAVE_PSAPI TRUE)
+endif()
+# OMPT is supported only when all three conditions hold: the frame address
+# builtin exists, the target architecture is listed and the OS requirement
+# is met.
+# NOTE(review): arm is not in the architecture list (its entry had been
+# disabled); confirm before adding it.
+if(LIBOMP_HAVE___BUILTIN_FRAME_ADDRESS
+   AND ( # hardware architecture supported?
+     ((LIBOMP_ARCH STREQUAL x86_64) OR
+      (LIBOMP_ARCH STREQUAL i386) OR
+      (LIBOMP_ARCH STREQUAL aarch64) OR
+      (LIBOMP_ARCH STREQUAL ppc64le) OR
+      (LIBOMP_ARCH STREQUAL ppc64))
+     AND # OS supported?
+     ((WIN32 AND LIBOMP_HAVE_PSAPI) OR APPLE OR (NOT WIN32 AND LIBOMP_HAVE_WEAK_ATTRIBUTE))))
+  set(LIBOMP_HAVE_OMPT_SUPPORT TRUE)
+else()
+  set(LIBOMP_HAVE_OMPT_SUPPORT FALSE)
+endif()
+
+# Check if HWLOC support is available
+# Only evaluated when LIBOMP_USE_HWLOC is enabled.  LIBOMP_HAVE_HWLOC becomes
+# TRUE when hwloc.h is found and a located hwloc library provides
+# hwloc_topology_init; otherwise it becomes FALSE and a message is printed.
+if(${LIBOMP_USE_HWLOC})
+  # Add the install dir's include/ directory to the search for this one check.
+  set(CMAKE_REQUIRED_INCLUDES ${LIBOMP_HWLOC_INSTALL_DIR}/include)
+  check_include_file(hwloc.h LIBOMP_HAVE_HWLOC_H)
+  # Unset again; a value CMAKE_REQUIRED_INCLUDES held before is not restored.
+  set(CMAKE_REQUIRED_INCLUDES)
+  find_library(LIBOMP_HWLOC_LIBRARY
+    NAMES hwloc libhwloc
+    HINTS ${LIBOMP_HWLOC_INSTALL_DIR}/lib)
+  if(LIBOMP_HWLOC_LIBRARY)
+    check_library_exists(${LIBOMP_HWLOC_LIBRARY} hwloc_topology_init
+      ${LIBOMP_HWLOC_INSTALL_DIR}/lib LIBOMP_HAVE_LIBHWLOC)
+    # Record the directory that contains the library that was found.
+    get_filename_component(LIBOMP_HWLOC_LIBRARY_DIR ${LIBOMP_HWLOC_LIBRARY} PATH)
+  endif()
+  if(LIBOMP_HAVE_HWLOC_H AND LIBOMP_HAVE_LIBHWLOC AND LIBOMP_HWLOC_LIBRARY)
+    set(LIBOMP_HAVE_HWLOC TRUE)
+  else()
+    set(LIBOMP_HAVE_HWLOC FALSE)
+    libomp_say("Could not find hwloc")
+  endif()
+endif()
+
+# Check if ThreadSanitizer support is available
+# It is reported as available when the system name matches "Linux" and
+# INTEL64 is true.  INTEL64 is tested by variable name rather than through
+# ${...} expansion, so an unset or empty variable evaluates to false instead
+# of leaving a malformed condition behind.
+if("${CMAKE_SYSTEM_NAME}" MATCHES "Linux" AND INTEL64)
+  set(LIBOMP_HAVE_TSAN_SUPPORT TRUE)
+else()
+  set(LIBOMP_HAVE_TSAN_SUPPORT FALSE)
+endif()
diff --git a/final/runtime/doc/Reference.pdf b/final/runtime/doc/Reference.pdf
new file mode 100644
index 0000000..e97c40c
--- /dev/null
+++ b/final/runtime/doc/Reference.pdf
Binary files differ
diff --git a/final/runtime/doc/doxygen/config b/final/runtime/doc/doxygen/config
new file mode 100644
index 0000000..cd1eca2
--- /dev/null
+++ b/final/runtime/doc/doxygen/config
@@ -0,0 +1,1822 @@
+# Doxyfile 1.8.2
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a hash (#) is considered a comment and will be ignored.
+# The format is:
+#       TAG = value [value, ...]
+# For lists items can also be appended using:
+#       TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or sequence of words) that should
+# identify the project. Note that if you do not use Doxywizard you need
+# to put quotes around the project name if it contains spaces.
+
+PROJECT_NAME           = "LLVM OpenMP* Runtime Library"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER         =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer
+# a quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          =
+
+# With the PROJECT_LOGO tag one can specify an logo or icon that is
+# included in the documentation. The maximum height of the logo should not
+# exceed 55 pixels and the maximum width should not exceed 200 pixels.
+# Doxygen will copy the logo to the output directory.
+
+PROJECT_LOGO           =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = doc/doxygen/generated
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS         = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak,
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF       =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before files name in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES        = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip. Note that you specify absolute paths here, but also
+# relative paths, which will be relative from the directory where doxygen is
+# started.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH    =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE               = 8
+
+# This tag can be used to specify a number of aliases that acts
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES                = "other=<sup>*</sup>"
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding
+# "class=itcl::class" will allow you to use the command class in the
+# itcl::class meaning.
+
+TCL_SUBST              =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension,
+# and language is one of the parsers supported by doxygen: IDL, Java,
+# Javascript, CSharp, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL, C,
+# C++. For instance to make doxygen treat .inc files as Fortran files (default
+# is PHP), and .f files as C (default is Fortran), use: inc=Fortran f=C. Note
+# that for custom extensions you also need to set FILE_PATTERNS otherwise the
+# files are not read by doxygen.
+
+EXTENSION_MAPPING      =
+
+# If MARKDOWN_SUPPORT is enabled (the default) then doxygen pre-processes all
+# comments according to the Markdown format, which allows for more readable
+# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you
+# can mix doxygen, HTML, and XML commands with Markdown formatting.
+# Disable only in case of backward compatibilities issues.
+
+MARKDOWN_SUPPORT       = YES
+
+# When enabled doxygen tries to link words that correspond to documented classes,
+# or namespaces to their corresponding documentation. Such a link can be
+# prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
+
+AUTOLINK_SUPPORT       = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); v.s.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to
+# indicate getter and setter methods for a property. Setting this
+# option to YES (the default) will make doxygen replace the get and
+# set methods by a property in the documentation. This will only work
+# if the methods are indeed getting or setting a simple type. If this
+# is not the case, or you want to show the methods anyway, you should
+# set this option to NO.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING            = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and
+# unions are shown inside the group in which they are included (e.g. using
+# @ingroup) instead of on a separate page (for HTML and Man pages) or
+# section (for LaTeX and RTF).
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and
+# unions with only public data fields will be shown inline in the documentation
+# of the scope in which they are defined (i.e. file, namespace, or group
+# documentation), provided this scope is documented. If set to NO (the default),
+# structs, classes, and unions are shown on a separate page (for HTML and Man
+# pages) or section (for LaTeX and RTF).
+
+INLINE_SIMPLE_STRUCTS  = NO
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache use to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects a too small cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols.
+
+SYMBOL_CACHE_SIZE      = 0
+
+# Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be
+# set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given
+# their name and scope. Since this can be an expensive process and often the
+# same symbol appear multiple times in the code, doxygen keeps a cache of
+# pre-resolved symbols. If the cache is too small doxygen will become slower.
+# If the cache is too large, memory is wasted. The cache size is given by this
+# formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols.
+
+LOOKUP_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE        = YES
+
+# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal
+# scope will be included in the documentation.
+
+EXTRACT_PACKAGE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC         = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES  = YES
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS     = YES
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES     = YES
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES       = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen
+# will list include files with double quotes in the documentation
+# rather than with sharp brackets.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen
+# will sort the (brief and detailed) documentation of class members so that
+# constructors and destructors are listed first. If set to NO (the default)
+# the constructors will appear in the respective orders defined by
+# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS.
+# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO
+# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to
+# do proper type resolution of all parameters of a function it will reject a
+# match between the prototype and the implementation of a member function even
+# if there is only one candidate or it is obvious which candidate to choose
+# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen
+# will still accept a match between prototype and implementation in such cases.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or macro consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and macros in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES        = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+# We probably will want this, but we have no file documentation yet so it's simpler to remove
+# it for now.
+SHOW_FILES             = NO
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.
+# This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER    =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option.
+# You can optionally specify a file name after the option, if omitted
+# DoxygenLayout.xml will be used as the name of the layout file.
+
+LAYOUT_FILE            =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files
+# containing the references data. This must be a list of .bib files. The
+# .bib extension is automatically appended if omitted. Using this command
+# requires the bibtex tool to be installed. See also
+# http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style
+# of the bibliography can be controlled using LATEX_BIB_STYLE. To use this
+# feature you need bibtex and perl available in the search path.
+
+CITE_BIB_FILES         =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS               = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR      = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC       = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT            =
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT                  = src  doc/doxygen/libomp_interface.h
+# The ittnotify code also has doxygen documentation, but if we include it here
+# it takes over from us!
+# src/thirdparty/ittnotify
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh
+# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py
+# *.f90 *.f *.for *.vhd *.vhdl
+
+FILE_PATTERNS          = *.c *.h *.cpp
+# We may also want to include the asm files with appropriate ifdef to ensure
+# doxygen doesn't see the content, just the documentation...
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+# Only look in the one directory.
+RECURSIVE              = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE                = src/test-touch.c
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS        =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH           =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS       =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH             =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.
+# If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.
+# Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.
+# The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty or if
+# none of the patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES    = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERNS (if any)
+# and it is also possible to disable source filtering for a specific pattern
+# using *.ext= (so without naming a filter). This option only has effect when
+# FILTER_SOURCE_FILES is enabled.
+
+FILTER_SOURCE_PATTERNS =
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER         = YES
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C, C++ and Fortran comments will always remain visible.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = YES
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code.
+# Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS       = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX     = YES
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT            =
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header. Note that when using a custom header you are responsible
+# for the proper inclusion of any scripts and style sheets that doxygen
+# needs, which is dependent on the configuration options used.
+# It is advised to generate a default header using "doxygen -w html
+# header.html footer.html stylesheet.css YourConfigFile" and then modify
+# that header. Note that the header is subject to change so you typically
+# have to redo this when upgrading to a newer version of doxygen or when
+# changing the value of configuration settings such as GENERATE_TREEVIEW!
+
+HTML_HEADER            =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER            =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If left blank doxygen will
+# generate a default style sheet. Note that it is recommended to use
+# HTML_EXTRA_STYLESHEET instead of this one, as it is more robust and this
+# tag will in the future become obsolete.
+
+HTML_STYLESHEET        =
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify an additional
+# user-defined cascading style sheet that is included after the standard
+# style sheets created by doxygen. Using this option one can overrule
+# certain style aspects. This is preferred over using HTML_STYLESHEET
+# since it does not replace the standard style sheet and is therefore more
+# robust against future updates. Doxygen will copy the style sheet file to
+# the output directory.
+
+HTML_EXTRA_STYLESHEET  =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that
+# the files will be copied as-is; there are no commands or markers available.
+
+HTML_EXTRA_FILES       =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output.
+# Doxygen will adjust the colors in the style sheet and background images
+# according to this color. Hue is specified as an angle on a colorwheel,
+# see http://en.wikipedia.org/wiki/Hue for more information.
+# For instance the value 0 represents red, 60 is yellow, 120 is green,
+# 180 is cyan, 240 is blue, 300 purple, and 360 is red again.
+# The allowed range is 0 to 359.
+
+HTML_COLORSTYLE_HUE    = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of
+# the colors in the HTML output. For a value of 0 the output will use
+# grayscales only. A value of 255 will produce the most vivid colors.
+
+HTML_COLORSTYLE_SAT    = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to
+# the luminance component of the colors in the HTML output. Values below
+# 100 gradually make the output lighter, whereas values above 100 make
+# the output darker. The value divided by 100 is the actual gamma applied,
+# so 80 represents a gamma of 0.8, the value 220 represents a gamma of 2.2,
+# and 100 does not change the gamma.
+
+HTML_COLORSTYLE_GAMMA  = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP         = NO
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+
+HTML_DYNAMIC_SECTIONS  = NO
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of
+# entries shown in the various tree structured indices initially; the user
+# can expand and collapse entries dynamically later on. Doxygen will expand
+# the tree to such a level that at most the specified number of entries are
+# visible (unless a fully collapsed tree already exceeds this amount).
+# So setting the number of entries to 1 will produce a fully collapsed tree by
+# default. 0 is a special value representing an infinite number of entries
+# and will result in a fully expanded tree by default.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+
+GENERATE_DOCSET        = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely
+# identify the documentation publisher. This should be a reverse domain-name
+# style string, e.g. com.mycompany.MyDocSet.documentation.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP      = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE               =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION           =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI           = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING     =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated
+# that can be used as input for Qt's qhelpgenerator to generate a
+# Qt Compressed Help (.qch) of the generated HTML documentation.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
+# be used to specify the file name of the resulting .qch file.
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE               =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the virtual folder to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to
+# add. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME   =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see
+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">
+# Qt Help Project / Custom Filters</a>.
+
+QHP_CUST_FILTER_ATTRS  =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's
+# filter section matches.
+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">
+# Qt Help Project / Filter Attributes</a>.
+
+QHP_SECT_FILTER_ATTRS  =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION           =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
+# will be generated, which together with the HTML files, form an Eclipse help
+# plugin. To install this plugin and make it available under the help contents
+# menu in Eclipse, the contents of the directory containing the HTML and XML
+# files needs to be copied into the plugins directory of eclipse. The name of
+# the directory within the plugins directory should be the same as
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have
+# this name.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs)
+# at top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it. Since the tabs have the same information as the
+# navigation tree you can set this option to YES if you already set
+# GENERATE_TREEVIEW to YES.
+
+DISABLE_INDEX          = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to YES, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
+# Windows users are probably better off using the HTML help feature.
+# Since the tree basically has the same information as the tab index you
+# could consider setting DISABLE_INDEX to YES when enabling this option.
+
+GENERATE_TREEVIEW      = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values
+# (range [0,1..20]) that doxygen will group on one line in the generated HTML
+# documentation. Note that a value of 0 will completely suppress the enum
+# values from appearing in the overview section.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH         = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open
+# links to external symbols imported via tag files in a separate window.
+
+EXT_LINKS_IN_WINDOW    = NO
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE       = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are
+# not supported properly for IE 6.0, but are supported on all modern browsers.
+# Note that when changing this option you need to delete any form_*.png files
+# in the HTML output before the changes have effect.
+
+FORMULA_TRANSPARENT    = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax
+# (see http://www.mathjax.org) which uses client side Javascript for the
+# rendering instead of using prerendered bitmaps. Use this if you do not
+# have LaTeX installed or if you want formulas to look prettier in the HTML
+# output. When enabled you may also need to install MathJax separately and
+# configure the path to it using the MATHJAX_RELPATH option.
+
+USE_MATHJAX            = NO
+
+# When MathJax is enabled you need to specify the location relative to the
+# HTML output directory using the MATHJAX_RELPATH option. The destination
+# directory should contain the MathJax.js script. For instance, if the mathjax
+# directory is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to
+# the MathJax Content Delivery Network so you can quickly see the result without
+# installing MathJax.
+# However, it is strongly recommended to install a local
+# copy of MathJax from http://www.mathjax.org before deployment.
+
+MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax extension
+# names that should be enabled during MathJax rendering.
+
+MATHJAX_EXTENSIONS     =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should
+# typically be disabled. For large projects the javascript based search engine
+# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.
+
+SEARCHENGINE           = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a PHP enabled web server instead of at the web client
+# using Javascript. Doxygen will generate the search PHP script and index
+# file to put on the web server. The advantage of the server
+# based approach is that it scales better to large projects and allows
+# full text search. The disadvantages are that it is more difficult to setup
+# and does not have live searching capabilities.
+
+SERVER_BASED_SEARCH    = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX         = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT           =
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+# Note that when enabling USE_PDFLATEX this option is only used for
+# generating bitmaps for formulas in the HTML output, but not in the
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE             = a4wide
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES         =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER           = doc/doxygen/header.tex
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for
+# the generated latex document. The footer should contain everything after
+# the last chapter. If it is left blank doxygen will generate a
+# standard footer. Notice: only use this tag if you know what you are doing!
+
+LATEX_FOOTER           =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE        = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES     = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include
+# source code with syntax highlighting in the LaTeX output.
+# Note that which sources are shown also depends on other settings
+# such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE      = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See
+# http://en.wikipedia.org/wiki/BibTeX for more info.
+
+LATEX_BIB_STYLE        = plain
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT             =
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS         = NO
+
+# Load style sheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE    =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE    =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES Doxygen will
+# generate man pages. The default is NO.
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT             =
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION          =
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML           = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT             = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA             =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD                =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader.
+# This is useful
+# if you want to understand what is going on.
+# On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION        = YES
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF     = YES
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the include files
+# pointed to by INCLUDE_PATH will be searched when a #include is found.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH           =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS  =
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED             = OMP_30_ENABLED=1, OMP_40_ENABLED=1, KMP_STATS_ENABLED=1
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition that
+# overrules the definition found in the source code.
+
+EXPAND_AS_DEFINED      =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all references to function-like macros
+# that are alone on a line, have an all uppercase name, and do not end with a
+# semicolon, because these will confuse the parser if not removed.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles. For each
+# tag file the location of the external documentation should be added. The
+# format of a tag file without this location is as follows:
+#
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+#
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths
+# or URLs. Note that each tag file must have a unique name (where the name does
+# NOT include the path). If a tag file is not located in the directory in which
+# doxygen is run, you must also specify the path to the tagfile here.
+
+TAGFILES               =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE       =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS        = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH              =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option also works with HAVE_DOT disabled, but it is recommended to
+# install and use dot, since it yields more powerful graphs.
+
+CLASS_DIAGRAMS         = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH            =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT               = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is
+# allowed to run in parallel. When set to 0 (the default) doxygen will
+# base this on the number of processors available in the system. You can set it
+# explicitly to a value larger than 0 to get control over the balance
+# between CPU load and processing speed.
+
+DOT_NUM_THREADS        = 0
+
+# By default doxygen will use the Helvetica font for all dot files that
+# doxygen generates. When you want a differently looking font you can specify
+# the font name using DOT_FONTNAME. You need to make sure dot is able to find
+# the font, which can be done by putting it in a standard location or by setting
+# the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the
+# directory containing the font.
+
+DOT_FONTNAME           = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
+# The default size is 10pt.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the Helvetica font.
+# If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to
+# set the path where dot can find it.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct group dependencies.
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK               = NO
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside
+# the class node. If there are many fields or methods and many nodes the
+# graph may become too big to be useful. The UML_LIMIT_NUM_FIELDS
+# threshold limits the number of items for each type to make the size more
+# manageable. Set this to 0 for no limit. Note that the threshold may be
+# exceeded by 50% before the limit is enforced.
+
+UML_LIMIT_NUM_FIELDS   = 10
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS     = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH          = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are svg, png, jpg, or gif.
+# If left blank png will be used. If you choose svg you need to set
+# HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible in IE 9+ (other browsers do not have this requirement).
+
+DOT_IMAGE_FORMAT       = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+# Note that this requires a modern browser other than Internet Explorer.
+# Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you
+# need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible. Older versions of IE do not have SVG support.
+
+INTERACTIVE_SVG        = NO
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH               =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS           =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the
+# \mscfile command).
+
+MSCFILE_DIRS           =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lie further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT        = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP            = YES
diff --git a/final/runtime/doc/doxygen/header.tex b/final/runtime/doc/doxygen/header.tex
new file mode 100644
index 0000000..6e9ea67
--- /dev/null
+++ b/final/runtime/doc/doxygen/header.tex
@@ -0,0 +1,77 @@
+% Latex header for doxygen 1.8.3.1
+\documentclass{book}
+\usepackage[a4paper,top=2.5cm,bottom=2.5cm,left=2.5cm,right=2.5cm]{geometry}
+\usepackage{makeidx}
+\usepackage{natbib}
+\usepackage{graphicx}
+\usepackage{multicol}
+\usepackage{float}
+\usepackage{listings}
+\usepackage{color}
+\usepackage{ifthen}
+\usepackage[table]{xcolor}
+\usepackage{textcomp}
+\usepackage{alltt}
+\usepackage{ifpdf}
+\ifpdf
+\usepackage[pdftex,
+            pagebackref=true,
+            colorlinks=true,
+            linkcolor=blue,
+            unicode
+           ]{hyperref}
+\else
+\usepackage[ps2pdf,
+            pagebackref=true,
+            colorlinks=true,
+            linkcolor=blue,
+            unicode
+           ]{hyperref}
+\usepackage{pspicture}
+\fi
+\usepackage[utf8]{inputenc}
+\usepackage{mathptmx}
+\usepackage[scaled=.90]{helvet}
+\usepackage{courier}
+\usepackage{sectsty}
+\usepackage{amssymb}
+\usepackage[titles]{tocloft}
+\usepackage{doxygen}
+\lstset{language=C++,inputencoding=utf8,basicstyle=\footnotesize,breaklines=true,breakatwhitespace=true,tabsize=4,numbers=left }
+\makeindex
+\setcounter{tocdepth}{3}
+\renewcommand{\footrulewidth}{0.4pt}
+\renewcommand{\familydefault}{\sfdefault}
+\hfuzz=15pt
+\setlength{\emergencystretch}{15pt}
+\hbadness=750
+\tolerance=750
+\begin{document}
+\hypersetup{pageanchor=false,citecolor=blue}
+\begin{titlepage}
+\vspace*{7cm}
+\begin{center}
+{\Large LLVM OpenMP\textsuperscript{*} Runtime Library }\\
+\vspace*{1cm}
+{\large Generated by Doxygen $doxygenversion }\\
+\vspace*{0.5cm}
+{\small $datetime }\\
+\end{center}
+\end{titlepage}
+
+{\bf Trademarks}
+The OpenMP name and the OpenMP logo are registered trademarks of the OpenMP Architecture Review Board.
+
+Intel, Xeon, and Intel Xeon Phi are trademarks of Intel Corporation in the U.S. and/or other countries.
+
+This document is Copyright \textcopyright~\the\year the LLVM Project. It is
+subject to the same license terms as the LLVM OpenMP runtime.
+
+\textsuperscript{*} Other names and brands may be claimed as the property of others.
+
+\clearemptydoublepage
+\pagenumbering{roman}
+\tableofcontents
+\clearemptydoublepage
+\pagenumbering{arabic}
+\hypersetup{pageanchor=true,citecolor=blue}
diff --git a/final/runtime/doc/doxygen/libomp_interface.h b/final/runtime/doc/doxygen/libomp_interface.h
new file mode 100644
index 0000000..1039325
--- /dev/null
+++ b/final/runtime/doc/doxygen/libomp_interface.h
@@ -0,0 +1,331 @@
+// This file does not contain any code; it just contains additional text and formatting
+// for doxygen.
+
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+
+/*! @mainpage LLVM&nbsp; OpenMP* Runtime Library Interface
+@section sec_intro Introduction
+
+This document describes the interface provided by the
+LLVM &nbsp;OpenMP\other runtime library to the compiler.
+Routines that are directly called as simple functions by user code are
+not currently described here, since their definition is in the OpenMP
+specification available from http://openmp.org
+
+The aim here is to explain the interface from the compiler to the runtime.
+
+The overall design is described, and each function in the interface
+has its own description. (At least, that's the ambition, we may not be there yet).
+
+@section sec_building Quickly Building the Runtime
+For the impatient, we cover building the runtime as the first topic here.
+
+CMake is used to build the OpenMP runtime.  For details and a full list of options for the CMake build system,
+see <tt>README.rst</tt> in the source code repository.  These instructions will provide the most typical build.
+
+In-LLVM-tree build:
+@code
+$ cd where-you-want-to-live
+Check out openmp into llvm/projects
+$ cd where-you-want-to-build
+$ mkdir build && cd build
+$ cmake path/to/llvm -DCMAKE_C_COMPILER=<C compiler> -DCMAKE_CXX_COMPILER=<C++ compiler>
+$ make omp
+@endcode
+Out-of-LLVM-tree build:
+@code
+$ cd where-you-want-to-live
+Check out openmp
+$ cd where-you-want-to-live/openmp
+$ mkdir build && cd build
+$ cmake path/to/openmp -DCMAKE_C_COMPILER=<C compiler> -DCMAKE_CXX_COMPILER=<C++ compiler>
+$ make
+@endcode
+
+@section sec_supported Supported RTL Build Configurations
+
+The architectures supported are IA-32 architecture, Intel&reg;&nbsp; 64, and
+Intel&reg;&nbsp; Many Integrated Core Architecture.  The build configurations
+supported are shown in the table below.
+
+<table border=1>
+<tr><th> <th>icc/icl<th>gcc<th>clang
+<tr><td>Linux\other OS<td>Yes(1,5)<td>Yes(2,4)<td>Yes(4,6,7)
+<tr><td>FreeBSD\other<td>Yes(1,5)<td>Yes(2,4)<td>Yes(4,6,7,8)
+<tr><td>OS X\other<td>Yes(1,3,4)<td>No<td>Yes(4,6,7)
+<tr><td>Windows\other OS<td>Yes(1,4)<td>No<td>No
+</table>
+(1) On IA-32 architecture and Intel&reg;&nbsp; 64, icc/icl versions 12.x 
+    are supported (12.1 is recommended).<br>
+(2) gcc version 4.7 is supported.<br>
+(3) For icc on OS X\other, OS X\other version 10.5.8 is supported.<br>
+(4) Intel&reg;&nbsp; Many Integrated Core Architecture not supported.<br>
+(5) On Intel&reg;&nbsp; Many Integrated Core Architecture, icc/icl versions 13.0 or later are required.<br>
+(6) Clang\other version 3.3 is supported.<br>
+(7) Clang\other currently does not offer a software-implemented 128 bit extended
+    precision type.  Thus, all entry points reliant on this type are removed
+    from the library and cannot be called in the user program.  The following
+    functions are not available:
+@code
+    __kmpc_atomic_cmplx16_*
+    __kmpc_atomic_float16_*
+    __kmpc_atomic_*_fp
+@endcode
+(8) Community contribution provided AS IS, not tested by Intel.
+
+Supported Architectures: IBM(R) Power 7 and Power 8
+<table border=1>
+<tr><th> <th>gcc<th>clang
+<tr><td>Linux\other OS<td>Yes(1,2)<td>Yes(3,4)
+</table>
+(1) On Power 7, gcc version 4.8.2 is supported.<br>
+(2) On Power 8, gcc version 4.8.2 is supported.<br>
+(3) On Power 7, clang version 3.7 is supported.<br>
+(4) On Power 8, clang version 3.7 is supported.<br>
+
+@section sec_frontend Front-end Compilers that work with this RTL
+
+The following compilers are known to do compatible code generation for
+this RTL: icc/icl, gcc.  Code generation is discussed in more detail
+later in this document.
+
+@section sec_outlining Outlining
+
+The runtime interface is based on the idea that the compiler
+"outlines" sections of code that are to run in parallel into separate
+functions that can then be invoked in multiple threads.  For instance,
+simple code like this
+
+@code
+void foo()
+{
+#pragma omp parallel
+    {
+        ... do something ...
+    }
+}
+@endcode
+is converted into something that looks conceptually like this (where
+the names used are merely illustrative; the real library function
+names will be used later after we've discussed some more issues...)
+
+@code
+static void outlinedFooBody()
+{
+    ... do something ...
+}
+
+void foo()
+{
+    __OMP_runtime_fork(outlinedFooBody, (void*)0);   // Not the real function name!
+}
+@endcode
+
+@subsection SEC_SHAREDVARS Addressing shared variables
+
+In real uses of the OpenMP\other API there are normally references 
+from the outlined code  to shared variables that are in scope in the containing function. 
+Therefore the outlined function must be able to address 
+these variables. The runtime supports two alternate ways of doing
+this.
+
+@subsubsection SEC_SEC_OT Current Technique
+The technique currently supported by the runtime library is to receive
+a separate pointer to each shared variable that can be accessed from
+the outlined function.  This is what is shown in the example below.
+
+We hope soon to provide an alternative interface to support the
+alternate implementation described in the next section. The
+alternative implementation has performance advantages for small
+parallel regions that have many shared variables.
+
+@subsubsection SEC_SEC_PT Future Technique
+The idea is to treat the outlined function as though it
+were a lexically nested function, and pass it a single argument which
+is the pointer to the parent's stack frame. Provided that the compiler
+knows the layout of the parent frame when it is generating the outlined
+function it can then access the up-level variables at appropriate
+offsets from the parent frame.  This is a classical compiler technique
+from the 1960s to support languages like Algol (and its descendants)
+that support lexically nested functions.
+
+The main benefit of this technique is that there is no code required
+at the fork point to marshal the arguments to the outlined function.
+Since the runtime knows statically how many arguments must be passed to the
+outlined function, it can easily copy them to the thread's stack
+frame.  Therefore the performance of the fork code is independent of
+the number of shared variables that are accessed by the outlined
+function.
+
+If it is hard to determine the stack layout of the parent while generating the
+outlined code, it is still possible to use this approach by collecting all of
+the variables in the parent that are accessed from outlined functions into
+a single `struct` which is placed on the stack, and whose address is passed
+to the outlined functions. In this way the offsets of the shared variables
+are known (since they are inside the struct) without needing to know
+the complete layout of the parent stack-frame. From the point of view
+of the runtime either of these techniques is equivalent, since in either
+case it only has to pass a single argument to the outlined function to allow 
+it to access shared variables.
+
+A scheme like this is how gcc\other generates outlined functions.
+
+@section SEC_INTERFACES Library Interfaces
+The library functions used for specific parts of the OpenMP\other language implementation
+are documented in different modules.
+
+ - @ref BASIC_TYPES fundamental types used by the runtime in many places
+ - @ref DEPRECATED  functions that are in the library but are no longer required
+ - @ref STARTUP_SHUTDOWN functions for initializing and finalizing the runtime
+ - @ref PARALLEL functions for implementing `omp parallel`
+ - @ref THREAD_STATES functions for supporting thread state inquiries
+ - @ref WORK_SHARING functions for work sharing constructs such as `omp for`, `omp sections`
+ - @ref THREADPRIVATE functions to support thread private data, copyin etc
+ - @ref SYNCHRONIZATION functions to support `omp critical`, `omp barrier`, `omp master`, reductions etc
+ - @ref ATOMIC_OPS functions to support atomic operations
+ - @ref STATS_GATHERING macros to support developer profiling of libomp
+ - Documentation on tasking has still to be written...
+
+@section SEC_EXAMPLES Examples
+@subsection SEC_WORKSHARING_EXAMPLE Work Sharing Example
+This example shows the code generated for a parallel for with reduction and dynamic scheduling.
+
+@code
+extern float foo( void );
+
+int main () {
+    int i; 
+    float r = 0.0; 
+    #pragma omp parallel for schedule(dynamic) reduction(+:r) 
+    for ( i = 0; i < 10; i ++ ) {
+        r += foo(); 
+    }
+}
+@endcode
+
+The transformed code looks like this.
+@code
+extern float foo( void ); 
+
+int main () {
+    static int zero = 0; 
+    auto int gtid; 
+    auto float r = 0.0; 
+    __kmpc_begin( & loc3, 0 ); 
+    // The gtid is not actually required in this example so could be omitted;
+    // We show its initialization here because it is often required for calls into
+    // the runtime and should be locally cached like this.
+    gtid = __kmpc_global_thread_num( & loc3 ); 
+    __kmpc_fork_call( & loc7, 1, main_7_parallel_3, & r ); 
+    __kmpc_end( & loc0 ); 
+    return 0; 
+}
+
+struct main_10_reduction_t_5 { float r_10_rpr; }; 
+
+static kmp_critical_name lck = { 0 };
+static ident_t loc10; // loc10.flags should contain KMP_IDENT_ATOMIC_REDUCE bit set 
+                      // if compiler has generated an atomic reduction.
+
+void main_7_parallel_3( int *gtid, int *btid, float *r_7_shp ) {
+    auto int i_7_pr; 
+    auto int lower, upper, liter, incr; 
+    auto struct main_10_reduction_t_5 reduce; 
+    reduce.r_10_rpr = 0.F; 
+    liter = 0; 
+    __kmpc_dispatch_init_4( & loc7,*gtid, 35, 0, 9, 1, 1 ); 
+    while ( __kmpc_dispatch_next_4( & loc7, *gtid, & liter, & lower, & upper, & incr ) ) {
+        for( i_7_pr = lower; upper >= i_7_pr; i_7_pr ++ ) 
+          reduce.r_10_rpr += foo(); 
+    }
+    switch( __kmpc_reduce_nowait( & loc10, *gtid, 1, 4, & reduce, main_10_reduce_5, & lck ) ) {
+        case 1:
+           *r_7_shp += reduce.r_10_rpr;
+           __kmpc_end_reduce_nowait( & loc10, *gtid, & lck );
+           break;
+        case 2:
+           __kmpc_atomic_float4_add( & loc10, *gtid, r_7_shp, reduce.r_10_rpr );
+           break;
+        default:;
+    }
+} 
+
+void main_10_reduce_5( struct main_10_reduction_t_5 *reduce_lhs, 
+                       struct main_10_reduction_t_5 *reduce_rhs ) 
+{ 
+    reduce_lhs->r_10_rpr += reduce_rhs->r_10_rpr; 
+}
+@endcode
+
+@defgroup BASIC_TYPES Basic Types
+Types that are used throughout the runtime.
+
+@defgroup DEPRECATED Deprecated Functions
+Functions in this group are for backwards compatibility only, and
+should not be used in new code.
+
+@defgroup STARTUP_SHUTDOWN Startup and Shutdown
+These functions are for library initialization and shutdown.
+
+@defgroup PARALLEL Parallel (fork/join)
+These functions are used for implementing <tt>\#pragma omp parallel</tt>.
+
+@defgroup THREAD_STATES Thread Information
+These functions return information about the currently executing thread.
+
+@defgroup WORK_SHARING Work Sharing
+These functions are used for implementing 
+<tt>\#pragma omp for</tt>, <tt>\#pragma omp sections</tt>, <tt>\#pragma omp single</tt> and 
+<tt>\#pragma omp master</tt> constructs. 
+
+When handling loops, there are different functions for each of the signed and unsigned 32 and 64 bit integer types
+which have the name suffixes `_4`, `_4u`, `_8` and `_8u`. The semantics of each of the functions is the same,
+so they are only described once.
+
+Static loop scheduling is handled by  @ref __kmpc_for_static_init_4 and friends. Only a single call is needed,
+since the iterations to be executed by any given thread can be determined as soon as the loop parameters are known.
+
+Dynamic scheduling is handled by the @ref __kmpc_dispatch_init_4 and @ref __kmpc_dispatch_next_4 functions. 
+The init function is called once in each thread outside the loop, while the next function is called each
+time that the previous chunk of work has been exhausted. 
+
+@defgroup SYNCHRONIZATION Synchronization
+These functions are used for implementing barriers.
+
+@defgroup THREADPRIVATE Thread private data support
+These functions support copyin/out and thread private data.
+
+@defgroup STATS_GATHERING Statistics Gathering from OMPTB
+These macros support profiling the libomp library.  Use --stats=on when building with build.pl to enable
+and then use the KMP_* macros to profile (through counts or clock ticks) libomp during execution of an OpenMP program.
+
+@section sec_stats_env_vars Environment Variables
+
+This section describes the environment variables relevant to stats-gathering in libomp.
+
+@code
+KMP_STATS_FILE
+@endcode
+This environment variable is set to an output filename; if that file already exists it is appended to, *NOT OVERWRITTEN*.  If this environment variable is undefined, the statistics will be output to stderr.
+
+@code
+KMP_STATS_THREADS
+@endcode
+This environment variable controls whether thread-specific statistics are printed in addition to aggregate statistics.  When enabled, each thread's statistics will be shown as well as the collective sum of all threads.  The values "true", "on", "1", "yes" all enable per-thread statistics.
+
+@defgroup TASKING Tasking support
+These functions support tasking constructs.
+
+@defgroup USER User visible functions
+These functions can be called directly by the user, but are runtime library specific, rather than being OpenMP interfaces.
+
+*/
+
diff --git a/final/runtime/src/CMakeLists.txt b/final/runtime/src/CMakeLists.txt
new file mode 100644
index 0000000..7956ae0
--- /dev/null
+++ b/final/runtime/src/CMakeLists.txt
@@ -0,0 +1,335 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# Instantiate omp.h and kmp_config.h from their templates (and omp-tools.h when OMPT is enabled)
+configure_file("${LIBOMP_INC_DIR}/omp.h.var" "${CMAKE_CURRENT_BINARY_DIR}/omp.h" @ONLY)
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/kmp_config.h.cmake" "${CMAKE_CURRENT_BINARY_DIR}/kmp_config.h" @ONLY)
+if(${LIBOMP_OMPT_SUPPORT})
+  configure_file("${LIBOMP_INC_DIR}/omp-tools.h.var" "${CMAKE_CURRENT_BINARY_DIR}/omp-tools.h" @ONLY)
+endif()
+
+# Generate the message catalog includes kmp_i18n_id.inc and kmp_i18n_default.inc from i18n/en_US.txt
+add_custom_command(
+  OUTPUT  kmp_i18n_id.inc # written by message-converter.pl through its --enum option
+  COMMAND ${PERL_EXECUTABLE} ${LIBOMP_TOOLS_DIR}/message-converter.pl --os=${LIBOMP_PERL_SCRIPT_OS}
+    --prefix=kmp_i18n --enum=kmp_i18n_id.inc ${LIBOMP_SRC_DIR}/i18n/en_US.txt
+  DEPENDS ${LIBOMP_SRC_DIR}/i18n/en_US.txt ${LIBOMP_TOOLS_DIR}/message-converter.pl # rerun if catalog or script changes
+)
+add_custom_command(
+  OUTPUT  kmp_i18n_default.inc # written by message-converter.pl through its --default option
+  COMMAND ${PERL_EXECUTABLE} ${LIBOMP_TOOLS_DIR}/message-converter.pl --os=${LIBOMP_PERL_SCRIPT_OS}
+    --prefix=kmp_i18n --default=kmp_i18n_default.inc ${LIBOMP_SRC_DIR}/i18n/en_US.txt
+  DEPENDS ${LIBOMP_SRC_DIR}/i18n/en_US.txt ${LIBOMP_TOOLS_DIR}/message-converter.pl # rerun if catalog or script changes
+)
+
+# Preprocessor definitions (-D) shared by every source in this directory.
+# LLVM's CMake system turns on UNICODE and _UNICODE.  Both affect the
+# ittnotify code, so they are stripped here and UNICODE is set again only for
+# ittnotify_static.c on Windows (done below).
+# TODO: Fix the UNICODE usage in ittnotify code for Windows.
+remove_definitions(-DUNICODE -D_UNICODE)
+libomp_get_definitions_flags(LIBOMP_CONFIGURED_DEFINITIONS_FLAGS)
+add_definitions(${LIBOMP_CONFIGURED_DEFINITIONS_FLAGS})
+
+# Header search path (-I) shared by every source in this directory:
+# the binary dir (configured headers) first, then the source and include dirs
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+include_directories(
+  ${LIBOMP_SRC_DIR}
+  ${LIBOMP_SRC_DIR}/i18n
+  ${LIBOMP_INC_DIR}
+  ${LIBOMP_SRC_DIR}/thirdparty/ittnotify)
+if(${LIBOMP_USE_HWLOC})
+  include_directories(${LIBOMP_HWLOC_INSTALL_DIR}/include)
+endif()
+
+# Collect the source files that make up the library, grouped by language
+set(LIBOMP_CFILES)
+set(LIBOMP_CXXFILES)
+set(LIBOMP_ASMFILES)
+if(${STUBS_LIBRARY})
+  set(LIBOMP_CXXFILES kmp_stub.cpp) # C++ source: must receive the C++ flags, not the C flags
+else()
+  # Get C++ files
+  set(LIBOMP_CXXFILES
+    kmp_alloc.cpp
+    kmp_atomic.cpp
+    kmp_csupport.cpp
+    kmp_debug.cpp
+    kmp_itt.cpp
+    kmp_environment.cpp
+    kmp_error.cpp
+    kmp_global.cpp
+    kmp_i18n.cpp
+    kmp_io.cpp
+    kmp_runtime.cpp
+    kmp_settings.cpp
+    kmp_str.cpp
+    kmp_tasking.cpp
+    kmp_threadprivate.cpp
+    kmp_utility.cpp
+    kmp_barrier.cpp
+    kmp_wait_release.cpp
+    kmp_affinity.cpp
+    kmp_dispatch.cpp
+    kmp_lock.cpp
+    kmp_sched.cpp
+  )
+  if(WIN32)
+    # Windows specific files
+    libomp_append(LIBOMP_CXXFILES z_Windows_NT_util.cpp)
+    libomp_append(LIBOMP_CXXFILES z_Windows_NT-586_util.cpp)
+    libomp_append(LIBOMP_ASMFILES z_Windows_NT-586_asm.asm) # Windows assembly file
+  else()
+    # Unix specific files
+    libomp_append(LIBOMP_CXXFILES z_Linux_util.cpp)
+    libomp_append(LIBOMP_CXXFILES kmp_gsupport.cpp)
+    libomp_append(LIBOMP_ASMFILES z_Linux_asm.S) # Unix assembly file
+  endif()
+  libomp_append(LIBOMP_CFILES thirdparty/ittnotify/ittnotify_static.c LIBOMP_USE_ITT_NOTIFY)
+  libomp_append(LIBOMP_CXXFILES kmp_debugger.cpp LIBOMP_USE_DEBUGGER)
+  libomp_append(LIBOMP_CXXFILES kmp_stats.cpp LIBOMP_STATS)
+  libomp_append(LIBOMP_CXXFILES kmp_stats_timing.cpp LIBOMP_STATS)
+  libomp_append(LIBOMP_CXXFILES kmp_taskdeps.cpp)
+  libomp_append(LIBOMP_CXXFILES kmp_cancel.cpp)
+endif()
+# Files common to stubs and normal library
+libomp_append(LIBOMP_CXXFILES kmp_ftn_cdecl.cpp)
+libomp_append(LIBOMP_CXXFILES kmp_ftn_extra.cpp)
+libomp_append(LIBOMP_CXXFILES kmp_version.cpp)
+libomp_append(LIBOMP_CXXFILES ompt-general.cpp IF_TRUE LIBOMP_OMPT_SUPPORT)
+libomp_append(LIBOMP_CXXFILES tsan_annotations.cpp IF_TRUE LIBOMP_TSAN_SUPPORT)
+
+set(LIBOMP_SOURCE_FILES ${LIBOMP_CFILES} ${LIBOMP_CXXFILES} ${LIBOMP_ASMFILES})
+# For Windows, there is a resource file (.rc -> .res) that is also compiled
+libomp_append(LIBOMP_SOURCE_FILES libomp.rc WIN32)
+
+# Compute one flag string per source kind
+libomp_get_cflags(LIBOMP_CONFIGURED_CFLAGS)
+libomp_get_cxxflags(LIBOMP_CONFIGURED_CXXFLAGS)
+libomp_get_asmflags(LIBOMP_CONFIGURED_ASMFLAGS)
+# Attach each flag string to the sources of the matching kind
+set_property(SOURCE ${LIBOMP_CFILES} PROPERTY COMPILE_FLAGS "${LIBOMP_CONFIGURED_CFLAGS}")
+set_property(SOURCE ${LIBOMP_CXXFILES} PROPERTY COMPILE_FLAGS "${LIBOMP_CONFIGURED_CXXFLAGS}")
+set_property(SOURCE ${LIBOMP_ASMFILES} PROPERTY COMPILE_FLAGS "${LIBOMP_CONFIGURED_ASMFLAGS}")
+# On Unix-like systems the assembly files are handed to the C compiler
+if(NOT WIN32)
+  set_property(SOURCE ${LIBOMP_ASMFILES} PROPERTY LANGUAGE C)
+endif()
+
+# The runtime neither needs nor wants a dependency on the standard C++ library
+# even though its sources are C++: unless LIBOMP_USE_STDCPPLIB is set, clear
+# CMake's implicit C++ link libraries and link the target as C.
+if(NOT ${LIBOMP_USE_STDCPPLIB})
+  set(CMAKE_CXX_IMPLICIT_LINK_LIBRARIES)
+  set(LIBOMP_LINKER_LANGUAGE C)
+else()
+  set(LIBOMP_LINKER_LANGUAGE CXX)
+endif()
+
+# Define the OpenMP library target; the output file name is taken verbatim
+# from LIBOMP_LIB_FILE, so the default prefix and suffix are blanked out
+libomp_get_ldflags(LIBOMP_CONFIGURED_LDFLAGS)
+add_library(omp ${LIBOMP_LIBRARY_KIND} ${LIBOMP_SOURCE_FILES})
+set_target_properties(omp PROPERTIES
+  LINKER_LANGUAGE ${LIBOMP_LINKER_LANGUAGE}
+  LINK_FLAGS "${LIBOMP_CONFIGURED_LDFLAGS}"
+  OUTPUT_NAME "${LIBOMP_LIB_FILE}" PREFIX "" SUFFIX ""
+)
+
+# Record where the library lands in the build tree (the unit tester needs it)
+# and publish the result to the parent scope as LIBOMP_LIBRARY_DIR
+if(WIN32)
+  get_target_property(LIBOMP_LIBRARY_DIR omp RUNTIME_OUTPUT_DIRECTORY)
+else()
+  get_target_property(LIBOMP_LIBRARY_DIR omp LIBRARY_OUTPUT_DIRECTORY)
+endif()
+if(NOT LIBOMP_LIBRARY_DIR)
+  # No output directory property on the target: use this directory's binary dir
+  set(LIBOMP_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
+endif()
+set(LIBOMP_LIBRARY_DIR ${LIBOMP_LIBRARY_DIR} PARENT_SCOPE)
+
+# After each build of omp, create the libgomp and libiomp5 symbolic links
+# next to the library in the build tree (not done on Windows)
+if(NOT WIN32)
+  foreach(libomp_link_name libgomp libiomp5)
+    add_custom_command(TARGET omp POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} ${libomp_link_name}${LIBOMP_LIBRARY_SUFFIX}
+      WORKING_DIRECTORY ${LIBOMP_LIBRARY_DIR}
+    )
+  endforeach()
+endif()
+
+# The generated *.inc files must exist before any source is compiled,
+# so omp depends on a helper target that produces them
+add_custom_target(libomp-needed-headers DEPENDS kmp_i18n_id.inc kmp_i18n_default.inc)
+add_dependencies(omp libomp-needed-headers)
+
+# Libraries from LIBOMP_CONFIGURED_LIBFLAGS (plus CMAKE_DL_LIBS) go on the link line
+libomp_get_libflags(LIBOMP_CONFIGURED_LIBFLAGS)
+target_link_libraries(omp ${LIBOMP_CONFIGURED_LIBFLAGS} ${CMAKE_DL_LIBS})
+
+# Windows specific build rules: resource file, .def file, MASM definitions and import library
+if(WIN32)
+  configure_file(libomp.rc.var libomp.rc @ONLY)
+
+  # Make omp wait for the .def file (its generating rule is at the end of this block)
+  add_custom_target(libomp-needed-windows-files DEPENDS ${LIBOMP_LIB_NAME}.def)
+  add_dependencies(omp libomp-needed-windows-files)
+  # z_Windows_NT-586_asm.asm requires definitions to be sent via command line
+  # It only needs the architecture macro and OMPT_SUPPORT=0|1
+  libomp_append(LIBOMP_MASM_DEFINITIONS "-D_M_IA32" IF_TRUE IA32)
+  libomp_append(LIBOMP_MASM_DEFINITIONS "-D_M_AMD64" IF_TRUE INTEL64)
+  libomp_append(LIBOMP_MASM_DEFINITIONS "-DOMPT_SUPPORT" IF_TRUE_1_0 LIBOMP_OMPT_SUPPORT)
+  libomp_list_to_string("${LIBOMP_MASM_DEFINITIONS}" LIBOMP_MASM_DEFINITIONS)
+  set_property(SOURCE z_Windows_NT-586_asm.asm APPEND_STRING PROPERTY COMPILE_FLAGS " ${LIBOMP_MASM_DEFINITIONS}")
+  set_source_files_properties(thirdparty/ittnotify/ittnotify_static.c PROPERTIES COMPILE_DEFINITIONS "UNICODE")
+
+  # Create the Windows import library.
+  # The import library is "re-linked" to include kmp_import.cpp, which prevents
+  # linking of both the Visual Studio OpenMP and the newly built OpenMP
+  set_source_files_properties(kmp_import.cpp PROPERTIES COMPILE_FLAGS "${LIBOMP_CONFIGURED_CFLAGS}")
+  set(LIBOMP_IMP_LIB_FILE ${LIBOMP_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX})
+  set(LIBOMP_GENERATED_IMP_LIB_FILENAME ${LIBOMP_LIB_FILE}${CMAKE_STATIC_LIBRARY_SUFFIX})
+  set_target_properties(omp PROPERTIES
+    VERSION ${LIBOMP_VERSION_MAJOR}.${LIBOMP_VERSION_MINOR} # uses /version flag
+    IMPORT_PREFIX "" IMPORT_SUFFIX "" # control generated import library name when building omp
+    ARCHIVE_OUTPUT_NAME ${LIBOMP_GENERATED_IMP_LIB_FILENAME}
+  )
+  # Locate the import library generated while building omp (falls back to the current binary dir)
+  get_target_property(LIBOMP_IMPORT_LIB_DIRECTORY omp ARCHIVE_OUTPUT_DIRECTORY)
+  if(LIBOMP_IMPORT_LIB_DIRECTORY)
+    set(LIBOMP_GENERATED_IMP_LIB ${LIBOMP_IMPORT_LIB_DIRECTORY}/${LIBOMP_GENERATED_IMP_LIB_FILENAME})
+  else()
+    set(LIBOMP_GENERATED_IMP_LIB ${CMAKE_CURRENT_BINARY_DIR}/${LIBOMP_GENERATED_IMP_LIB_FILENAME})
+  endif()
+  set_source_files_properties(${LIBOMP_GENERATED_IMP_LIB} PROPERTIES GENERATED TRUE EXTERNAL_OBJECT TRUE)
+  # New import library ompimp = the previously generated import library + kmp_import.cpp
+  add_library(ompimp STATIC ${LIBOMP_GENERATED_IMP_LIB} kmp_import.cpp)
+  set_target_properties(ompimp PROPERTIES
+    PREFIX "" SUFFIX "" OUTPUT_NAME "${LIBOMP_IMP_LIB_FILE}"
+    LINKER_LANGUAGE C
+  )
+  add_dependencies(ompimp omp) # ensure generated import library is created first
+
+  # Generate the .def file that designates the exported functions from the dllexports list
+  libomp_get_gdflags(LIBOMP_GDFLAGS) # generate-def.pl flags (Windows only)
+  libomp_string_to_list("${LIBOMP_GDFLAGS}" LIBOMP_GDFLAGS)
+  add_custom_command(
+    OUTPUT  ${LIBOMP_LIB_NAME}.def
+    COMMAND ${PERL_EXECUTABLE} ${LIBOMP_TOOLS_DIR}/generate-def.pl ${LIBOMP_GDFLAGS}
+      -o ${LIBOMP_LIB_NAME}.def ${CMAKE_CURRENT_SOURCE_DIR}/dllexports
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/dllexports ${LIBOMP_TOOLS_DIR}/generate-def.pl
+  )
+endif()
+
+# Fortran module files: a single compilation step produces both
+# omp_lib.mod and omp_lib_kinds.mod
+if(${LIBOMP_FORTRAN_MODULES})
+  # Instantiate omp_lib.h, omp_lib.f and omp_lib.f90 from their .var templates
+  foreach(libomp_fortran_file omp_lib.h omp_lib.f omp_lib.f90)
+    configure_file(${LIBOMP_INC_DIR}/${libomp_fortran_file}.var ${libomp_fortran_file} @ONLY)
+  endforeach()
+  # gfortran needs -fno-range-check to build modules with the
+  # omp_sched_monotonic integer parameter
+  if(CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
+    set(ADDITIONAL_Fortran_FLAGS "-fno-range-check")
+  endif()
+  libomp_get_fflags(LIBOMP_CONFIGURED_FFLAGS)
+  # Compile omp_lib.f90 when CMAKE_Fortran_COMPILER_SUPPORTS_F90 is set, omp_lib.f otherwise
+  set(LIBOMP_FORTRAN_SOURCE_FILE omp_lib.f)
+  if(CMAKE_Fortran_COMPILER_SUPPORTS_F90)
+    set(LIBOMP_FORTRAN_SOURCE_FILE omp_lib.f90)
+  endif()
+  add_custom_command(
+    OUTPUT omp_lib.mod omp_lib_kinds.mod
+    COMMAND ${CMAKE_Fortran_COMPILER} -c ${ADDITIONAL_Fortran_FLAGS}
+            ${LIBOMP_CONFIGURED_FFLAGS} ${LIBOMP_FORTRAN_SOURCE_FILE}
+    DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${LIBOMP_FORTRAN_SOURCE_FILE} ${CMAKE_CURRENT_BINARY_DIR}/omp_lib.h
+  )
+  add_custom_target(libomp-mod ALL DEPENDS omp_lib.mod omp_lib_kinds.mod)
+  set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES omp_lib${CMAKE_C_OUTPUT_EXTENSION})
+endif()
+
+# Copy files into the exports/ directory when LIBOMP_COPY_EXPORTS is set
+if(${LIBOMP_COPY_EXPORTS})
+  include(LibompExports)
+endif()
+
+# Micro tests to run after the library has been built; the rules live in
+# cmake/LibompMicroTests.cmake and are gathered under libomp-micro-tests
+include(LibompMicroTests)
+add_custom_target(libomp-micro-tests)
+add_dependencies(libomp-micro-tests libomp-test-deps)
+if(NOT ${MIC} AND NOT CMAKE_CROSSCOMPILING)
+  add_dependencies(libomp-micro-tests libomp-test-touch)
+endif()
+if(NOT WIN32 AND NOT APPLE)
+  # Neither of these two tests is enabled on Windows or Apple platforms
+  add_dependencies(libomp-micro-tests libomp-test-relo)
+  add_dependencies(libomp-micro-tests libomp-test-execstack)
+endif()
+if(${MIC})
+  add_dependencies(libomp-micro-tests libomp-test-instr)
+endif()
+
+# Install rules
+# We want to install libomp in DESTDIR/CMAKE_INSTALL_PREFIX/lib
+# We want to install headers in DESTDIR/CMAKE_INSTALL_PREFIX/include
+if(${OPENMP_STANDALONE_BUILD})
+  set(LIBOMP_HEADERS_INSTALL_PATH include)
+else()
+  string(REGEX MATCH "[0-9]+\\.[0-9]+(\\.[0-9]+)?" CLANG_VERSION "${PACKAGE_VERSION}")
+  set(LIBOMP_HEADERS_INSTALL_PATH "${OPENMP_INSTALL_LIBDIR}/clang/${CLANG_VERSION}/include")
+endif()
+if(WIN32)
+  install(TARGETS omp RUNTIME DESTINATION bin)
+  install(TARGETS ompimp ARCHIVE DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+  # Create aliases (regular copies) for backwards compatibility; like the Unix symlinks they honor DESTDIR
+  set(LIBOMP_ALIASES "libiomp5md")
+  foreach(alias IN LISTS LIBOMP_ALIASES)
+    install(CODE "execute_process(COMMAND \"\${CMAKE_COMMAND}\" -E copy \"${LIBOMP_LIB_FILE}\"
+      \"${alias}${LIBOMP_LIBRARY_SUFFIX}\" WORKING_DIRECTORY \"\$ENV{DESTDIR}\${CMAKE_INSTALL_PREFIX}/bin\")")
+    install(CODE "execute_process(COMMAND \"\${CMAKE_COMMAND}\" -E copy \"${LIBOMP_IMP_LIB_FILE}\"
+      \"${alias}${CMAKE_STATIC_LIBRARY_SUFFIX}\" WORKING_DIRECTORY \"\$ENV{DESTDIR}\${CMAKE_INSTALL_PREFIX}/${OPENMP_INSTALL_LIBDIR}\")")
+  endforeach()
+else()
+
+  install(TARGETS omp ${LIBOMP_INSTALL_KIND} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+
+  if(${LIBOMP_INSTALL_ALIASES})
+    # Create aliases (symlinks) of the library for backwards compatibility
+    set(LIBOMP_ALIASES "libgomp;libiomp5")
+    foreach(alias IN LISTS LIBOMP_ALIASES)
+      install(CODE "execute_process(COMMAND \"\${CMAKE_COMMAND}\" -E create_symlink \"${LIBOMP_LIB_FILE}\"
+        \"${alias}${LIBOMP_LIBRARY_SUFFIX}\" WORKING_DIRECTORY
+        \"\$ENV{DESTDIR}\${CMAKE_INSTALL_PREFIX}/${OPENMP_INSTALL_LIBDIR}\")")
+    endforeach()
+  endif()
+endif()
+install(
+  FILES
+  ${CMAKE_CURRENT_BINARY_DIR}/omp.h
+  DESTINATION ${LIBOMP_HEADERS_INSTALL_PATH}
+)
+if(${LIBOMP_OMPT_SUPPORT})
+  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/omp-tools.h DESTINATION ${LIBOMP_HEADERS_INSTALL_PATH})
+  # install under legacy name ompt.h
+  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/omp-tools.h DESTINATION ${LIBOMP_HEADERS_INSTALL_PATH} RENAME ompt.h)
+endif()
+if(${LIBOMP_FORTRAN_MODULES})
+  install(FILES
+    ${CMAKE_CURRENT_BINARY_DIR}/omp_lib.h
+    ${CMAKE_CURRENT_BINARY_DIR}/omp_lib.mod
+    ${CMAKE_CURRENT_BINARY_DIR}/omp_lib_kinds.mod
+    DESTINATION ${LIBOMP_HEADERS_INSTALL_PATH}
+  )
+endif()
diff --git a/final/runtime/src/dllexports b/final/runtime/src/dllexports
new file mode 100644
index 0000000..f76619e
--- /dev/null
+++ b/final/runtime/src/dllexports
@@ -0,0 +1,1195 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# Deprecated entry points (numbers are reserved):
+- __kmpc_barrier_reduce_master              109
+- __kmpc_end_barrier_reduce_master          122
+- __kmpc_for_init_4                         131
+- __kmpc_for_init_8                         132
+- __kmpc_for_next_4                         133
+- __kmpc_for_next_8                         134
+- __kmpc_fork_call_bound                    139
+- __kmpc_reduce_master_nowait               149
+- __kmpc_omp_task_begin                     194
+- __kmpc_omp_task_complete                  195
+- kmpc_sharable_calloc                      218
+- kmpc_sharable_free                        219
+- kmpc_sharable_malloc                      220
+- kmpc_sharable_realloc                     221
+- kmpc_aligned_sharable_malloc              223
+- mpai4a                                    500
+- mpai8a                                    501
+- mpar4a                                    502
+- mpar8a                                    503
+- mpax4x                                    504
+- mpax8x                                    505
+- mpobar                                    506
+- mpoebr                                    507
+- mpofork                                   508
+- mpofrk                                    509
+- mpojoin                                   510
+- mpoxbr                                    511
+- mppadj                                    512
+- mppaff                                    513
+- mppbar                                    514
+- mppbeg                                    515
+- mppdeo                                    516
+- mppdnx                                    517
+- mppdnxd                                   518
+- mppdon                                    519
+- mppdxo                                    520
+- mppebr                                    521
+- mppecs                                    522
+- mppems                                    523
+- mppenc                                    524
+- mppend                                    525
+- mppepa                                    526
+- mppesp                                    527
+- mppfkd                                    528
+- mppfkt                                    529
+- mppfork                                   530
+- mppfrk                                    531
+- mppioa                                    532
+- mppiws                                    533
+- mppjoin                                   534
+- mppnth                                    535
+- mpppqa                                    536
+- mpppqc                                    537
+- mpppqs                                    538
+- mpptid                                    539
+- mpptpa                                    540
+- mpptpc                                    541
+- mpptpz                                    542
+- mppvsy                                    543
+- mppxbr                                    544
+- mppxcs                                    545
+- mppxms                                    546
+- mppxnc                                    547
+- mppxpa                                    548
+- mppxpr                                    549
+- mppxsp                                    550
+- mppxth                                    551
+- mpsbar                                    552
+- mpscpr                                    597
+- mpsebr                                    553
+- mpserd                                    554
+- mpsfd4                                    555
+- mpsfd8                                    556
+- mpsid4                                    557
+- mpsid8                                    558
+- mpsnd4                                    559
+- mpsnd8                                    560
+- mpsont                                    561
+- mpsred                                    562
+- mpsunt                                    563
+- mpsxbr                                    564
+- mpsxrd                                    565
+- mptadj                                    566
+- mptaff                                    567
+- mptbar                                    568
+- mptdeo                                    569
+- mptdin                                    570
+- mptdind                                   571
+- mptdnx                                    572
+- mptdnxd                                   573
+- mptdon                                    574
+- mptdxo                                    575
+- mptebr                                    576
+- mptecs                                    577
+- mptems                                    578
+- mptenc                                    579
+- mptepa                                    580
+- mptesp                                    581
+- mptfkd                                    582
+- mptppa                                    583
+- mptppc                                    584
+- mptpps                                    585
+- mpttpa                                    586
+- mpttpc                                    587
+- mpttpz                                    588
+- mptvsy                                    589
+- mptxbr                                    590
+- mptxcs                                    591
+- mptxms                                    592
+- mptxnc                                    593
+- mptxpa                                    594
+- mptxsp                                    595
+- mppcpr                                    596
+- ftn_set_library_gang                      736
+- kmp_set_library_gang
+- kmp_sharable_calloc                       760
+- kmp_sharable_free                         761
+- kmp_sharable_malloc                       762
+- kmp_sharable_realloc                      763
+- kmp_aligned_sharable_malloc               764
+- kmp_deferred_atomic_add_i4                765
+- kmp_deferred_atomic_add_i8                766
+- kmp_deferred_atomic_add_r4                767
+- kmp_deferred_atomic_add_r8                768
+- kmp_lock_cond_wait                        770
+- kmp_lock_cond_signal                      771
+- kmp_lock_cond_broadcast                   772
+- kmp_nest_lock_cond_wait                   773
+- kmp_nest_lock_cond_signal                 774
+- kmp_nest_lock_cond_broadcast              775
+- kmp_get_process_num                       781
+- kmp_get_num_processes                     782
+- kmp_get_process_thread_num                783
+- kmp_private_mmap                          784   # not implemented?
+- kmp_sharable_mmap                         785   # not implemented?
+- kmp_private_munmap                        786   # not implemented?
+- kmp_sharable_munmap                       787   # not implemented?
+- kmp_is_sharable                           788   # not implemented?
+
+%ifndef stub
+
+
+    #
+    # The following entry points are added so that the backtraces from
+    # the tools contain meaningful names for all the functions that might
+    # appear in a backtrace of a thread which is blocked in the RTL.
+    #
+
+    # Regular entry points
+        __kmp_wait_4
+        __kmp_fork_call
+        __kmp_invoke_microtask
+    %ifdef KMP_USE_MONITOR
+        __kmp_launch_monitor
+        __kmp_reap_monitor
+    %endif
+        __kmp_launch_worker
+        __kmp_reap_worker
+        __kmp_acquire_tas_lock
+        __kmp_acquire_nested_tas_lock
+        __kmp_acquire_ticket_lock
+        __kmp_acquire_nested_ticket_lock
+        __kmp_acquire_queuing_lock
+        __kmp_acquire_nested_queuing_lock
+        __kmp_acquire_drdpa_lock
+        __kmp_acquire_nested_drdpa_lock
+
+    %ifdef KMP_DEBUG
+        # Provides console output capability for applications that do not have it.
+        __kmp_printf
+    %endif
+
+    %ifdef USE_DEBUGGER
+        __kmp_debugging                         DATA
+        __kmp_omp_debug_struct_info             DATA
+    %endif
+
+    # Symbols for MS mutual detection:
+    _You_must_link_with_exactly_one_OpenMP_library    DATA
+    _You_must_link_with_Intel_OpenMP_library          DATA
+    __kmp_wait_64
+    __kmp_release_64
+
+#    VT_getthid                              1
+#    vtgthid                                 2
+
+    __kmpc_atomic_4                         100
+    __kmpc_atomic_8                         101
+    __kmpc_atomic_fixed4_add                102
+    __kmpc_atomic_fixed8_add                103
+    __kmpc_atomic_float4_add                104
+    __kmpc_atomic_float8_add                105
+    __kmpc_barrier                          106
+    __kmpc_barrier_master                   107
+    __kmpc_barrier_master_nowait            108
+    __kmpc_begin                            110
+    __kmpc_bound_num_threads                111
+    __kmpc_bound_thread_num                 112
+    __kmpc_critical                         113
+    __kmpc_dispatch_fini_4                  114
+    __kmpc_dispatch_fini_8                  115
+    __kmpc_dispatch_init_4                  116
+    __kmpc_dispatch_init_8                  117
+    __kmpc_dispatch_next_4                  118
+    __kmpc_dispatch_next_8                  119
+    __kmpc_end                              120
+    __kmpc_end_barrier_master               121
+    __kmpc_end_critical                     123
+    __kmpc_end_master                       124
+    __kmpc_end_ordered                      125
+    __kmpc_end_serialized_parallel          126
+    __kmpc_end_single                       127
+#    __kmpc_end_taskq                        128
+#    __kmpc_end_taskq_task                   129
+    __kmpc_flush                            130
+    __kmpc_for_static_fini                  135
+    __kmpc_for_static_init_4                136
+    __kmpc_for_static_init_8                137
+    __kmpc_fork_call                        138
+    __kmpc_global_num_threads               140
+    __kmpc_global_thread_num                141
+    __kmpc_in_parallel                      142
+    __kmpc_invoke_task_func                 143
+    __kmpc_master                           144
+    __kmpc_ok_to_fork                       145
+    __kmpc_ordered                          146
+    __kmpc_pop_num_threads                  147
+    __kmpc_push_num_threads                 148
+    __kmpc_serialized_parallel              150
+    __kmpc_single                           151
+#    __kmpc_task                             152
+#    __kmpc_task_buffer                      153
+#    __kmpc_taskq                            154
+#    __kmpc_taskq_task                       155
+    __kmpc_threadprivate                    156
+    __kmpc_threadprivate_cached             157
+    __kmpc_threadprivate_register           158
+    __kmpc_threadprivate_register_vec       159
+#    __kmpc_ssp_begin                        160
+#    __kmpc_ssp_fork                         161
+#    __kmpc_ssp_end                          162
+#    __kmpc_ssp_post_4                       163
+#    __kmpc_ssp_post_8                       164
+#    __kmpc_ssp_wait_4                       165
+#    __kmpc_ssp_wait_8                       166
+#    __kmpc_ssp_distance_4                   167
+#    __kmpc_ssp_distance_8                   168
+#    __kmpc_in_ssp                           169
+#    __kmpc_ssp_thread_num                   170
+#    __kmpc_ssp_num_threads                  171
+    __kmpc_copyprivate                      172
+#    __kmpc_ssp_get_max_threads              173
+#    __kmpc_ssp_set_max_threads              174
+    __kmpc_init_lock                        175
+    __kmpc_destroy_lock                     176
+    __kmpc_set_lock                         177
+    __kmpc_unset_lock                       178
+    __kmpc_test_lock                        179
+    __kmpc_init_nest_lock                   180
+    __kmpc_destroy_nest_lock                181
+    __kmpc_set_nest_lock                    182
+    __kmpc_unset_nest_lock                  183
+    __kmpc_test_nest_lock                   184
+#    __kmpc_ssp_init_thread                  185
+#    __kmpc_ssp_set_event                    186
+    __kmpc_reduce_nowait                    187
+    __kmpc_end_reduce_nowait                188
+    __kmpc_reduce                           189
+    __kmpc_end_reduce                       190
+
+    __kmpc_omp_task_alloc                   191
+    __kmpc_omp_task                         192
+    __kmpc_omp_taskwait                     193
+    __kmpc_omp_task_begin_if0               196
+    __kmpc_omp_task_complete_if0            197
+    __kmpc_omp_task_parts                   198
+
+#   __omp_collector_api                  199
+
+    # These functions are for testing purposes only; they do not need a stable ordinal number:
+    __kmp_get_reduce_method
+
+%endif  # not defined stub
+
+kmpc_calloc                                 200
+kmpc_free                                   201
+%ifndef stub
+    # These functions are exported from libguide, but declared neither in omp.h nor in omp_lib.h.
+#    kmpc_get_banner                         202
+#    kmpc_get_poolmode                       203
+#    kmpc_get_poolsize                       204
+#    kmpc_get_poolstat                       205
+#    kmpc_poolprint                          207
+#    kmpc_print_banner                       208
+#    kmpc_set_poolmode                       214
+#    kmpc_set_poolsize                       215
+%endif
+kmpc_malloc                                 206
+kmpc_realloc                                209
+kmpc_set_blocktime                          211
+kmpc_set_library                            212
+# kmpc_set_parallel_name                      213
+kmpc_set_stacksize                          216
+kmpc_set_stacksize_s                        222
+# kmpc_set_stats                              217
+kmpc_set_defaults                           224
+
+%ifndef stub
+        __kmpc_for_static_init_8u           225
+        __kmpc_dispatch_init_8u             226
+        __kmpc_dispatch_next_8u             227
+        __kmpc_dispatch_fini_8u             228
+        __kmpc_for_static_init_4u           229
+        __kmpc_dispatch_init_4u             230
+        __kmpc_dispatch_next_4u             231
+        __kmpc_dispatch_fini_4u             232
+%endif
+
+%ifndef stub
+    __kmpc_get_taskid                       233
+    __kmpc_get_parent_taskid                234
+%endif
+
+%ifndef stub
+        __kmpc_omp_taskyield                235
+#    __kmpc_place_threads                    236
+%endif
+
+%ifndef stub
+        __kmpc_push_proc_bind               237
+        __kmpc_taskgroup                    238
+        __kmpc_end_taskgroup                239
+        __kmpc_push_num_teams               240
+        __kmpc_fork_teams                   241
+        __kmpc_omp_task_with_deps           242
+        __kmpc_omp_wait_deps                243
+        __kmpc_cancel                       244
+        __kmpc_cancellationpoint            245
+        __kmpc_cancel_barrier               246
+        __kmpc_dist_for_static_init_4       247
+        __kmpc_dist_for_static_init_4u      248
+        __kmpc_dist_for_static_init_8       249
+        __kmpc_dist_for_static_init_8u      250
+        __kmpc_dist_dispatch_init_4         251
+        __kmpc_dist_dispatch_init_4u        252
+        __kmpc_dist_dispatch_init_8         253
+        __kmpc_dist_dispatch_init_8u        254
+        __kmpc_team_static_init_4           255
+        __kmpc_team_static_init_4u          256
+        __kmpc_team_static_init_8           257
+        __kmpc_team_static_init_8u          258
+%endif
+
+%ifndef stub
+        __kmpc_proxy_task_completed         259
+        __kmpc_proxy_task_completed_ooo     260
+        __kmpc_doacross_init                261
+        __kmpc_doacross_wait                262
+        __kmpc_doacross_post                263
+        __kmpc_doacross_fini                264
+        __kmpc_taskloop                     266
+        __kmpc_critical_with_hint           270
+%endif
+kmpc_aligned_malloc                         265
+kmpc_set_disp_num_buffers                   267
+
+%ifndef stub
+        __kmpc_task_reduction_init          268
+        __kmpc_task_reduction_get_th_data   269
+# USED ABOVE __kmpc_critical_with_hint    270
+        __kmpc_get_target_offload           271
+        __kmpc_omp_reg_task_with_affinity   272
+        __kmpc_pause_resource               273
+        __kmpc_task_reduction_modifier_init 274
+        __kmpc_task_reduction_modifier_fini 275
+        __kmpc_task_allow_completion_event  276
+        __kmpc_taskred_init                 277
+        __kmpc_taskred_modifier_init        278
+%endif
+
+# User API entry points that have both lower- and upper- case versions for Fortran.
+# Number for lowercase version is indicated.  Number for uppercase is obtained by adding 1000.
+# User API entry points are entry points that start with 'kmp_' or 'omp_'.
+
+omp_destroy_lock                            700
+omp_destroy_nest_lock                       701
+omp_get_dynamic                             702
+omp_get_max_threads                         703
+omp_get_nested                              704
+omp_get_num_procs                           705
+omp_get_num_threads                         706
+omp_get_thread_num                          707
+omp_get_wtick                               708
+omp_get_wtime                               709
+omp_in_parallel                             710
+omp_init_lock                               711
+omp_init_nest_lock                          712
+omp_set_dynamic                             713
+omp_set_lock                                714
+omp_set_nest_lock                           715
+omp_set_nested                              716
+omp_set_num_threads                         717
+omp_test_lock                               718
+omp_test_nest_lock                          719
+omp_unset_lock                              720
+omp_unset_nest_lock                         721
+
+ompc_set_dynamic                            722
+ompc_set_nested                             723
+ompc_set_num_threads                        724
+
+kmp_calloc                                  725
+kmp_free                                    726
+kmp_get_blocktime                           727
+kmp_get_library                             728
+kmp_get_stacksize                           729
+kmp_malloc                                  730
+#kmp_print_banner                            731
+kmp_realloc                                 732
+kmp_set_blocktime                           734
+kmp_set_library                             735
+kmp_set_library_serial                      737
+kmp_set_library_throughput                  738
+kmp_set_library_turnaround                  739
+# kmp_set_parallel_name                       740
+kmp_set_stacksize                           741
+# kmp_set_stats                               742
+kmp_get_num_known_threads                   743
+kmp_set_stacksize_s                         744
+kmp_get_stacksize_s                         745
+kmp_set_defaults                            746
+kmp_aligned_malloc                          747
+kmp_set_warnings_on                         779
+kmp_set_warnings_off                        780
+
+    omp_get_active_level                    789
+    omp_get_level                           790
+    omp_get_ancestor_thread_num             791
+    omp_get_team_size                       792
+    omp_get_thread_limit                    793
+    omp_get_max_active_levels               794
+    omp_set_max_active_levels               795
+    omp_get_schedule                        796
+    omp_set_schedule                        797
+    ompc_set_max_active_levels              798
+    ompc_set_schedule                       799
+    ompc_get_ancestor_thread_num            800
+    ompc_get_team_size                      801
+    kmp_set_affinity                        850
+    kmp_get_affinity                        851
+    kmp_get_affinity_max_proc               852
+    kmp_create_affinity_mask                853
+    kmp_destroy_affinity_mask               854
+    kmp_set_affinity_mask_proc              855
+    kmpc_set_affinity_mask_proc             856
+    kmp_unset_affinity_mask_proc            857
+    kmpc_unset_affinity_mask_proc           858
+    kmp_get_affinity_mask_proc              859
+    kmpc_get_affinity_mask_proc             860
+
+    omp_in_final                            861
+
+    omp_get_proc_bind                       862
+   #omp_set_proc_bind                       863
+   #omp_curr_proc_bind                      864
+    omp_get_num_teams                       865
+    omp_get_team_num                        866
+    omp_get_cancellation                    867
+    kmp_get_cancellation_status             868
+    omp_is_initial_device                   869
+    omp_set_default_device                  879
+    omp_get_default_device                  880
+    omp_get_num_devices                     881
+
+    omp_init_lock_with_hint                 870
+    omp_init_nest_lock_with_hint            871
+    omp_get_max_task_priority               872
+    omp_get_num_places                      873
+    omp_get_place_num_procs                 874
+    omp_get_place_proc_ids                  875
+    omp_get_place_num                       876
+    omp_get_partition_num_places            877
+    omp_get_partition_place_nums            878
+    omp_get_initial_device                  882
+    %ifdef stub
+        omp_target_alloc                    883
+        omp_target_free                     884
+        omp_target_is_present               885
+        omp_target_memcpy                   886
+        omp_target_memcpy_rect              887
+        omp_target_associate_ptr            888
+        omp_target_disassociate_ptr         889
+    %endif
+
+kmp_set_disp_num_buffers                    890
+
+    omp_control_tool                        891
+    omp_set_default_allocator               892
+    omp_get_default_allocator               893
+    omp_alloc                               894
+    omp_free                                895
+    omp_get_device_num                      896
+    omp_init_allocator                      897
+    omp_destroy_allocator                   898
+    %ifndef stub
+        __kmpc_set_default_allocator
+        __kmpc_get_default_allocator
+        __kmpc_alloc
+        __kmpc_free
+        __kmpc_init_allocator
+        __kmpc_destroy_allocator
+    %endif
+    omp_set_affinity_format                 748
+    omp_get_affinity_format                 749
+    omp_display_affinity                    750
+    omp_capture_affinity                    751
+    ompc_set_affinity_format                752
+    ompc_get_affinity_format                753
+    ompc_display_affinity                   754
+    ompc_capture_affinity                   755
+    omp_pause_resource                      756
+    omp_pause_resource_all                  757
+    omp_get_supported_active_levels         758
+    omp_fulfill_event                       759
+
+    omp_null_allocator                     DATA
+    omp_default_mem_alloc                  DATA
+    omp_large_cap_mem_alloc                DATA
+    omp_const_mem_alloc                    DATA
+    omp_high_bw_mem_alloc                  DATA
+    omp_low_lat_mem_alloc                  DATA
+    omp_cgroup_mem_alloc                   DATA
+    omp_pteam_mem_alloc                    DATA
+    omp_thread_mem_alloc                   DATA
+
+    omp_default_mem_space                  DATA
+    omp_large_cap_mem_space                DATA
+    omp_const_mem_space                    DATA
+    omp_high_bw_mem_space                  DATA
+    omp_low_lat_mem_space                  DATA
+
+%ifndef stub
+    # Ordinals between 900 and 999 are reserved
+
+    # Ordinals between 1000 and 1999 are reserved
+    # for user-callable uppercase Fortran entries.
+
+
+    # ATOMIC entries
+
+    %ifdef HAVE_QUAD
+    __kmpc_atomic_cmplx16_div              2000
+    %endif
+
+    __kmpc_atomic_fixed1_add               2001
+    __kmpc_atomic_fixed1_andb              2002
+    __kmpc_atomic_fixed1_div               2003
+    __kmpc_atomic_fixed1u_div              2004
+    __kmpc_atomic_fixed1_mul               2005
+    __kmpc_atomic_fixed1_orb               2006
+    __kmpc_atomic_fixed1_shl               2007
+    __kmpc_atomic_fixed1_shr               2008
+    __kmpc_atomic_fixed1u_shr              2009
+    __kmpc_atomic_fixed1_sub               2010
+    __kmpc_atomic_fixed1_xor               2011
+
+    __kmpc_atomic_fixed2_add               2012
+    __kmpc_atomic_fixed2_andb              2013
+    __kmpc_atomic_fixed2_div               2014
+    __kmpc_atomic_fixed2u_div              2015
+    __kmpc_atomic_fixed2_mul               2016
+    __kmpc_atomic_fixed2_orb               2017
+    __kmpc_atomic_fixed2_shl               2018
+    __kmpc_atomic_fixed2_shr               2019
+    __kmpc_atomic_fixed2u_shr              2020
+    __kmpc_atomic_fixed2_sub               2021
+    __kmpc_atomic_fixed2_xor               2022
+
+   #__kmpc_atomic_fixed4_add           # declared above #102
+    __kmpc_atomic_fixed4_sub               2024
+   #__kmpc_atomic_float4_add           # declared above #104
+    __kmpc_atomic_float4_sub               2026
+   #__kmpc_atomic_fixed8_add           # declared above #103
+    __kmpc_atomic_fixed8_sub               2028
+   #__kmpc_atomic_float8_add           # declared above #105
+    __kmpc_atomic_float8_sub               2030
+
+    __kmpc_atomic_fixed4_andb              2031
+    __kmpc_atomic_fixed4_div               2032
+    __kmpc_atomic_fixed4u_div              2033
+    __kmpc_atomic_fixed4_mul               2034
+    __kmpc_atomic_fixed4_orb               2035
+    __kmpc_atomic_fixed4_shl               2036
+    __kmpc_atomic_fixed4_shr               2037
+    __kmpc_atomic_fixed4u_shr              2038
+    __kmpc_atomic_fixed4_xor               2039
+    __kmpc_atomic_fixed8_andb              2040
+    __kmpc_atomic_fixed8_div               2041
+    __kmpc_atomic_fixed8u_div              2042
+    __kmpc_atomic_fixed8_mul               2043
+    __kmpc_atomic_fixed8_orb               2044
+    __kmpc_atomic_fixed8_shl               2045
+    __kmpc_atomic_fixed8_shr               2046
+    __kmpc_atomic_fixed8u_shr              2047
+    __kmpc_atomic_fixed8_xor               2048
+    __kmpc_atomic_float4_div               2049
+    __kmpc_atomic_float4_mul               2050
+    __kmpc_atomic_float8_div               2051
+    __kmpc_atomic_float8_mul               2052
+
+    __kmpc_atomic_fixed1_andl              2053
+    __kmpc_atomic_fixed1_orl               2054
+    __kmpc_atomic_fixed2_andl              2055
+    __kmpc_atomic_fixed2_orl               2056
+    __kmpc_atomic_fixed4_andl              2057
+    __kmpc_atomic_fixed4_orl               2058
+    __kmpc_atomic_fixed8_andl              2059
+    __kmpc_atomic_fixed8_orl               2060
+
+    __kmpc_atomic_fixed1_max               2061
+    __kmpc_atomic_fixed1_min               2062
+    __kmpc_atomic_fixed2_max               2063
+    __kmpc_atomic_fixed2_min               2064
+    __kmpc_atomic_fixed4_max               2065
+    __kmpc_atomic_fixed4_min               2066
+    __kmpc_atomic_fixed8_max               2067
+    __kmpc_atomic_fixed8_min               2068
+    __kmpc_atomic_float4_max               2069
+    __kmpc_atomic_float4_min               2070
+    __kmpc_atomic_float8_max               2071
+    __kmpc_atomic_float8_min               2072
+
+    __kmpc_atomic_fixed1_neqv              2073
+    __kmpc_atomic_fixed2_neqv              2074
+    __kmpc_atomic_fixed4_neqv              2075
+    __kmpc_atomic_fixed8_neqv              2076
+    __kmpc_atomic_fixed1_eqv               2077
+    __kmpc_atomic_fixed2_eqv               2078
+    __kmpc_atomic_fixed4_eqv               2079
+    __kmpc_atomic_fixed8_eqv               2080
+
+    __kmpc_atomic_float10_add              2081
+    __kmpc_atomic_float10_sub              2082
+    __kmpc_atomic_float10_mul              2083
+    __kmpc_atomic_float10_div              2084
+
+    __kmpc_atomic_cmplx4_add               2085
+    __kmpc_atomic_cmplx4_sub               2086
+    __kmpc_atomic_cmplx4_mul               2087
+    __kmpc_atomic_cmplx4_div               2088
+    __kmpc_atomic_cmplx8_add               2089
+    __kmpc_atomic_cmplx8_sub               2090
+    __kmpc_atomic_cmplx8_mul               2091
+    __kmpc_atomic_cmplx8_div               2092
+    __kmpc_atomic_cmplx10_add              2093
+    __kmpc_atomic_cmplx10_sub              2094
+    __kmpc_atomic_cmplx10_mul              2095
+    __kmpc_atomic_cmplx10_div              2096
+    %ifdef HAVE_QUAD
+    __kmpc_atomic_cmplx16_add              2097
+    __kmpc_atomic_cmplx16_sub              2098
+    __kmpc_atomic_cmplx16_mul              2099
+   #__kmpc_atomic_cmplx16_div              2000 # moved up because of mistake in number (supposed to be 2100)
+
+    __kmpc_atomic_float16_add              2101
+    __kmpc_atomic_float16_sub              2102
+    __kmpc_atomic_float16_mul              2103
+    __kmpc_atomic_float16_div              2104
+    __kmpc_atomic_float16_max              2105
+    __kmpc_atomic_float16_min              2106
+
+    __kmpc_atomic_fixed1_add_fp            2107
+    __kmpc_atomic_fixed1_sub_fp            2108
+    __kmpc_atomic_fixed1_mul_fp            2109
+    __kmpc_atomic_fixed1_div_fp            2110
+    __kmpc_atomic_fixed1u_div_fp           2111
+
+    __kmpc_atomic_fixed2_add_fp            2112
+    __kmpc_atomic_fixed2_sub_fp            2113
+    __kmpc_atomic_fixed2_mul_fp            2114
+    __kmpc_atomic_fixed2_div_fp            2115
+    __kmpc_atomic_fixed2u_div_fp           2116
+
+    __kmpc_atomic_fixed4_add_fp            2117
+    __kmpc_atomic_fixed4_sub_fp            2118
+    __kmpc_atomic_fixed4_mul_fp            2119
+    __kmpc_atomic_fixed4_div_fp            2120
+    __kmpc_atomic_fixed4u_div_fp           2121
+
+    __kmpc_atomic_fixed8_add_fp            2122
+    __kmpc_atomic_fixed8_sub_fp            2123
+    __kmpc_atomic_fixed8_mul_fp            2124
+    __kmpc_atomic_fixed8_div_fp            2125
+    __kmpc_atomic_fixed8u_div_fp           2126
+
+    __kmpc_atomic_float4_add_fp            2127
+    __kmpc_atomic_float4_sub_fp            2128
+    __kmpc_atomic_float4_mul_fp            2129
+    __kmpc_atomic_float4_div_fp            2130
+
+    __kmpc_atomic_float8_add_fp            2131
+    __kmpc_atomic_float8_sub_fp            2132
+    __kmpc_atomic_float8_mul_fp            2133
+    __kmpc_atomic_float8_div_fp            2134
+
+    __kmpc_atomic_float10_add_fp           2135
+    __kmpc_atomic_float10_sub_fp           2136
+    __kmpc_atomic_float10_mul_fp           2137
+    __kmpc_atomic_float10_div_fp           2138
+    %endif
+
+    __kmpc_atomic_fixed1_mul_float8        2169
+    __kmpc_atomic_fixed1_div_float8        2170
+
+    __kmpc_atomic_fixed2_mul_float8        2174
+    __kmpc_atomic_fixed2_div_float8        2175
+
+    __kmpc_atomic_fixed4_mul_float8        2179
+    __kmpc_atomic_fixed4_div_float8        2180
+
+    __kmpc_atomic_fixed8_mul_float8        2184
+    __kmpc_atomic_fixed8_div_float8        2185
+
+    __kmpc_atomic_float4_add_float8        2187
+    __kmpc_atomic_float4_sub_float8        2188
+    __kmpc_atomic_float4_mul_float8        2189
+    __kmpc_atomic_float4_div_float8        2190
+
+    __kmpc_atomic_cmplx4_add_cmplx8        2231
+    __kmpc_atomic_cmplx4_sub_cmplx8        2232
+    __kmpc_atomic_cmplx4_mul_cmplx8        2233
+    __kmpc_atomic_cmplx4_div_cmplx8        2234
+
+    __kmpc_atomic_1                        2247
+    __kmpc_atomic_2                        2248
+   #__kmpc_atomic_4                    # declared above #100
+   #__kmpc_atomic_8                    # declared above #101
+    __kmpc_atomic_10                       2251
+    __kmpc_atomic_16                       2252
+    __kmpc_atomic_20                       2253
+    __kmpc_atomic_32                       2254
+
+    %ifdef arch_32
+
+        %ifdef HAVE_QUAD
+        __kmpc_atomic_float16_add_a16      2255
+        __kmpc_atomic_float16_sub_a16      2256
+        __kmpc_atomic_float16_mul_a16      2257
+        __kmpc_atomic_float16_div_a16      2258
+        __kmpc_atomic_float16_max_a16      2259
+        __kmpc_atomic_float16_min_a16      2260
+
+        __kmpc_atomic_cmplx16_add_a16      2261
+        __kmpc_atomic_cmplx16_sub_a16      2262
+        __kmpc_atomic_cmplx16_mul_a16      2263
+        __kmpc_atomic_cmplx16_div_a16      2264
+        %endif
+
+    %endif
+
+    %ifndef arch_64
+
+        # ATOMIC extensions for OpenMP 3.1 spec (x86 and x64 only)
+
+        __kmpc_atomic_fixed1_rd                2265
+        __kmpc_atomic_fixed2_rd                2266
+        __kmpc_atomic_fixed4_rd                2267
+        __kmpc_atomic_fixed8_rd                2268
+        __kmpc_atomic_float4_rd                2269
+        __kmpc_atomic_float8_rd                2270
+        __kmpc_atomic_float10_rd               2271
+        %ifdef HAVE_QUAD
+        __kmpc_atomic_float16_rd               2272
+        %endif
+        __kmpc_atomic_cmplx4_rd                2273
+        __kmpc_atomic_cmplx8_rd                2274
+        __kmpc_atomic_cmplx10_rd               2275
+        %ifdef HAVE_QUAD
+        __kmpc_atomic_cmplx16_rd               2276
+            %ifdef arch_32
+                __kmpc_atomic_float16_a16_rd       2277
+                __kmpc_atomic_cmplx16_a16_rd       2278
+            %endif
+        %endif
+        __kmpc_atomic_fixed1_wr                2279
+        __kmpc_atomic_fixed2_wr                2280
+        __kmpc_atomic_fixed4_wr                2281
+        __kmpc_atomic_fixed8_wr                2282
+        __kmpc_atomic_float4_wr                2283
+        __kmpc_atomic_float8_wr                2284
+        __kmpc_atomic_float10_wr               2285
+        %ifdef HAVE_QUAD
+        __kmpc_atomic_float16_wr               2286
+        %endif
+        __kmpc_atomic_cmplx4_wr                2287
+        __kmpc_atomic_cmplx8_wr                2288
+        __kmpc_atomic_cmplx10_wr               2289
+        %ifdef HAVE_QUAD
+        __kmpc_atomic_cmplx16_wr               2290
+        %ifdef arch_32
+            __kmpc_atomic_float16_a16_wr       2291
+            __kmpc_atomic_cmplx16_a16_wr       2292
+        %endif
+        %endif
+        __kmpc_atomic_fixed1_add_cpt           2293
+        __kmpc_atomic_fixed1_andb_cpt          2294
+        __kmpc_atomic_fixed1_div_cpt           2295
+        __kmpc_atomic_fixed1u_div_cpt          2296
+        __kmpc_atomic_fixed1_mul_cpt           2297
+        __kmpc_atomic_fixed1_orb_cpt           2298
+        __kmpc_atomic_fixed1_shl_cpt           2299
+        __kmpc_atomic_fixed1_shr_cpt           2300
+        __kmpc_atomic_fixed1u_shr_cpt          2301
+        __kmpc_atomic_fixed1_sub_cpt           2302
+        __kmpc_atomic_fixed1_xor_cpt           2303
+        __kmpc_atomic_fixed2_add_cpt           2304
+        __kmpc_atomic_fixed2_andb_cpt          2305
+        __kmpc_atomic_fixed2_div_cpt           2306
+        __kmpc_atomic_fixed2u_div_cpt          2307
+        __kmpc_atomic_fixed2_mul_cpt           2308
+        __kmpc_atomic_fixed2_orb_cpt           2309
+        __kmpc_atomic_fixed2_shl_cpt           2310
+        __kmpc_atomic_fixed2_shr_cpt           2311
+        __kmpc_atomic_fixed2u_shr_cpt          2312
+        __kmpc_atomic_fixed2_sub_cpt           2313
+        __kmpc_atomic_fixed2_xor_cpt           2314
+        __kmpc_atomic_fixed4_add_cpt           2315
+        __kmpc_atomic_fixed4_sub_cpt           2316
+        __kmpc_atomic_float4_add_cpt           2317
+        __kmpc_atomic_float4_sub_cpt           2318
+        __kmpc_atomic_fixed8_add_cpt           2319
+        __kmpc_atomic_fixed8_sub_cpt           2320
+        __kmpc_atomic_float8_add_cpt           2321
+        __kmpc_atomic_float8_sub_cpt           2322
+        __kmpc_atomic_fixed4_andb_cpt          2323
+        __kmpc_atomic_fixed4_div_cpt           2324
+        __kmpc_atomic_fixed4u_div_cpt          2325
+        __kmpc_atomic_fixed4_mul_cpt           2326
+        __kmpc_atomic_fixed4_orb_cpt           2327
+        __kmpc_atomic_fixed4_shl_cpt           2328
+        __kmpc_atomic_fixed4_shr_cpt           2329
+        __kmpc_atomic_fixed4u_shr_cpt          2330
+        __kmpc_atomic_fixed4_xor_cpt           2331
+        __kmpc_atomic_fixed8_andb_cpt          2332
+        __kmpc_atomic_fixed8_div_cpt           2333
+        __kmpc_atomic_fixed8u_div_cpt          2334
+        __kmpc_atomic_fixed8_mul_cpt           2335
+        __kmpc_atomic_fixed8_orb_cpt           2336
+        __kmpc_atomic_fixed8_shl_cpt           2337
+        __kmpc_atomic_fixed8_shr_cpt           2338
+        __kmpc_atomic_fixed8u_shr_cpt          2339
+        __kmpc_atomic_fixed8_xor_cpt           2340
+        __kmpc_atomic_float4_div_cpt           2341
+        __kmpc_atomic_float4_mul_cpt           2342
+        __kmpc_atomic_float8_div_cpt           2343
+        __kmpc_atomic_float8_mul_cpt           2344
+        __kmpc_atomic_fixed1_andl_cpt          2345
+        __kmpc_atomic_fixed1_orl_cpt           2346
+        __kmpc_atomic_fixed2_andl_cpt          2347
+        __kmpc_atomic_fixed2_orl_cpt           2348
+        __kmpc_atomic_fixed4_andl_cpt          2349
+        __kmpc_atomic_fixed4_orl_cpt           2350
+        __kmpc_atomic_fixed8_andl_cpt          2351
+        __kmpc_atomic_fixed8_orl_cpt           2352
+        __kmpc_atomic_fixed1_max_cpt           2353
+        __kmpc_atomic_fixed1_min_cpt           2354
+        __kmpc_atomic_fixed2_max_cpt           2355
+        __kmpc_atomic_fixed2_min_cpt           2356
+        __kmpc_atomic_fixed4_max_cpt           2357
+        __kmpc_atomic_fixed4_min_cpt           2358
+        __kmpc_atomic_fixed8_max_cpt           2359
+        __kmpc_atomic_fixed8_min_cpt           2360
+        __kmpc_atomic_float4_max_cpt           2361
+        __kmpc_atomic_float4_min_cpt           2362
+        __kmpc_atomic_float8_max_cpt           2363
+        __kmpc_atomic_float8_min_cpt           2364
+        %ifdef HAVE_QUAD
+        __kmpc_atomic_float16_max_cpt          2365
+        __kmpc_atomic_float16_min_cpt          2366
+        %endif
+        __kmpc_atomic_fixed1_neqv_cpt          2367
+        __kmpc_atomic_fixed2_neqv_cpt          2368
+        __kmpc_atomic_fixed4_neqv_cpt          2369
+        __kmpc_atomic_fixed8_neqv_cpt          2370
+        __kmpc_atomic_fixed1_eqv_cpt           2371
+        __kmpc_atomic_fixed2_eqv_cpt           2372
+        __kmpc_atomic_fixed4_eqv_cpt           2373
+        __kmpc_atomic_fixed8_eqv_cpt           2374
+        __kmpc_atomic_float10_add_cpt          2375
+        __kmpc_atomic_float10_sub_cpt          2376
+        __kmpc_atomic_float10_mul_cpt          2377
+        __kmpc_atomic_float10_div_cpt          2378
+        %ifdef HAVE_QUAD
+        __kmpc_atomic_float16_add_cpt          2379
+        __kmpc_atomic_float16_sub_cpt          2380
+        __kmpc_atomic_float16_mul_cpt          2381
+        __kmpc_atomic_float16_div_cpt          2382
+        %endif
+        __kmpc_atomic_cmplx4_add_cpt           2383
+        __kmpc_atomic_cmplx4_sub_cpt           2384
+        __kmpc_atomic_cmplx4_mul_cpt           2385
+        __kmpc_atomic_cmplx4_div_cpt           2386
+        __kmpc_atomic_cmplx8_add_cpt           2387
+        __kmpc_atomic_cmplx8_sub_cpt           2388
+        __kmpc_atomic_cmplx8_mul_cpt           2389
+        __kmpc_atomic_cmplx8_div_cpt           2390
+        __kmpc_atomic_cmplx10_add_cpt          2391
+        __kmpc_atomic_cmplx10_sub_cpt          2392
+        __kmpc_atomic_cmplx10_mul_cpt          2393
+        __kmpc_atomic_cmplx10_div_cpt          2394
+        %ifdef HAVE_QUAD
+        __kmpc_atomic_cmplx16_add_cpt          2395
+        __kmpc_atomic_cmplx16_sub_cpt          2396
+        __kmpc_atomic_cmplx16_mul_cpt          2397
+        __kmpc_atomic_cmplx16_div_cpt          2398
+        %endif
+       #__kmpc_atomic_cmplx4_add_cpt_tmp            2409
+
+        %ifdef arch_32
+        %ifdef HAVE_QUAD
+            __kmpc_atomic_float16_add_a16_cpt  2399
+            __kmpc_atomic_float16_sub_a16_cpt  2400
+            __kmpc_atomic_float16_mul_a16_cpt  2401
+            __kmpc_atomic_float16_div_a16_cpt  2402
+            __kmpc_atomic_float16_max_a16_cpt  2403
+            __kmpc_atomic_float16_min_a16_cpt  2404
+            __kmpc_atomic_cmplx16_add_a16_cpt  2405
+            __kmpc_atomic_cmplx16_sub_a16_cpt  2406
+            __kmpc_atomic_cmplx16_mul_a16_cpt  2407
+            __kmpc_atomic_cmplx16_div_a16_cpt  2408
+        %endif
+        %endif
+
+        __kmpc_atomic_start                    2410
+        __kmpc_atomic_end                      2411
+
+        %ifdef HAVE_QUAD
+        __kmpc_atomic_fixed1_add_cpt_fp
+        __kmpc_atomic_fixed1_sub_cpt_fp
+        __kmpc_atomic_fixed1_mul_cpt_fp
+        __kmpc_atomic_fixed1_div_cpt_fp
+        __kmpc_atomic_fixed1u_add_cpt_fp
+        __kmpc_atomic_fixed1u_sub_cpt_fp
+        __kmpc_atomic_fixed1u_mul_cpt_fp
+        __kmpc_atomic_fixed1u_div_cpt_fp
+
+        __kmpc_atomic_fixed2_add_cpt_fp
+        __kmpc_atomic_fixed2_sub_cpt_fp
+        __kmpc_atomic_fixed2_mul_cpt_fp
+        __kmpc_atomic_fixed2_div_cpt_fp
+        __kmpc_atomic_fixed2u_add_cpt_fp
+        __kmpc_atomic_fixed2u_sub_cpt_fp
+        __kmpc_atomic_fixed2u_mul_cpt_fp
+        __kmpc_atomic_fixed2u_div_cpt_fp
+
+        __kmpc_atomic_fixed4_add_cpt_fp
+        __kmpc_atomic_fixed4_sub_cpt_fp
+        __kmpc_atomic_fixed4_mul_cpt_fp
+        __kmpc_atomic_fixed4_div_cpt_fp
+        __kmpc_atomic_fixed4u_add_cpt_fp
+        __kmpc_atomic_fixed4u_sub_cpt_fp
+        __kmpc_atomic_fixed4u_mul_cpt_fp
+        __kmpc_atomic_fixed4u_div_cpt_fp
+
+        __kmpc_atomic_fixed8_add_cpt_fp
+        __kmpc_atomic_fixed8_sub_cpt_fp
+        __kmpc_atomic_fixed8_mul_cpt_fp
+        __kmpc_atomic_fixed8_div_cpt_fp
+        __kmpc_atomic_fixed8u_add_cpt_fp
+        __kmpc_atomic_fixed8u_sub_cpt_fp
+        __kmpc_atomic_fixed8u_mul_cpt_fp
+        __kmpc_atomic_fixed8u_div_cpt_fp
+
+        __kmpc_atomic_float4_add_cpt_fp
+        __kmpc_atomic_float4_sub_cpt_fp
+        __kmpc_atomic_float4_mul_cpt_fp
+        __kmpc_atomic_float4_div_cpt_fp
+
+        __kmpc_atomic_float8_add_cpt_fp
+        __kmpc_atomic_float8_sub_cpt_fp
+        __kmpc_atomic_float8_mul_cpt_fp
+        __kmpc_atomic_float8_div_cpt_fp
+
+        __kmpc_atomic_float10_add_cpt_fp
+        __kmpc_atomic_float10_sub_cpt_fp
+        __kmpc_atomic_float10_mul_cpt_fp
+        __kmpc_atomic_float10_div_cpt_fp
+        %endif
+
+            # ATOMIC extensions for OpenMP 4.0 spec (x86 and x64 only)
+
+            __kmpc_atomic_fixed1_swp           2412
+            __kmpc_atomic_fixed2_swp           2413
+            __kmpc_atomic_fixed4_swp           2414
+            __kmpc_atomic_fixed8_swp           2415
+            __kmpc_atomic_float4_swp           2416
+            __kmpc_atomic_float8_swp           2417
+            __kmpc_atomic_float10_swp          2418
+            %ifdef HAVE_QUAD
+              __kmpc_atomic_float16_swp        2419
+            %endif
+            __kmpc_atomic_cmplx4_swp           2420
+            __kmpc_atomic_cmplx8_swp           2421
+            __kmpc_atomic_cmplx10_swp          2422
+            %ifdef HAVE_QUAD
+            __kmpc_atomic_cmplx16_swp          2423
+
+            %ifdef arch_32
+                __kmpc_atomic_float16_a16_swp  2424
+                __kmpc_atomic_cmplx16_a16_swp  2425
+            %endif
+            %endif
+
+            __kmpc_atomic_fixed1_sub_cpt_rev   2426
+            __kmpc_atomic_fixed1_div_cpt_rev   2427
+            __kmpc_atomic_fixed1u_div_cpt_rev  2428
+            __kmpc_atomic_fixed1_shl_cpt_rev   2429
+            __kmpc_atomic_fixed1_shr_cpt_rev   2430
+            __kmpc_atomic_fixed1u_shr_cpt_rev  2431
+            __kmpc_atomic_fixed2_sub_cpt_rev   2432
+            __kmpc_atomic_fixed2_div_cpt_rev   2433
+            __kmpc_atomic_fixed2u_div_cpt_rev  2434
+            __kmpc_atomic_fixed2_shl_cpt_rev   2435
+            __kmpc_atomic_fixed2_shr_cpt_rev   2436
+            __kmpc_atomic_fixed2u_shr_cpt_rev  2437
+            __kmpc_atomic_fixed4_sub_cpt_rev   2438
+            __kmpc_atomic_fixed4_div_cpt_rev   2439
+            __kmpc_atomic_fixed4u_div_cpt_rev  2440
+            __kmpc_atomic_fixed4_shl_cpt_rev   2441
+            __kmpc_atomic_fixed4_shr_cpt_rev   2442
+            __kmpc_atomic_fixed4u_shr_cpt_rev  2443
+            __kmpc_atomic_fixed8_sub_cpt_rev   2444
+            __kmpc_atomic_fixed8_div_cpt_rev   2445
+            __kmpc_atomic_fixed8u_div_cpt_rev  2446
+            __kmpc_atomic_fixed8_shl_cpt_rev   2447
+            __kmpc_atomic_fixed8_shr_cpt_rev   2448
+            __kmpc_atomic_fixed8u_shr_cpt_rev  2449
+            __kmpc_atomic_float4_sub_cpt_rev   2450
+            __kmpc_atomic_float4_div_cpt_rev   2451
+            __kmpc_atomic_float8_sub_cpt_rev   2452
+            __kmpc_atomic_float8_div_cpt_rev   2453
+            __kmpc_atomic_float10_sub_cpt_rev  2454
+            __kmpc_atomic_float10_div_cpt_rev  2455
+            %ifdef HAVE_QUAD
+            __kmpc_atomic_float16_sub_cpt_rev  2456
+            __kmpc_atomic_float16_div_cpt_rev  2457
+            %endif
+            __kmpc_atomic_cmplx4_sub_cpt_rev   2458
+            __kmpc_atomic_cmplx4_div_cpt_rev   2459
+            __kmpc_atomic_cmplx8_sub_cpt_rev   2460
+            __kmpc_atomic_cmplx8_div_cpt_rev   2461
+            __kmpc_atomic_cmplx10_sub_cpt_rev  2462
+            __kmpc_atomic_cmplx10_div_cpt_rev  2463
+            %ifdef HAVE_QUAD
+            __kmpc_atomic_cmplx16_sub_cpt_rev  2464
+            __kmpc_atomic_cmplx16_div_cpt_rev  2465
+
+            %ifdef arch_32
+                __kmpc_atomic_float16_sub_a16_cpt_rev  2466
+                __kmpc_atomic_float16_div_a16_cpt_rev  2467
+                __kmpc_atomic_cmplx16_sub_a16_cpt_rev  2468
+                __kmpc_atomic_cmplx16_div_a16_cpt_rev  2469
+            %endif
+            %endif
+
+            __kmpc_atomic_fixed1_sub_rev   2470
+            __kmpc_atomic_fixed1_div_rev   2471
+            __kmpc_atomic_fixed1u_div_rev  2472
+            __kmpc_atomic_fixed1_shl_rev   2473
+            __kmpc_atomic_fixed1_shr_rev   2474
+            __kmpc_atomic_fixed1u_shr_rev  2475
+            __kmpc_atomic_fixed2_sub_rev   2476
+            __kmpc_atomic_fixed2_div_rev   2477
+            __kmpc_atomic_fixed2u_div_rev  2478
+            __kmpc_atomic_fixed2_shl_rev   2479
+            __kmpc_atomic_fixed2_shr_rev   2480
+            __kmpc_atomic_fixed2u_shr_rev  2481
+            __kmpc_atomic_fixed4_sub_rev   2482
+            __kmpc_atomic_fixed4_div_rev   2483
+            __kmpc_atomic_fixed4u_div_rev  2484
+            __kmpc_atomic_fixed4_shl_rev   2485
+            __kmpc_atomic_fixed4_shr_rev   2486
+            __kmpc_atomic_fixed4u_shr_rev  2487
+            __kmpc_atomic_fixed8_sub_rev   2488
+            __kmpc_atomic_fixed8_div_rev   2489
+            __kmpc_atomic_fixed8u_div_rev  2490
+            __kmpc_atomic_fixed8_shl_rev   2491
+            __kmpc_atomic_fixed8_shr_rev   2492
+            __kmpc_atomic_fixed8u_shr_rev  2493
+            __kmpc_atomic_float4_sub_rev   2494
+            __kmpc_atomic_float4_div_rev   2495
+            __kmpc_atomic_float8_sub_rev   2496
+            __kmpc_atomic_float8_div_rev   2497
+            __kmpc_atomic_float10_sub_rev  2498
+            __kmpc_atomic_float10_div_rev  2499
+            %ifdef HAVE_QUAD
+            __kmpc_atomic_float16_sub_rev  2500
+            __kmpc_atomic_float16_div_rev  2501
+	    %endif
+            __kmpc_atomic_cmplx4_sub_rev   2502
+            __kmpc_atomic_cmplx4_div_rev   2503
+            __kmpc_atomic_cmplx8_sub_rev   2504
+            __kmpc_atomic_cmplx8_div_rev   2505
+            __kmpc_atomic_cmplx10_sub_rev  2506
+            __kmpc_atomic_cmplx10_div_rev  2507
+            %ifdef HAVE_QUAD
+            __kmpc_atomic_cmplx16_sub_rev  2508
+            __kmpc_atomic_cmplx16_div_rev  2509
+            %ifdef arch_32
+                __kmpc_atomic_float16_sub_a16_rev  2510
+                __kmpc_atomic_float16_div_a16_rev  2511
+                __kmpc_atomic_cmplx16_sub_a16_rev  2512
+                __kmpc_atomic_cmplx16_div_a16_rev  2513
+            %endif
+            %endif
+
+            %ifdef HAVE_QUAD
+            __kmpc_atomic_fixed1_sub_rev_fp
+            __kmpc_atomic_fixed1u_sub_rev_fp
+            __kmpc_atomic_fixed1_div_rev_fp
+            __kmpc_atomic_fixed1u_div_rev_fp
+            __kmpc_atomic_fixed2_sub_rev_fp
+            __kmpc_atomic_fixed2u_sub_rev_fp
+            __kmpc_atomic_fixed2_div_rev_fp
+            __kmpc_atomic_fixed2u_div_rev_fp
+            __kmpc_atomic_fixed4_sub_rev_fp
+            __kmpc_atomic_fixed4u_sub_rev_fp
+            __kmpc_atomic_fixed4_div_rev_fp
+            __kmpc_atomic_fixed4u_div_rev_fp
+            __kmpc_atomic_fixed8_sub_rev_fp
+            __kmpc_atomic_fixed8u_sub_rev_fp
+            __kmpc_atomic_fixed8_div_rev_fp
+            __kmpc_atomic_fixed8u_div_rev_fp
+            __kmpc_atomic_float4_sub_rev_fp
+            __kmpc_atomic_float4_div_rev_fp
+            __kmpc_atomic_float8_sub_rev_fp
+            __kmpc_atomic_float8_div_rev_fp
+            __kmpc_atomic_float10_sub_rev_fp
+            __kmpc_atomic_float10_div_rev_fp
+
+            __kmpc_atomic_fixed1_sub_cpt_rev_fp
+            __kmpc_atomic_fixed1u_sub_cpt_rev_fp
+            __kmpc_atomic_fixed1_div_cpt_rev_fp
+            __kmpc_atomic_fixed1u_div_cpt_rev_fp
+            __kmpc_atomic_fixed2_sub_cpt_rev_fp
+            __kmpc_atomic_fixed2u_sub_cpt_rev_fp
+            __kmpc_atomic_fixed2_div_cpt_rev_fp
+            __kmpc_atomic_fixed2u_div_cpt_rev_fp
+            __kmpc_atomic_fixed4_sub_cpt_rev_fp
+            __kmpc_atomic_fixed4u_sub_cpt_rev_fp
+            __kmpc_atomic_fixed4_div_cpt_rev_fp
+            __kmpc_atomic_fixed4u_div_cpt_rev_fp
+            __kmpc_atomic_fixed8_sub_cpt_rev_fp
+            __kmpc_atomic_fixed8u_sub_cpt_rev_fp
+            __kmpc_atomic_fixed8_div_cpt_rev_fp
+            __kmpc_atomic_fixed8u_div_cpt_rev_fp
+            __kmpc_atomic_float4_sub_cpt_rev_fp
+            __kmpc_atomic_float4_div_cpt_rev_fp
+            __kmpc_atomic_float8_sub_cpt_rev_fp
+            __kmpc_atomic_float8_div_cpt_rev_fp
+            __kmpc_atomic_float10_sub_cpt_rev_fp
+            __kmpc_atomic_float10_div_cpt_rev_fp
+            %endif
+
+    %endif   # arch_64
+
+    %ifdef HAVE_QUAD
+    __kmpc_atomic_fixed1u_add_fp
+    __kmpc_atomic_fixed1u_sub_fp
+    __kmpc_atomic_fixed1u_mul_fp
+    __kmpc_atomic_fixed2u_add_fp
+    __kmpc_atomic_fixed2u_sub_fp
+    __kmpc_atomic_fixed2u_mul_fp
+    __kmpc_atomic_fixed4u_add_fp
+    __kmpc_atomic_fixed4u_sub_fp
+    __kmpc_atomic_fixed4u_mul_fp
+    __kmpc_atomic_fixed8u_add_fp
+    __kmpc_atomic_fixed8u_sub_fp
+    __kmpc_atomic_fixed8u_mul_fp
+    %endif
+
+%endif
+
+# end of file #
diff --git a/final/runtime/src/exports_so.txt b/final/runtime/src/exports_so.txt
new file mode 100644
index 0000000..f7de5fd
--- /dev/null
+++ b/final/runtime/src/exports_so.txt
@@ -0,0 +1,123 @@
+# exports_so.txt #
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# This is the version script for the OMP RTL shared library (libomp*.so)
+
+VERSION {
+
+    global: # Exported symbols.
+
+        #
+        # "Normal" symbols.
+        #
+        omp_*;     # Standard OpenMP functions.
+        OMP_*;     # Standard OpenMP symbols.
+
+        #
+        # OMPT API
+        #
+        ompt_start_tool;     # OMPT start interface
+
+        # icc drops weak attribute at linking step without the following line:
+        Annotate*;           # TSAN annotation
+
+        ompc_*;    # omp.h renames some standard functions to ompc_*.
+        kmp_*;     # Intel extensions.
+        kmpc_*;    # Intel extensions.
+        __kmpc_*;  # Functions called by compiler-generated code.
+        GOMP_*;    # GNU C compatibility functions.
+
+        _You_must_link_with_*;     # Mutual detection/MS compatibility symbols.
+
+        #
+        # Debugger support.
+        #
+#if USE_DEBUGGER
+        __kmp_debugging;
+        __kmp_omp_debug_struct_info;
+#endif /* USE_DEBUGGER */
+
+        #
+        # Internal functions exported for testing purposes.
+        #
+        __kmp_get_reduce_method;
+        ___kmp_allocate;
+        ___kmp_free;
+        __kmp_thread_pool;
+
+	__kmp_reset_stats;
+
+#if USE_ITT_BUILD
+        #
+        # ITT support.
+        #
+        # The following entry points are added so that the backtraces from
+        # the tools contain meaningful names for all the functions that might
+        # appear in a backtrace of a thread which is blocked in the RTL.
+        __kmp_acquire_drdpa_lock;
+        __kmp_acquire_nested_drdpa_lock;
+        __kmp_acquire_nested_queuing_lock;
+        __kmp_acquire_nested_tas_lock;
+        __kmp_acquire_nested_ticket_lock;
+        __kmp_acquire_queuing_lock;
+        __kmp_acquire_tas_lock;
+        __kmp_acquire_ticket_lock;
+        __kmp_fork_call;
+        __kmp_invoke_microtask;
+#if KMP_USE_MONITOR
+        __kmp_launch_monitor;
+        __kmp_reap_monitor;
+#endif
+        __kmp_launch_worker;
+        __kmp_reap_worker;
+        __kmp_release_64;
+        __kmp_wait_64;
+        __kmp_wait_4;
+
+        # ittnotify symbols to be used by debugger
+        __kmp_itt_fini_ittlib;
+        __kmp_itt_init_ittlib;
+#endif /* USE_ITT_BUILD */
+
+    local: # Non-exported symbols.
+
+        *;         # All other symbols are not exported.
+
+}; # VERSION
+
+# sets up GCC OMP_ version dependency chain
+OMP_1.0 {
+};
+OMP_2.0 {
+} OMP_1.0;
+OMP_3.0 {
+} OMP_2.0;
+OMP_3.1 {
+} OMP_3.0;
+OMP_4.0 {
+} OMP_3.1;
+OMP_4.5 {
+} OMP_4.0;
+
+# sets up GCC GOMP_ version dependency chain
+GOMP_1.0 {
+};
+GOMP_2.0 {
+} GOMP_1.0;
+GOMP_3.0 {
+} GOMP_2.0;
+GOMP_4.0 {
+} GOMP_3.0;
+GOMP_4.5 {
+} GOMP_4.0;
+
+# end of file #
diff --git a/final/runtime/src/extractExternal.cpp b/final/runtime/src/extractExternal.cpp
new file mode 100644
index 0000000..b3e55b5
--- /dev/null
+++ b/final/runtime/src/extractExternal.cpp
@@ -0,0 +1,483 @@
+/*
+ * extractExternal.cpp
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <set>
+#include <stdlib.h>
+#include <string>
+#include <strstream>
+
+/* Given a set of n object files h ('external' object files) and a set of m
+   object files o ('internal' object files),
+   1. Determines r, the subset of h that o depends on, directly or indirectly
+   2. Removes the files in h - r from the file system
+   3. For each external symbol defined in some file in r, rename it in r U o
+      by prefixing it with "__kmp_external_"
+   Usage:
+   hide.exe <n> <filenames for h> <filenames for o>
+
+   Thus, the prefixed symbols become hidden in the sense that they now have a
+   special prefix.
+*/
+
+using namespace std;
+
+void stop(char *errorMsg) {
+  printf("%s\n", errorMsg);
+  exit(1);
+}
+
+// an entry in the symbol table of a .OBJ file (read as 18 raw bytes)
+class Symbol {
+public:
+  __int64 name; // name encoding understood by StringTable::encode / decode
+  unsigned value;
+  unsigned short sectionNum, type; // extern symbol: sectionNum != 0 if defined
+  char storageClass, nAux; // class 2 = extern; nAux = aux records that follow
+};
+
+// istrstream over a heap buffer that this object owns: <buf> remembers the
+// start of the buffer so that the destructor can release it with delete[].
+class _rstream : public istrstream {
+  const char *buf; // private (class default)
+protected:
+  _rstream(pair<const char *, streamsize> content)
+      : istrstream(content.first, content.second), buf(content.first) {}
+  ~_rstream() { delete[] buf; }
+};
+
+// Binary input stream over an in-memory buffer: either a buffer passed in by
+// the caller or the complete content of a file. operator>> is overloaded to
+// copy the raw bytes of integers, and of a symbol table entry, out of it.
+class rstream : public _rstream {
+  typedef pair<const char *, streamsize> Content; // (buffer start, byte count)
+  // copy sizeof(T) raw bytes from the stream into <dst>
+  template <class T> inline rstream &doRead(T &dst) {
+    read(reinterpret_cast<char *>(&dst), sizeof(T));
+    return *this;
+  }
+  // load all of <fileName> into a new[]-allocated buffer; stop() on failure
+  static Content getBuf(const char *fileName) {
+    ifstream file(fileName, ios::binary | ios::in);
+    if (!file.is_open())
+      stop("rstream.getBuf: Error opening file");
+    file.seekg(0, ios::end);
+    streampos byteCount = file.tellg();
+    if (byteCount < 0)
+      stop("rstream.getBuf: Error reading file");
+    char *bytes = new char[byteCount];
+    file.seekg(0, ios::beg);
+    file.read(bytes, byteCount);
+    return Content(bytes, byteCount);
+  }
+public:
+  // construct over the <size> bytes starting at <buf>
+  rstream(const char *buf, streamsize size) : _rstream(Content(buf, size)) {}
+  // construct from a file whose whole content is read once, at construction
+  rstream(const char *fileName) : _rstream(getBuf(fileName)) {}
+  rstream &operator>>(int &x) { return doRead(x); }
+  rstream &operator>>(unsigned &x) { return doRead(x); }
+  rstream &operator>>(short &x) { return doRead(x); }
+  rstream &operator>>(unsigned short &x) { return doRead(x); }
+  // reads exactly 18 bytes into <sym>
+  rstream &operator>>(Symbol &sym) {
+    read(reinterpret_cast<char *>(&sym), 18);
+    return *this;
+  }
+};
+
+// String table of a .OBJ file. <data> holds <length> bytes: an unsigned byte
+// count (which includes itself) followed by zero-terminated strings, and
+// <directory> maps each of those strings to its byte offset within <data>.
+class StringTable {
+private:
+  map<string, unsigned> directory;
+  size_t length;
+  char *data;
+
+  // make <directory> from <length> bytes in <data>
+  void makeDirectory(void) {
+    unsigned i = 4;
+    while (i < length) {
+      string s = string(data + i);
+      directory.insert(make_pair(s, i));
+      i += s.size() + 1;
+    }
+  }
+  // initialize <length> and <data> from the size-prefixed table image <_data>
+  void init(const char *_data) {
+    unsigned _length = *(unsigned *)_data;
+    if (_length < sizeof(unsigned))
+      stop("StringTable.init: Invalid symbol table");
+    // to prevent runaway strings, make sure the data ends with a zero: one
+    // extra byte is reserved for it when the image does not end with a zero
+    length = _length;
+    if (_data[_length - 1])
+      ++length;
+    data = new char[length];
+    data[length - 1] = 0;
+    *(unsigned *)data = length;
+    // copy exactly the bytes the image holds after its size field; using
+    // <length> here would read past <_data> and clobber the added zero
+    KMP_MEMCPY(data + sizeof(unsigned), _data + sizeof(unsigned),
+               _length - sizeof(unsigned));
+    makeDirectory();
+  }
+
+public:
+  // Construct string table by reading from f, starting with the table's size
+  // field at the current position. Calls stop() on a bad size or short read.
+  StringTable(rstream &f) {
+    streampos s;
+    unsigned strSize;
+    char *strData;
+    s = f.tellg();
+    f >> strSize;
+    if (strSize < sizeof(unsigned))
+      stop("StringTable: Invalid string table");
+    strData = new char[strSize];
+    *(unsigned *)strData = strSize;
+    // read the raw data into <strData>
+    f.read(strData + sizeof(unsigned), strSize - sizeof(unsigned));
+    s = f.tellg() - s;
+    if (s < strSize)
+      stop("StringTable: Unexpected EOF");
+    init(strData);
+    delete[] strData;
+  }
+  // Construct string table from given strings. Only strings longer than 8
+  // characters are stored in <data>; encode() handles shorter ones directly.
+  StringTable(const set<string> &strings) {
+    char *p;
+    set<string>::const_iterator it;
+    // count required size for data
+    for (length = sizeof(unsigned), it = strings.begin(); it != strings.end();
+         ++it) {
+      size_t l = (*it).size();
+      if (l > (unsigned)0xFFFFFFFF)
+        stop("StringTable: String too long");
+      if (l > 8) {
+        length += l + 1;
+        if (length > (unsigned)0xFFFFFFFF)
+          stop("StringTable: Symbol table too long");
+      }
+    }
+    data = new char[length];
+    *(unsigned *)data = length;
+    // populate data and directory
+    for (p = data + sizeof(unsigned), it = strings.begin(); it != strings.end();
+         ++it) {
+      const string &str = *it;
+      size_t l = str.size();
+      if (l > 8) {
+        directory.insert(make_pair(str, p - data));
+        KMP_MEMCPY(p, str.c_str(), l);
+        p[l] = 0;
+        p += l + 1;
+      }
+    }
+  }
+  ~StringTable() { delete[] data; }
+  // Returns encoding for given string based on this string table: up to 8
+  // characters are stored directly, zero-padded; a longer string becomes a
+  // zero word followed by its table offset. Calls stop() if a string longer
+  // than 8 characters is not in the string table.
+  __int64 encode(const string &str) {
+    __int64 r = 0; // zero-filled, so directly encoded names are zero-padded
+    if (str.size() <= 8) {
+      // encoded directly; a name of exactly 8 characters has no terminator
+      str.copy((char *)&r, 8);
+      return r;
+    } else {
+      // represented as index into table
+      map<string, unsigned>::const_iterator it = directory.find(str);
+      if (it == directory.end())
+        stop("StringTable::encode: String not found in string table");
+      ((unsigned *)&r)[0] = 0;
+      ((unsigned *)&r)[1] = (*it).second;
+      return r;
+    }
+  }
+  // Returns string represented by x based on this string table. Calls stop()
+  // if x references a position at or beyond the end of the table.
+  string decode(__int64 x) const {
+    if (*(unsigned *)&x == 0) {
+      // represented as index into table
+      unsigned &p = ((unsigned *)&x)[1];
+      if (p >= length)
+        stop("StringTable::decode: Invalid string table lookup");
+      return string(data + p);
+    } else {
+      // encoded directly
+      char *p = (char *)&x;
+      int i;
+      for (i = 0; i < 8 && p[i]; ++i)
+        ;
+      return string(p, i);
+    }
+  }
+  // writes the <length> bytes of <data> to <os>
+  void write(ostream &os) { os.write(data, length); }
+};
+
+// For the named object file, determines the set of defined symbols and the set
+// of undefined external symbols and writes them to <defined> and <undefined>
+// respectively (both are cleared first); calls stop() on a malformed file.
+void computeExternalSymbols(const char *fileName, set<string> *defined,
+                            set<string> *undefined) {
+  streampos fileSize;
+  size_t strTabStart;
+  unsigned symTabStart, symNEntries;
+  rstream f(fileName);
+
+  f.seekg(0, ios::end);
+  fileSize = f.tellg();
+  f.seekg(8);
+  f >> symTabStart >> symNEntries;
+  // seek to the string table; a failed read or seek is reported through
+  // failbit rather than eofbit, so both are tested
+  f.seekg(strTabStart = symTabStart + 18 * (size_t)symNEntries);
+  if (f.fail() || f.eof()) {
+    printf("computeExternalSymbols: fileName='%s', fileSize = %lu, symTabStart "
+           "= %u, symNEntries = %u\n",
+           fileName, (unsigned long)fileSize, symTabStart, symNEntries);
+    stop("computeExternalSymbols: Unexpected EOF 1");
+  }
+  StringTable stringTable(f); // read the string table
+  if (f.tellg() != fileSize)
+    stop("computeExternalSymbols: Unexpected data after string table");
+
+  f.clear();
+  f.seekg(symTabStart); // seek to the symbol table
+
+  defined->clear();
+  undefined->clear();
+  for (unsigned i = 0; i < symNEntries; ++i) {
+    // process each entry
+    Symbol e;
+    if (f.eof())
+      stop("computeExternalSymbols: Unexpected EOF 2");
+    f >> e;
+    if (f.fail())
+      stop("computeExternalSymbols: File read error");
+    if (e.nAux) { // auxiliary entry: skip
+      f.seekg(e.nAux * 18, ios::cur);
+      i += e.nAux;
+    }
+    // extern symbol (class 2): defined here iff its section number is nonzero
+    if (e.storageClass == 2) {
+      if (e.sectionNum)
+        defined->insert(stringTable.decode(e.name));
+      else
+        undefined->insert(stringTable.decode(e.name));
+    }
+  }
+}
+
+// Rewrites the object file named by <fileName> in place. Each external
+// symbol (storage class 2) whose name is a member of <hide> is renamed by
+// prefixing it with "__kmp_external_". Errors are reported through stop().
+void hideSymbols(char *fileName, const set<string> &hide) {
+  static const string prefix("__kmp_external_");
+  set<string> strings; // set of all occurring symbols, appropriately prefixed
+  streampos fileSize;
+  unsigned symTabStart, symNEntries, i;
+  rstream in(fileName);
+
+  in.seekg(0, ios::end);
+  fileSize = in.tellg();
+
+  in.seekg(8);
+  in >> symTabStart >> symNEntries;
+  in.seekg(symTabStart + 18 * (size_t)symNEntries);
+  if (in.fail() || in.eof()) // a failed seek sets failbit, not eofbit
+    stop("hideSymbols: Unexpected EOF");
+  StringTable stringTableOld(in); // read original string table
+
+  if (in.tellg() != fileSize)
+    stop("hideSymbols: Unexpected data after string table");
+
+  // compute set of occurring strings with prefix added
+  for (i = 0; i < symNEntries; ++i) {
+    Symbol e;
+    in.seekg(symTabStart + i * 18);
+    if (in.fail() || in.eof())
+      stop("hideSymbols: Unexpected EOF");
+    in >> e;
+    if (in.fail())
+      stop("hideSymbols: File read error");
+    if (e.nAux)
+      i += e.nAux;
+    const string &s = stringTableOld.decode(e.name);
+    // if symbol is extern and found in <hide>, prefix and insert into strings,
+    // otherwise, just insert into strings without prefix
+    strings.insert(
+        (e.storageClass == 2 && hide.find(s) != hide.end()) ? prefix + s : s);
+  }
+
+  // NOTE(review): this truncates the file <in> was opened on -- confirm safe
+  ofstream out(fileName, ios::trunc | ios::out | ios::binary);
+  if (!out.is_open())
+    stop("hideSymbols: Error opening output file");
+
+  // make new string table from string set
+  StringTable stringTableNew = StringTable(strings);
+
+  // copy input file to output file up to just before the symbol table
+  in.seekg(0);
+  char *buf = new char[symTabStart];
+  in.read(buf, symTabStart);
+  out.write(buf, symTabStart);
+  delete[] buf;
+
+  // copy input symbol table to output symbol table with name translation
+  for (i = 0; i < symNEntries; ++i) {
+    Symbol e;
+    in.seekg(symTabStart + i * 18);
+    if (in.fail() || in.eof())
+      stop("hideSymbols: Unexpected EOF");
+    in >> e;
+    if (in.fail())
+      stop("hideSymbols: File read error");
+    const string &s = stringTableOld.decode(e.name);
+    out.seekp(symTabStart + i * 18);
+    e.name = stringTableNew.encode(
+        (e.storageClass == 2 && hide.find(s) != hide.end()) ? prefix + s : s);
+    out.write((char *)&e, 18);
+    if (out.fail())
+      stop("hideSymbols: File write error");
+    // copy auxiliary entries; nAux is saved because e is reused as buffer
+    int nAux = e.nAux;
+    for (int j = 1; j <= nAux; ++j) {
+      in >> e;
+      if (in.fail())
+        stop("hideSymbols: File read error");
+      out.seekp(symTabStart + (i + j) * 18);
+      out.write((char *)&e, 18);
+    }
+    i += nAux;
+  }
+  // output string table, then close so that a failed flush is noticed too
+  stringTableNew.write(out);
+  out.close();
+  if (out.fail())
+    stop("hideSymbols: File write error");
+}
+
+// Returns true iff <a> and <b> have no common element. Both sets are walked
+// together in ascending order, always advancing past the smaller element.
+template <class T> bool isDisjoint(const set<T> &a, const set<T> &b) {
+  // 'typename' is mandatory here: const_iterator is a name that depends on T
+  typename set<T>::const_iterator ita = a.begin(), itb = b.begin();
+  while (ita != a.end() && itb != b.end()) {
+    if (*ita < *itb)
+      ++ita;
+    else if (*itb < *ita)
+      ++itb;
+    else
+      return false; // neither element is less than the other: shared
+  }
+  return true;
+}
+
+// PRE: <defined> and <undefined> are arrays with <nTotal> elements where
+// <nTotal> >= <nExternal>. Elements [0, nExternal) correspond to the external
+// object files and elements [nExternal, nTotal) to the internal object files.
+// POST: file x is said to depend on file y if undefined[x] and defined[y]
+// share at least one symbol. The result, allocated with new, holds the
+// indexes of the external files that the internal files depend on directly
+// or through other external files; internal indexes are never part of it.
+set<int> *findRequiredExternal(int nExternal, int nTotal, set<string> *defined,
+                               set<string> *undefined) {
+  set<int> *required = new set<int>;
+  set<int> frontier, discovered; // files to expand now / found this round
+
+  // the search starts from every internal object file
+  for (int file = nExternal; file < nTotal; ++file)
+    frontier.insert(file);
+
+  // each round inspects the undefined symbols of the files added by the
+  // previous round; the search ends when a round adds nothing
+  while (!frontier.empty()) {
+    discovered.clear();
+    for (set<int>::iterator it = frontier.begin(); it != frontier.end();
+         ++it) {
+      const set<string> &needs = undefined[*it];
+
+      for (int ext = 0; ext < nExternal; ++ext) {
+        // skip externals already selected or supplying none of the symbols
+        if (required->count(ext) != 0 || isDisjoint(defined[ext], needs))
+          continue;
+        required->insert(ext);
+        discovered.insert(ext);
+      }
+    }
+    // externals found in this round are expanded in the next one
+    frontier.swap(discovered);
+  }
+
+  return required;
+}
+
+// Usage: <program> <nExternal> <object files...>, where the first <nExternal>
+// files are the external objects and the remaining ones the internal objects.
+int main(int argc, char **argv) {
+  int nExternal, nFiles, i;
+  set<string> *defined, *undefined;
+  set<int>::iterator it;
+
+  // argv[1] is only read when it exists; a missing count is treated like 0
+  nExternal = argc < 3 ? 0 : atoi(argv[1]);
+  if (nExternal <= 0)
+    stop("Please specify a positive integer followed by a list of object "
+         "filenames");
+  nFiles = argc - 2;
+  // compared in this form because nExternal + 2 could overflow an int
+  if (nExternal > nFiles)
+    stop("Too few external objects");
+  defined = new set<string>[nFiles];
+  undefined = new set<string>[nFiles];
+
+  // determine the set of defined and undefined external symbols
+  for (i = 0; i < nFiles; ++i)
+    computeExternalSymbols(argv[i + 2], defined + i, undefined + i);
+
+  // determine the set of required external files
+  set<int> *requiredExternal =
+      findRequiredExternal(nExternal, nFiles, defined, undefined);
+  set<string> hide;
+
+  // determine the set of symbols to hide--namely defined external symbols of
+  // the required external files
+  for (it = requiredExternal->begin(); it != requiredExternal->end(); ++it) {
+    // One insert per element: the insert member function taking a range
+    // doesn't exist on Windows* OS, at least at the time of this writing.
+    const set<string> &syms = defined[*it];
+    for (set<string>::const_iterator s = syms.begin(); s != syms.end(); ++s)
+      hide.insert(*s);
+  }
+
+  // process the external files--removing those that are not required and hiding
+  //   the appropriate symbols in the others
+  for (i = 0; i < nExternal; ++i) {
+    if (requiredExternal->find(i) != requiredExternal->end())
+      hideSymbols(argv[2 + i], hide);
+    else
+      remove(argv[2 + i]);
+  }
+  // hide the appropriate symbols in the internal files
+  for (i = nExternal + 2; i < argc; ++i)
+    hideSymbols(argv[i], hide);
+  return 0;
+}
diff --git a/final/runtime/src/i18n/en_US.txt b/final/runtime/src/i18n/en_US.txt
new file mode 100644
index 0000000..822f73c
--- /dev/null
+++ b/final/runtime/src/i18n/en_US.txt
@@ -0,0 +1,491 @@
+# en_US.txt #
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# Default messages, embedded into the OpenMP RTL, and source for English catalog.
+
+
+# Compatible changes (which do not require version bumping):
+#     * Editing message (number and type of placeholders must remain, relative order of
+#       placeholders may be changed, e.g. "File %1$s line %2$d" may be safely edited to
+#       "Line %2$d file %1$s").
+#     * Adding new message to the end of section.
+# Incompatible changes (version must be bumped by 1):
+#     * Introducing new placeholders to existing messages.
+#     * Changing type of placeholders (e.g. "line %1$d" -> "line %1$s").
+#     * Rearranging order of messages.
+#     * Deleting messages.
+# Use special "OBSOLETE" pseudoidentifier for obsolete entries, which are kept only for backward
+# compatibility. When version is bumped, do not forget to delete all obsolete entries.
+
+
+# --------------------------------------------------------------------------------------------------
+-*- META -*-
+# --------------------------------------------------------------------------------------------------
+
+# Meta information about message catalog.
+
+Language "English"
+Country  "USA"
+LangId   "1033"
+Version  "2"
+Revision "20170523"
+
+
+
+# --------------------------------------------------------------------------------------------------
+-*- STRINGS -*-
+# --------------------------------------------------------------------------------------------------
+
+# Strings are not complete messages, just fragments. We need to work on it and reduce number of
+# strings (to zero?).
+
+Error                        "Error"
+UnknownFile                  "(unknown file)"
+NotANumber                   "not a number"
+BadUnit                      "bad unit"
+IllegalCharacters            "illegal characters"
+ValueTooLarge                "value too large"
+ValueTooSmall                "value too small"
+NotMultiple4K                "value is not a multiple of 4k"
+UnknownTopology              "Unknown processor topology"
+CantOpenCpuinfo              "Cannot open /proc/cpuinfo"
+ProcCpuinfo                  "/proc/cpuinfo"
+NoProcRecords                "cpuinfo file invalid (No processor records)"
+TooManyProcRecords           "cpuinfo file invalid (Too many processor records)"
+CantRewindCpuinfo            "Cannot rewind cpuinfo file"
+LongLineCpuinfo              "cpuinfo file invalid (long line)"
+TooManyEntries               "cpuinfo file contains too many entries"
+MissingProcField             "cpuinfo file missing processor field"
+MissingPhysicalIDField       "cpuinfo file missing physical id field"
+MissingValCpuinfo            "cpuinfo file invalid (missing val)"
+DuplicateFieldCpuinfo        "cpuinfo file invalid (duplicate field)"
+PhysicalIDsNotUnique         "Physical node/pkg/core/thread ids not unique"
+ApicNotPresent               "APIC not present"
+InvalidCpuidInfo             "Invalid cpuid info"
+OBSOLETE                     "APIC ids not unique"
+InconsistentCpuidInfo        "Inconsistent cpuid info"
+OutOfHeapMemory              "Out of heap memory"
+MemoryAllocFailed            "Memory allocation failed"
+Core                         "core"
+Thread                       "thread"
+Package                      "package"
+Node                         "node"
+OBSOLETE                     "<undef>"
+DecodingLegacyAPIC           "decoding legacy APIC ids"
+OBSOLETE                     "parsing /proc/cpuinfo"
+NotDefined                   "value is not defined"
+EffectiveSettings            "Effective settings:"
+UserSettings                 "User settings:"
+StorageMapWarning            "warning: pointers or size don't make sense"
+OBSOLETE                     "CPU"
+OBSOLETE                     "TPU"
+OBSOLETE                     "TPUs per package"
+OBSOLETE                     "HT enabled"
+OBSOLETE                     "HT disabled"
+Decodingx2APIC               "decoding x2APIC ids"
+NoLeaf11Support              "cpuid leaf 11 not supported"
+NoLeaf4Support               "cpuid leaf 4 not supported"
+ThreadIDsNotUnique           "thread ids not unique"
+UsingPthread                 "using pthread info"
+LegacyApicIDsNotUnique       "legacy APIC ids not unique"
+x2ApicIDsNotUnique           "x2APIC ids not unique"
+DisplayEnvBegin		     "OPENMP DISPLAY ENVIRONMENT BEGIN"
+DisplayEnvEnd		     "OPENMP DISPLAY ENVIRONMENT END"
+Device			     "[device]"
+Host			     "[host]"
+Tile                         "tile"
+
+
+
+# --------------------------------------------------------------------------------------------------
+-*- FORMATS -*-
+# --------------------------------------------------------------------------------------------------
+
+Info                         "OMP: Info #%1$d: %2$s\n"
+Warning                      "OMP: Warning #%1$d: %2$s\n"
+Fatal                        "OMP: Error #%1$d: %2$s\n"
+SysErr                       "OMP: System error #%1$d: %2$s\n"
+Hint                         "OMP: Hint %1$s\n"
+
+Pragma                       "%1$s pragma (at %2$s:%3$s():%4$s)"
+    # %1 is pragma name (like "parallel" or "master"),
+    # %2 is file name,
+    # %3 is function (routine) name,
+    # %4 is the line number (as string, so "s" type specifier should be used).
+
+
+
+# --------------------------------------------------------------------------------------------------
+-*- MESSAGES -*-
+# --------------------------------------------------------------------------------------------------
+
+# Messages of any severity: informational, warning, or fatal.
+# To maintain message numbers (they are visible to customers), add new messages to the end.
+
+# Use following prefixes for messages and hints when appropriate:
+#    Aff -- Affinity messages.
+#    Cns -- Consistency check failures (KMP_CONSISTENCY_CHECK).
+#    Itt -- ITT Notify-related messages.
+
+LibraryIsSerial              "Library is \"serial\"."
+CantOpenMessageCatalog       "Cannot open message catalog \"%1$s\":"
+WillUseDefaultMessages       "Default messages will be used."
+LockIsUninitialized          "%1$s: Lock is uninitialized"
+LockSimpleUsedAsNestable     "%1$s: Lock was initialized as simple, but used as nestable"
+LockNestableUsedAsSimple     "%1$s: Lock was initialized as nestable, but used as simple"
+LockIsAlreadyOwned           "%1$s: Lock is already owned by requesting thread"
+LockStillOwned               "%1$s: Lock is still owned by a thread"
+LockUnsettingFree            "%1$s: Attempt to release a lock not owned by any thread"
+LockUnsettingSetByAnother    "%1$s: Attempt to release a lock owned by another thread"
+StackOverflow                "Stack overflow detected for OpenMP thread #%1$d"
+StackOverlap                 "Stack overlap detected. "
+AssertionFailure             "Assertion failure at %1$s(%2$d)."
+CantRegisterNewThread        "Unable to register a new user thread."
+DuplicateLibrary             "Initializing %1$s, but found %2$s already initialized."
+CantOpenFileForReading       "Cannot open file \"%1$s\" for reading:"
+CantGetEnvVar                "Getting environment variable \"%1$s\" failed:"
+CantSetEnvVar                "Setting environment variable \"%1$s\" failed:"
+CantGetEnvironment           "Getting environment failed:"
+BadBoolValue                 "%1$s=\"%2$s\": Wrong value, boolean expected."
+SSPNotBuiltIn                "No Helper Thread support built in this OMP library."
+SPPSotfTerminateFailed       "Helper thread failed to soft terminate."
+BufferOverflow               "Buffer overflow detected."
+RealTimeSchedNotSupported    "Real-time scheduling policy is not supported."
+RunningAtMaxPriority         "OMP application is running at maximum priority with real-time scheduling policy. "
+CantChangeMonitorPriority    "Changing priority of the monitor thread failed:"
+MonitorWillStarve            "Deadlocks are highly possible due to monitor thread starvation."
+CantSetMonitorStackSize      "Unable to set monitor thread stack size to %1$lu bytes:"
+CantSetWorkerStackSize       "Unable to set OMP thread stack size to %1$lu bytes:"
+CantInitThreadAttrs          "Thread attribute initialization failed:"
+CantDestroyThreadAttrs       "Thread attribute destroying failed:"
+CantSetWorkerState           "OMP thread joinable state setting failed:"
+CantSetMonitorState          "Monitor thread joinable state setting failed:"
+NoResourcesForWorkerThread   "System unable to allocate necessary resources for OMP thread:"
+NoResourcesForMonitorThread  "System unable to allocate necessary resources for the monitor thread:"
+CantTerminateWorkerThread    "Unable to terminate OMP thread:"
+ScheduleKindOutOfRange       "Wrong schedule type %1$d, see <omp.h> or <omp_lib.h> file for the list of values supported."
+UnknownSchedulingType        "Unknown scheduling type \"%1$d\"."
+InvalidValue                 "%1$s value \"%2$s\" is invalid."
+SmallValue                   "%1$s value \"%2$s\" is too small."
+LargeValue                   "%1$s value \"%2$s\" is too large."
+StgInvalidValue              "%1$s: \"%2$s\" is an invalid value; ignored."
+BarrReleaseValueInvalid      "%1$s release value \"%2$s\" is invalid."
+BarrGatherValueInvalid       "%1$s gather value \"%2$s\" is invalid."
+OBSOLETE                     "%1$s supported only on debug builds; ignored."
+ParRangeSyntax               "Syntax error: Usage: %1$s=[ routine=<func> | filename=<file> | range=<lb>:<ub> "
+                             "| excl_range=<lb>:<ub> ],..."
+UnbalancedQuotes             "Unbalanced quotes in %1$s."
+EmptyString                  "Empty string specified for %1$s; ignored."
+LongValue                    "%1$s value is too long; ignored."
+InvalidClause                "%1$s: Invalid clause in \"%2$s\"."
+EmptyClause                  "Empty clause in %1$s."
+InvalidChunk                 "%1$s value \"%2$s\" is invalid chunk size."
+LargeChunk                   "%1$s value \"%2$s\" is to large chunk size."
+IgnoreChunk                  "%1$s value \"%2$s\" is ignored."
+CantGetProcFreq              "Cannot get processor frequency, using zero KMP_ITT_PREPARE_DELAY."
+EnvParallelWarn              "%1$s must be set prior to first parallel region; ignored."
+AffParamDefined              "%1$s: parameter has been specified already, ignoring \"%2$s\"."
+AffInvalidParam              "%1$s: parameter invalid, ignoring \"%2$s\"."
+AffManyParams                "%1$s: too many integer parameters specified, ignoring \"%2$s\"."
+AffManyParamsForLogic        "%1$s: too many integer parameters specified for logical or physical type, ignoring \"%2$d\"."
+AffNoParam                   "%1$s: '%2$s' type does not take any integer parameters, ignoring them."
+AffNoProcList                "%1$s: proclist not specified with explicit affinity type, using \"none\"."
+AffProcListNoType            "%1$s: proclist specified, setting affinity type to \"explicit\"."
+AffProcListNotExplicit       "%1$s: proclist specified without \"explicit\" affinity type, proclist ignored."
+AffSyntaxError               "%1$s: syntax error, not using affinity."
+AffZeroStride                "%1$s: range error (zero stride), not using affinity."
+AffStartGreaterEnd           "%1$s: range error (%2$d > %3$d), not using affinity."
+AffStrideLessZero            "%1$s: range error (%2$d < %3$d & stride < 0), not using affinity."
+AffRangeTooBig               "%1$s: range error ((%2$d-%3$d)/%4$d too big), not using affinity."
+OBSOLETE                     "%1$s: %2$s is defined. %3$s will be ignored."
+AffNotSupported              "%1$s: affinity not supported, using \"disabled\"."
+OBSOLETE                     "%1$s: affinity only supported for Intel(R) Architecture Processors."
+GetAffSysCallNotSupported    "%1$s: getaffinity system call not supported."
+SetAffSysCallNotSupported    "%1$s: setaffinity system call not supported."
+OBSOLETE                     "%1$s: pthread_aff_set_np call not found."
+OBSOLETE                     "%1$s: pthread_get_num_resources_np call not found."
+OBSOLETE                     "%1$s: the OS kernel does not support affinity."
+OBSOLETE                     "%1$s: pthread_get_num_resources_np returned %2$d."
+AffCantGetMaskSize           "%1$s: cannot determine proper affinity mask size."
+ParseSizeIntWarn             "%1$s=\"%2$s\": %3$s."
+ParseExtraCharsWarn          "%1$s: extra trailing characters ignored: \"%2$s\"."
+UnknownForceReduction        "%1$s: unknown method \"%2$s\"."
+TimerUseGettimeofday         "KMP_STATS_TIMER: clock_gettime is undefined, using gettimeofday."
+TimerNeedMoreParam           "KMP_STATS_TIMER: \"%1$s\" needs additional parameter, e.g. 'clock_gettime,2'. Using gettimeofday."
+TimerInvalidParam            "KMP_STATS_TIMER: clock_gettime parameter \"%1$s\" is invalid, using gettimeofday."
+TimerGettimeFailed           "KMP_STATS_TIMER: clock_gettime failed, using gettimeofday."
+TimerUnknownFunction         "KMP_STATS_TIMER: clock function unknown (ignoring value \"%1$s\")."
+UnknownSchedTypeDetected     "Unknown scheduling type detected."
+DispatchManyThreads          "Too many threads to use analytical guided scheduling - switching to iterative guided scheduling."
+IttLookupFailed              "ittnotify: Lookup of \"%1$s\" function in \"%2$s\" library failed."
+IttLoadLibFailed             "ittnotify: Loading \"%1$s\" library failed."
+IttAllNotifDisabled          "ittnotify: All itt notifications disabled."
+IttObjNotifDisabled          "ittnotify: Object state itt notifications disabled."
+IttMarkNotifDisabled         "ittnotify: Mark itt notifications disabled."
+IttUnloadLibFailed           "ittnotify: Unloading \"%1$s\" library failed."
+CantFormThrTeam              "Cannot form a team with %1$d threads, using %2$d instead."
+ActiveLevelsNegative         "Requested number of active parallel levels \"%1$d\" is negative; ignored."
+ActiveLevelsExceedLimit      "Requested number of active parallel levels \"%1$d\" exceeds supported limit; "
+                             "the following limit value will be used: \"%1$d\"."
+SetLibraryIncorrectCall      "kmp_set_library must only be called from the top level serial thread; ignored."
+FatalSysError                "Fatal system error detected."
+OutOfHeapMemory              "Out of heap memory."
+OBSOLETE                     "Clearing __KMP_REGISTERED_LIB env var failed."
+OBSOLETE                     "Registering library with env var failed."
+Using_int_Value              "%1$s value \"%2$d\" will be used."
+Using_uint_Value             "%1$s value \"%2$u\" will be used."
+Using_uint64_Value           "%1$s value \"%2$s\" will be used."
+Using_str_Value              "%1$s value \"%2$s\" will be used."
+MaxValueUsing                "%1$s maximum value \"%2$d\" will be used."
+MinValueUsing                "%1$s minimum value \"%2$d\" will be used."
+MemoryAllocFailed            "Memory allocation failed."
+FileNameTooLong              "File name too long."
+OBSOLETE                     "Lock table overflow."
+ManyThreadsForTPDirective    "Too many threads to use threadprivate directive."
+AffinityInvalidMask          "%1$s: invalid mask."
+WrongDefinition              "Wrong definition."
+TLSSetValueFailed            "Windows* OS: TLS Set Value failed."
+TLSOutOfIndexes              "Windows* OS: TLS out of indexes."
+OBSOLETE                     "PDONE directive must be nested within a DO directive."
+CantGetNumAvailCPU           "Cannot get number of available CPUs."
+AssumedNumCPU                "Assumed number of CPUs is 2."
+ErrorInitializeAffinity      "Error initializing affinity - not using affinity."
+AffThreadsMayMigrate         "Threads may migrate across all available OS procs (granularity setting too coarse)."
+AffIgnoreInvalidProcID       "Ignoring invalid OS proc ID %1$d."
+AffNoValidProcID             "No valid OS proc IDs specified - not using affinity."
+UsingFlatOS                  "%1$s - using \"flat\" OS <-> physical proc mapping."
+UsingFlatOSFile              "%1$s: %2$s - using \"flat\" OS <-> physical proc mapping."
+UsingFlatOSFileLine          "%1$s, line %2$d: %3$s - using \"flat\" OS <-> physical proc mapping."
+FileMsgExiting               "%1$s: %2$s - exiting."
+FileLineMsgExiting           "%1$s, line %2$d: %3$s - exiting."
+ConstructIdentInvalid        "Construct identifier invalid."
+ThreadIdentInvalid           "Thread identifier invalid."
+RTLNotInitialized            "runtime library not initialized."
+TPCommonBlocksInconsist      "Inconsistent THREADPRIVATE common block declarations are non-conforming "
+                             "and are unsupported. Either all threadprivate common blocks must be declared "
+                             "identically, or the largest instance of each threadprivate common block "
+                             "must be referenced first during the run."
+CantSetThreadAffMask         "Cannot set thread affinity mask."
+CantSetThreadPriority        "Cannot set thread priority."
+CantCreateThread             "Cannot create thread."
+CantCreateEvent              "Cannot create event."
+CantSetEvent                 "Cannot set event."
+CantCloseHandle              "Cannot close handle."
+UnknownLibraryType           "Unknown library type: %1$d."
+ReapMonitorError             "Monitor did not reap properly."
+ReapWorkerError              "Worker thread failed to join."
+ChangeThreadAffMaskError     "Cannot change thread affinity mask."
+ThreadsMigrate               "%1$s: Threads may migrate across %2$d innermost levels of machine"
+DecreaseToThreads            "%1$s: decrease to %2$d threads"
+IncreaseToThreads            "%1$s: increase to %2$d threads"
+OBSOLETE                     "%1$s: Internal thread %2$d bound to OS proc set %3$s"
+AffCapableUseCpuinfo         "%1$s: Affinity capable, using cpuinfo file"
+AffUseGlobCpuid              "%1$s: Affinity capable, using global cpuid info"
+AffCapableUseFlat            "%1$s: Affinity capable, using default \"flat\" topology"
+AffNotCapableUseLocCpuid     "%1$s: Affinity not capable, using local cpuid info"
+AffNotCapableUseCpuinfo      "%1$s: Affinity not capable, using cpuinfo file"
+AffFlatTopology              "%1$s: Affinity not capable, assumming \"flat\" topology"
+InitOSProcSetRespect         "%1$s: Initial OS proc set respected: %2$s"
+InitOSProcSetNotRespect      "%1$s: Initial OS proc set not respected: %2$s"
+AvailableOSProc              "%1$s: %2$d available OS procs"
+Uniform                      "%1$s: Uniform topology"
+NonUniform                   "%1$s: Nonuniform topology"
+Topology                     "%1$s: %2$d packages x %3$d cores/pkg x %4$d threads/core (%5$d total cores)"
+OBSOLETE                     "%1$s: OS proc to physical thread map ([] => level not in map):"
+OSProcToPackage              "%1$s: OS proc <n> maps to <n>th package core 0"
+OBSOLETE                     "%1$s: OS proc %2$d maps to package %3$d [core %4$d] [thread %5$d]"
+OBSOLETE                     "%1$s: OS proc %2$d maps to [package %3$d] [core %4$d] [thread %5$d]"
+OBSOLETE                     "%1$s: OS proc %2$d maps to [package %3$d] [core %4$d] thread %5$d"
+OBSOLETE                     "%1$s: OS proc %2$d maps to [package %3$d] core %4$d [thread %5$d]"
+OBSOLETE                     "%1$s: OS proc %2$d maps to package %3$d [core %4$d] [thread %5$d]"
+OBSOLETE                     "%1$s: OS proc %2$d maps to [package %3$d] core %4$d thread %5$d"
+OBSOLETE                     "%1$s: OS proc %2$d maps to package %3$d core %4$d [thread %5$d]"
+OBSOLETE                     "%1$s: OS proc %2$d maps to package %3$d [core %4$d] thread %5$d"
+OBSOLETE                     "%1$s: OS proc %2$d maps to package %3$d core %4$d thread %5$d"
+OSProcMapToPack              "%1$s: OS proc %2$d maps to %3$s"
+OBSOLETE                     "%1$s: Internal thread %2$d changed affinity mask from %3$s to %4$s"
+OBSOLETE                     "%1$s: OS proc %2$d maps to package %3$d, CPU %4$d, TPU %5$d"
+OBSOLETE                     "%1$s: OS proc %2$d maps to package %3$d, CPU %4$d"
+OBSOLETE                     "%1$s: HT enabled; %2$d packages; %3$d TPU; %4$d TPUs per package"
+OBSOLETE                     "%1$s: HT disabled; %2$d packages"
+BarriersInDifferentOrder     "Threads encountered barriers in different order. "
+FunctionError                "Function %1$s failed:"
+TopologyExtra                "%1$s: %2$s packages x %3$d cores/pkg x %4$d threads/core (%5$d total cores)"
+WrongMessageCatalog          "Incompatible message catalog \"%1$s\": Version \"%2$s\" found, version \"%3$s\" expected."
+StgIgnored                   "%1$s: ignored because %2$s has been defined"
+                                 # %1 -- name of ignored variable, %2 -- name of variable with higher priority.
+OBSOLETE                     "%1$s: overrides %3$s specified before"
+                                 # %1, %2 -- name and value of the overriding variable, %3 -- name of overridden variable.
+AffTilesNoHWLOC              "%1$s: Tiles are only supported if KMP_TOPOLOGY_METHOD=hwloc, using granularity=package instead"
+AffTilesNoTiles              "%1$s: Tiles requested but were not detected on this HW, using granularity=package instead"
+TopologyExtraTile            "%1$s: %2$d packages x %3$d tiles/pkg x %4$d cores/tile x %5$d threads/core (%6$d total cores)"
+TopologyExtraNode            "%1$s: %2$d packages x %3$d nodes/pkg x %4$d cores/node x %5$d threads/core (%6$d total cores)"
+TopologyExtraNoTi            "%1$s: %2$d packages x %3$d nodes/pkg x %4$d tiles/node x %5$d cores/tile x %6$d threads/core (%7$d total cores)"
+OmptOutdatedWorkshare        "OMPT: Cannot determine workshare type; using the default (loop) instead. "
+                             "This issue is fixed in an up-to-date compiler."
+OmpNoAllocator               "Allocator %1$s is not available, will use default allocator."
+
+# --- OpenMP errors detected at runtime ---
+#
+#    %1 is the name of OpenMP construct (formatted with "Pragma" format).
+#
+CnsBoundToWorksharing        "%1$s must be bound to a work-sharing or work-queuing construct with an \"ordered\" clause"
+CnsDetectedEnd               "Detected end of %1$s without first executing a corresponding beginning."
+CnsIterationRangeTooLarge    "Iteration range too large in %1$s."
+CnsLoopIncrZeroProhibited    "%1$s must not have a loop increment that evaluates to zero."
+#
+#    %1 is the name of the first OpenMP construct, %2 -- the name of the second one (both formatted with "Pragma" format).
+#
+CnsExpectedEnd               "Expected end of %1$s; %2$s, however, has most recently begun execution."
+CnsInvalidNesting            "%1$s is incorrectly nested within %2$s"
+CnsMultipleNesting           "%1$s cannot be executed multiple times during execution of one parallel iteration/section of %2$s"
+CnsNestingSameName           "%1$s is incorrectly nested within %2$s of the same name"
+CnsNoOrderedClause           "%1$s is incorrectly nested within %2$s that does not have an \"ordered\" clause"
+CnsNotInTaskConstruct        "%1$s is incorrectly nested within %2$s but not within any of its \"task\" constructs"
+CnsThreadsAtBarrier          "One thread at %1$s while another thread is at %2$s."
+
+# New errors
+CantConnect                  "Cannot connect to %1$s"
+CantConnectUsing             "Cannot connect to %1$s - Using %2$s"
+LibNotSupport                "%1$s does not support %2$s. Continuing without using %2$s."
+LibNotSupportFor             "%1$s does not support %2$s for %3$s. Continuing without using %2$s."
+StaticLibNotSupport          "Static %1$s does not support %2$s. Continuing without using %2$s."
+OBSOLETE                     "KMP_DYNAMIC_MODE=irml cannot be used with KMP_USE_IRML=0"
+IttUnknownGroup              "ittnotify: Unknown group \"%2$s\" specified in environment variable \"%1$s\"."
+IttEnvVarTooLong             "ittnotify: Environment variable \"%1$s\" too long: Actual lengths is %2$lu, max allowed length is %3$lu."
+AffUseGlobCpuidL11           "%1$s: Affinity capable, using global cpuid leaf 11 info"
+AffNotCapableUseLocCpuidL11  "%1$s: Affinity not capable, using local cpuid leaf 11 info"
+AffInfoStr                   "%1$s: %2$s."
+AffInfoStrStr                "%1$s: %2$s - %3$s."
+OSProcToPhysicalThreadMap    "%1$s: OS proc to physical thread map:"
+AffUsingFlatOS               "%1$s: using \"flat\" OS <-> physical proc mapping."
+AffParseFilename             "%1$s: parsing %2$s."
+MsgExiting                   "%1$s - exiting."
+IncompatibleLibrary          "Incompatible %1$s library with version %2$s found."
+IttFunctionError             "ittnotify: Function %1$s failed:"
+IttUnknownError              "ittnotify: Error #%1$d."
+EnvMiddleWarn                "%1$s must be set prior to first parallel region or certain API calls; ignored."
+CnsLockNotDestroyed          "Lock initialized at %1$s(%2$d) was not destroyed"
+                                 # %1, %2 -- file, line (the message text has no %3/%4 func/col placeholders)
+CantLoadBalUsing             "Cannot determine machine load balance - Using %1$s"
+AffNotCapableUsePthread      "%1$s: Affinity not capable, using pthread info"
+AffUsePthread                "%1$s: Affinity capable, using pthread info"
+OBSOLETE                     "Loading \"%1$s\" library failed:"
+OBSOLETE                     "Lookup of \"%1$s\" function failed:"
+OBSOLETE                     "Buffer too small."
+OBSOLETE                     "Error #%1$d."
+NthSyntaxError               "%1$s: Invalid symbols found. Check the value \"%2$s\"."
+NthSpacesNotAllowed          "%1$s: Spaces between digits are not allowed \"%2$s\"."
+AffStrParseFilename          "%1$s: %2$s - parsing %3$s."
+OBSOLETE                     "%1$s cannot be specified via kmp_set_defaults() on this machine because it has more than one processor group."
+AffTypeCantUseMultGroups     "Cannot use affinity type \"%1$s\" with multiple Windows* OS processor groups, using \"%2$s\"."
+AffGranCantUseMultGroups     "Cannot use affinity granularity \"%1$s\" with multiple Windows* OS processor groups, using \"%2$s\"."
+AffWindowsProcGroupMap       "%1$s: Mapping Windows* OS processor group <i> proc <j> to OS proc 64*<i>+<j>."
+AffOSProcToGroup             "%1$s: OS proc %2$d maps to Windows* OS processor group %3$d proc %4$d"
+AffBalancedNotAvail          "%1$s: Affinity balanced is not available."
+OBSOLETE                     "%1$s: granularity=core will be used."
+EnvLockWarn                  "%1$s must be set prior to first OMP lock call or critical section; ignored."
+FutexNotSupported            "futex system call not supported; %1$s=%2$s ignored."
+AffGranUsing                 "%1$s: granularity=%2$s will be used."
+AffHWSubsetInvalid           "%1$s: invalid value \"%2$s\", valid format is \"N<item>[@N][,...][,Nt] "
+                             "(<item> can be S, N, L2, C, T  for Socket, NUMA Node, L2 Cache, Core, Thread)\"."
+AffHWSubsetUnsupported       "KMP_HW_SUBSET ignored: unsupported architecture."
+AffHWSubsetManyCores         "KMP_HW_SUBSET ignored: too many cores requested."
+SyntaxErrorUsing             "%1$s: syntax error, using %2$s."
+AdaptiveNotSupported         "%1$s: Adaptive locks are not supported; using queuing."
+EnvSyntaxError               "%1$s: Invalid symbols found. Check the value \"%2$s\"."
+EnvSpacesNotAllowed          "%1$s: Spaces between digits are not allowed \"%2$s\"."
+BoundToOSProcSet             "%1$s: pid %2$d tid %3$d thread %4$d bound to OS proc set %5$s"
+CnsLoopIncrIllegal           "%1$s error: parallel loop increment and condition are inconsistent."
+NoGompCancellation           "libgomp cancellation is not currently supported."
+AffHWSubsetNonUniform        "KMP_HW_SUBSET ignored: non-uniform topology."
+AffHWSubsetNonThreeLevel     "KMP_HW_SUBSET ignored: only three-level topology is supported."
+AffGranTopGroup              "%1$s: granularity=%2$s is not supported with KMP_TOPOLOGY_METHOD=group. Using \"granularity=fine\"."
+AffGranGroupType             "%1$s: granularity=group is not supported with KMP_AFFINITY=%2$s. Using \"granularity=core\"."
+AffHWSubsetManySockets       "KMP_HW_SUBSET ignored: too many sockets requested."
+AffHWSubsetDeprecated        "KMP_HW_SUBSET \"o\" offset designator deprecated, please use @ prefix for offset value."
+AffUsingHwloc                "%1$s: Affinity capable, using hwloc."
+AffIgnoringHwloc             "%1$s: Ignoring hwloc mechanism."
+AffHwlocErrorOccurred        "%1$s: Hwloc failed in %2$s. Relying on internal affinity mechanisms."
+EnvSerialWarn                "%1$s must be set prior to OpenMP runtime library initialization; ignored."
+EnvVarDeprecated             "%1$s variable deprecated, please use %2$s instead."
+RedMethodNotSupported        "KMP_FORCE_REDUCTION: %1$s method is not supported; using critical."
+AffHWSubsetNoHWLOC           "KMP_HW_SUBSET ignored: unsupported item requested for non-HWLOC topology method (KMP_TOPOLOGY_METHOD)"
+AffHWSubsetManyNodes         "KMP_HW_SUBSET ignored: too many NUMA Nodes requested."
+AffHWSubsetManyTiles         "KMP_HW_SUBSET ignored: too many L2 Caches requested."
+AffHWSubsetManyProcs         "KMP_HW_SUBSET ignored: too many Procs requested."
+HierSchedInvalid             "Hierarchy ignored: unsupported level: %1$s."
+AffFormatDefault             "OMP: pid %1$s tid %2$s thread %3$s bound to OS proc set {%4$s}"
+APIDeprecated                "%1$s routine deprecated, please use %2$s instead."
+
+# --------------------------------------------------------------------------------------------------
+-*- HINTS -*-
+# --------------------------------------------------------------------------------------------------
+
+# Hints. A hint may be printed after a message. Usually it is a longer explanation or a suggestion.
+# To maintain hint numbers (they are visible to customers), add new hints to the end.
+
+SubmitBugReport              "Please submit a bug report with this message, compile and run "
+                             "commands used, and machine configuration info including native "
+                             "compiler and operating system versions. Faster response will be "
+                             "obtained by including all program sources. For information on "
+                             "submitting this issue, please see "
+                             "https://bugs.llvm.org/."
+OBSOLETE                     "Check NLSPATH environment variable, its value is \"%1$s\"."
+ChangeStackLimit             "Please try changing the shell stack limit or adjusting the "
+                             "OMP_STACKSIZE environment variable."
+Unset_ALL_THREADS            "Consider unsetting KMP_DEVICE_THREAD_LIMIT (KMP_ALL_THREADS), KMP_TEAMS_THREAD_LIMIT, and OMP_THREAD_LIMIT (if any are set)."
+Set_ALL_THREADPRIVATE        "Consider setting KMP_ALL_THREADPRIVATE to a value larger than %1$d."
+PossibleSystemLimitOnThreads "This could also be due to a system-related limit on the number of threads."
+DuplicateLibrary             "This means that multiple copies of the OpenMP runtime have been "
+                             "linked into the program. That is dangerous, since it can degrade "
+                             "performance or cause incorrect results. "
+                             "The best thing to do is to ensure that only a single OpenMP runtime is "
+                             "linked into the process, e.g. by avoiding static linking of the OpenMP "
+                             "runtime in any library. As an unsafe, unsupported, undocumented workaround "
+                             "you can set the environment variable KMP_DUPLICATE_LIB_OK=TRUE to allow "
+                             "the program to continue to execute, but that may cause crashes or "
+                             "silently produce incorrect results. "
+                             "For more information, please see http://openmp.llvm.org/"
+NameComesFrom_CPUINFO_FILE   "This name is specified in environment variable KMP_CPUINFO_FILE."
+NotEnoughMemory              "Seems application required too much memory."
+ValidBoolValues              "Use \"0\", \"FALSE\", \".F.\", \"off\", \"no\" as false values, "
+                             "\"1\", \"TRUE\", \".T.\", \"on\", \"yes\" as true values."
+BufferOverflow               "Perhaps too many threads."
+RunningAtMaxPriority         "Decrease priority of application. "
+                             "This will allow the monitor thread to run at higher priority than other threads."
+ChangeMonitorStackSize       "Try changing KMP_MONITOR_STACKSIZE or the shell stack limit."
+ChangeWorkerStackSize        "Try changing OMP_STACKSIZE and/or the shell stack limit."
+IncreaseWorkerStackSize      "Try increasing OMP_STACKSIZE or the shell stack limit."
+DecreaseWorkerStackSize      "Try decreasing OMP_STACKSIZE."
+Decrease_NUM_THREADS         "Try decreasing the value of OMP_NUM_THREADS."
+IncreaseMonitorStackSize     "Try increasing KMP_MONITOR_STACKSIZE."
+DecreaseMonitorStackSize     "Try decreasing KMP_MONITOR_STACKSIZE."
+DecreaseNumberOfThreadsInUse "Try decreasing the number of threads in use simultaneously."
+DefaultScheduleKindUsed      "Will use default schedule type (%1$s)."
+GetNewerLibrary              "It could be a result of using an older OMP library with a newer "
+                             "compiler or memory corruption. You may check the proper OMP library "
+                             "is linked to the application."
+CheckEnvVar                  "Check %1$s environment variable, its value is \"%2$s\"."
+OBSOLETE                     "You may want to use an %1$s library that supports %2$s interface with version %3$s."
+OBSOLETE                     "You may want to use an %1$s library with version %2$s."
+BadExeFormat                 "System error #193 is \"Bad format of EXE or DLL file\". "
+                             "Usually it means the file is found, but it is corrupted or "
+                             "a file for another architecture. "
+                             "Check whether \"%1$s\" is a file for %2$s architecture."
+SystemLimitOnThreads         "System-related limit on the number of threads."
+
+
+
+# --------------------------------------------------------------------------------------------------
+# end of file #
+# --------------------------------------------------------------------------------------------------
diff --git a/final/runtime/src/include/omp-tools.h.var b/final/runtime/src/include/omp-tools.h.var
new file mode 100644
index 0000000..190b538
--- /dev/null
+++ b/final/runtime/src/include/omp-tools.h.var
@@ -0,0 +1,1082 @@
+/*
+ * include/omp-tools.h.var
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __OMPT__
+#define __OMPT__
+
+/*****************************************************************************
+ * system include files
+ *****************************************************************************/
+
+#include <stdint.h>
+#include <stddef.h>
+
+/*****************************************************************************
+ * iteration macros
+ *****************************************************************************/
+
+#define FOREACH_OMPT_INQUIRY_FN(macro)      /* X-macro: expands macro(name) once per inquiry function name listed below */ \
+    macro (ompt_enumerate_states)           \
+    macro (ompt_enumerate_mutex_impls)      \
+                                            \
+    macro (ompt_set_callback)               \
+    macro (ompt_get_callback)               \
+                                            \
+    macro (ompt_get_state)                  \
+                                            \
+    macro (ompt_get_parallel_info)          \
+    macro (ompt_get_task_info)              \
+    macro (ompt_get_task_memory)            \
+    macro (ompt_get_thread_data)            \
+    macro (ompt_get_unique_id)              \
+    macro (ompt_finalize_tool)              \
+                                            \
+    macro(ompt_get_num_procs)               \
+    macro(ompt_get_num_places)              \
+    macro(ompt_get_place_proc_ids)          \
+    macro(ompt_get_place_num)               \
+    macro(ompt_get_partition_place_nums)    \
+    macro(ompt_get_proc_id)                 \
+                                            \
+    macro(ompt_get_target_info)             \
+    macro(ompt_get_num_devices)
+
+#define FOREACH_OMPT_STATE(macro)                                                                \
+                                                                                                \
+    /* first available state */                                                                 \
+    macro (ompt_state_undefined, 0x102)      /* undefined thread state */                        \
+                                                                                                \
+    /* work states (0..15) */                                                                   \
+    macro (ompt_state_work_serial, 0x000)    /* working outside parallel */                      \
+    macro (ompt_state_work_parallel, 0x001)  /* working within parallel */                       \
+    macro (ompt_state_work_reduction, 0x002) /* performing a reduction */                        \
+                                                                                                \
+    /* barrier wait states (16..31) */                                                          \
+    macro (ompt_state_wait_barrier, 0x010)   /* waiting at a barrier */                          \
+    macro (ompt_state_wait_barrier_implicit_parallel, 0x011)                                     \
+                                            /* implicit barrier at the end of parallel region */\
+    macro (ompt_state_wait_barrier_implicit_workshare, 0x012)                                    \
+                                            /* implicit barrier at the end of worksharing */    \
+    macro (ompt_state_wait_barrier_implicit, 0x013)  /* implicit barrier */                      \
+    macro (ompt_state_wait_barrier_explicit, 0x014)  /* explicit barrier */                      \
+                                                                                                \
+    /* task wait states (32..63) */                                                             \
+    macro (ompt_state_wait_taskwait, 0x020)  /* waiting at a taskwait */                         \
+    macro (ompt_state_wait_taskgroup, 0x021) /* waiting at a taskgroup */                        \
+                                                                                                \
+    /* mutex wait states (64..127) */                                                           \
+    macro (ompt_state_wait_mutex, 0x040)     /* waiting for mutex */                             \
+    macro (ompt_state_wait_lock, 0x041)      /* waiting for lock */                              \
+    macro (ompt_state_wait_critical, 0x042)  /* waiting for critical */                          \
+    macro (ompt_state_wait_atomic, 0x043)    /* waiting for atomic */                            \
+    macro (ompt_state_wait_ordered, 0x044)   /* waiting for ordered */                           \
+                                                                                                \
+    /* target wait states (128..255) */                                                         \
+    macro (ompt_state_wait_target, 0x080)        /* waiting for target region */                 \
+    macro (ompt_state_wait_target_map, 0x081)    /* waiting for target data mapping operation */ \
+    macro (ompt_state_wait_target_update, 0x082) /* waiting for target update operation */       \
+                                                                                                \
+    /* misc (256..511) */                                                                       \
+    macro (ompt_state_idle, 0x100)           /* waiting for work */                              \
+    macro (ompt_state_overhead, 0x101)       /* overhead excluding wait states */                \
+                                                                                                \
+    /* implementation-specific states (512..) */
+
+
+#define FOREACH_KMP_MUTEX_IMPL(macro)      /* X-macro: expands macro(name, code) per mutex implementation */ \
+    macro (kmp_mutex_impl_none, 0)         /* unknown implementation */              \
+    macro (kmp_mutex_impl_spin, 1)         /* based on spin */                       \
+    macro (kmp_mutex_impl_queuing, 2)      /* based on some fair policy */           \
+    macro (kmp_mutex_impl_speculative, 3)  /* based on HW-supported speculation */
+
+#define FOREACH_OMPT_EVENT(macro)                                                                                        \
+                                                                                                                         \
+    /*--- Mandatory Events ---*/                                                                                         \
+    macro (ompt_callback_thread_begin,      ompt_callback_thread_begin_t,       1) /* thread begin                    */ \
+    macro (ompt_callback_thread_end,        ompt_callback_thread_end_t,         2) /* thread end                      */ \
+                                                                                                                         \
+    macro (ompt_callback_parallel_begin,    ompt_callback_parallel_begin_t,     3) /* parallel begin                  */ \
+    macro (ompt_callback_parallel_end,      ompt_callback_parallel_end_t,       4) /* parallel end                    */ \
+                                                                                                                         \
+    macro (ompt_callback_task_create,       ompt_callback_task_create_t,        5) /* task create                     */ \
+    macro (ompt_callback_task_schedule,     ompt_callback_task_schedule_t,      6) /* task schedule                   */ \
+    macro (ompt_callback_implicit_task,     ompt_callback_implicit_task_t,      7) /* implicit task                   */ \
+                                                                                                                         \
+    macro (ompt_callback_target,            ompt_callback_target_t,             8) /* target                          */ \
+    macro (ompt_callback_target_data_op,    ompt_callback_target_data_op_t,     9) /* target data op                  */ \
+    macro (ompt_callback_target_submit,     ompt_callback_target_submit_t,     10) /* target submit                   */ \
+                                                                                                                         \
+    macro (ompt_callback_control_tool,      ompt_callback_control_tool_t,      11) /* control tool                    */ \
+                                                                                                                         \
+    macro (ompt_callback_device_initialize, ompt_callback_device_initialize_t, 12) /* device initialize               */ \
+    macro (ompt_callback_device_finalize,   ompt_callback_device_finalize_t,   13) /* device finalize                 */ \
+                                                                                                                         \
+    macro (ompt_callback_device_load,       ompt_callback_device_load_t,       14) /* device load                     */ \
+    macro (ompt_callback_device_unload,     ompt_callback_device_unload_t,     15) /* device unload                   */ \
+                                                                                                                         \
+    /* Optional Events */                                                                                                \
+    macro (ompt_callback_sync_region_wait,  ompt_callback_sync_region_t,       16) /* sync region wait begin or end   */ \
+                                                                                                                         \
+    macro (ompt_callback_mutex_released,    ompt_callback_mutex_t,             17) /* mutex released                  */ \
+                                                                                                                         \
+    macro (ompt_callback_dependences,       ompt_callback_dependences_t,       18) /* report task dependences         */ \
+    macro (ompt_callback_task_dependence,   ompt_callback_task_dependence_t,   19) /* report task dependence          */ \
+                                                                                                                         \
+    macro (ompt_callback_work,              ompt_callback_work_t,              20) /* task at work begin or end       */ \
+                                                                                                                         \
+    macro (ompt_callback_master,            ompt_callback_master_t,            21) /* task at master begin or end     */ \
+                                                                                                                         \
+    macro (ompt_callback_target_map,        ompt_callback_target_map_t,        22) /* target map                      */ \
+                                                                                                                         \
+    macro (ompt_callback_sync_region,       ompt_callback_sync_region_t,       23) /* sync region begin or end        */ \
+                                                                                                                         \
+    macro (ompt_callback_lock_init,         ompt_callback_mutex_acquire_t,     24) /* lock init                       */ \
+    macro (ompt_callback_lock_destroy,      ompt_callback_mutex_t,             25) /* lock destroy                    */ \
+                                                                                                                         \
+    macro (ompt_callback_mutex_acquire,     ompt_callback_mutex_acquire_t,     26) /* mutex acquire                   */ \
+    macro (ompt_callback_mutex_acquired,    ompt_callback_mutex_t,             27) /* mutex acquired                  */ \
+                                                                                                                         \
+    macro (ompt_callback_nest_lock,         ompt_callback_nest_lock_t,         28) /* nest lock                       */ \
+                                                                                                                         \
+    macro (ompt_callback_flush,             ompt_callback_flush_t,             29) /* after executing flush           */ \
+                                                                                                                         \
+    macro (ompt_callback_cancel,            ompt_callback_cancel_t,            30) /* cancel innermost binding region */ \
+                                                                                                                         \
+    macro (ompt_callback_reduction,         ompt_callback_sync_region_t,       31) /* reduction                       */ \
+                                                                                                                         \
+    macro (ompt_callback_dispatch,          ompt_callback_dispatch_t,          32) /* dispatch of work                */
+
+/*****************************************************************************
+ * implementation specific types
+ *****************************************************************************/
+
+typedef enum kmp_mutex_impl_t { /* enumerators are generated from FOREACH_KMP_MUTEX_IMPL; each expands to "impl = code," */
+#define kmp_mutex_impl_macro(impl, code) impl = code,
+    FOREACH_KMP_MUTEX_IMPL(kmp_mutex_impl_macro)
+#undef kmp_mutex_impl_macro
+} kmp_mutex_impl_t;
+
+/*****************************************************************************
+ * definitions generated from spec
+ *****************************************************************************/
+
+typedef enum ompt_callbacks_t { /* callback identifiers 1..32; same names and codes as the entries of FOREACH_OMPT_EVENT */
+  ompt_callback_thread_begin             = 1,
+  ompt_callback_thread_end               = 2,
+  ompt_callback_parallel_begin           = 3,
+  ompt_callback_parallel_end             = 4,
+  ompt_callback_task_create              = 5,
+  ompt_callback_task_schedule            = 6,
+  ompt_callback_implicit_task            = 7,
+  ompt_callback_target                   = 8,
+  ompt_callback_target_data_op           = 9,
+  ompt_callback_target_submit            = 10,
+  ompt_callback_control_tool             = 11,
+  ompt_callback_device_initialize        = 12,
+  ompt_callback_device_finalize          = 13,
+  ompt_callback_device_load              = 14,
+  ompt_callback_device_unload            = 15,
+  ompt_callback_sync_region_wait         = 16,
+  ompt_callback_mutex_released           = 17,
+  ompt_callback_dependences              = 18,
+  ompt_callback_task_dependence          = 19,
+  ompt_callback_work                     = 20,
+  ompt_callback_master                   = 21,
+  ompt_callback_target_map               = 22,
+  ompt_callback_sync_region              = 23,
+  ompt_callback_lock_init                = 24,
+  ompt_callback_lock_destroy             = 25,
+  ompt_callback_mutex_acquire            = 26,
+  ompt_callback_mutex_acquired           = 27,
+  ompt_callback_nest_lock                = 28,
+  ompt_callback_flush                    = 29,
+  ompt_callback_cancel                   = 30,
+  ompt_callback_reduction                = 31,
+  ompt_callback_dispatch                 = 32
+} ompt_callbacks_t;
+
+typedef enum ompt_record_t { /* record kinds, numbered consecutively from 1 */
+  ompt_record_ompt               = 1,
+  ompt_record_native             = 2,
+  ompt_record_invalid            = 3
+} ompt_record_t;
+
+typedef enum ompt_record_native_t { /* native record sub-kinds: info (1) or event (2) */
+  ompt_record_native_info  = 1,
+  ompt_record_native_event = 2
+} ompt_record_native_t;
+
+typedef enum ompt_set_result_t { /* result codes 0..5; unlike most enums in this header, numbering starts at 0 (ompt_set_error) */
+  ompt_set_error            = 0,
+  ompt_set_never            = 1,
+  ompt_set_impossible       = 2,
+  ompt_set_sometimes        = 3,
+  ompt_set_sometimes_paired = 4,
+  ompt_set_always           = 5
+} ompt_set_result_t;
+
+typedef uint64_t ompt_id_t; /* alias of uint64_t (from <stdint.h>) */
+
+typedef uint64_t ompt_device_time_t; /* alias of uint64_t */
+
+typedef uint64_t ompt_buffer_cursor_t; /* alias of uint64_t */
+
+typedef enum ompt_thread_t { /* thread kinds, numbered consecutively from 1 */
+  ompt_thread_initial                 = 1,
+  ompt_thread_worker                  = 2,
+  ompt_thread_other                   = 3,
+  ompt_thread_unknown                 = 4
+} ompt_thread_t;
+
+typedef enum ompt_scope_endpoint_t { /* the two endpoints of a scope: begin (1) and end (2) */
+  ompt_scope_begin                    = 1,
+  ompt_scope_end                      = 2
+} ompt_scope_endpoint_t;
+
+typedef enum ompt_dispatch_t { /* dispatch kinds: iteration (1) or section (2) */
+  ompt_dispatch_iteration             = 1,
+  ompt_dispatch_section               = 2
+} ompt_dispatch_t;
+
+typedef enum ompt_sync_region_t { /* synchronization region kinds, numbered consecutively from 1 */
+  ompt_sync_region_barrier                = 1,
+  ompt_sync_region_barrier_implicit       = 2,
+  ompt_sync_region_barrier_explicit       = 3,
+  ompt_sync_region_barrier_implementation = 4,
+  ompt_sync_region_taskwait               = 5,
+  ompt_sync_region_taskgroup              = 6,
+  ompt_sync_region_reduction              = 7
+} ompt_sync_region_t;
+
+typedef enum ompt_target_data_op_t { /* target data operation kinds, numbered consecutively from 1 */
+  ompt_target_data_alloc                = 1,
+  ompt_target_data_transfer_to_device   = 2,
+  ompt_target_data_transfer_from_device = 3,
+  ompt_target_data_delete               = 4,
+  ompt_target_data_associate            = 5,
+  ompt_target_data_disassociate         = 6
+} ompt_target_data_op_t;
+
+typedef enum ompt_work_t { /* work construct kinds, numbered consecutively from 1 */
+  ompt_work_loop               = 1,
+  ompt_work_sections           = 2,
+  ompt_work_single_executor    = 3,
+  ompt_work_single_other       = 4,
+  ompt_work_workshare          = 5,
+  ompt_work_distribute         = 6,
+  ompt_work_taskloop           = 7
+} ompt_work_t;
+
+typedef enum ompt_mutex_t { /* mutex kinds, numbered consecutively from 1 */
+  ompt_mutex_lock                     = 1,
+  ompt_mutex_test_lock                = 2,
+  ompt_mutex_nest_lock                = 3,
+  ompt_mutex_test_nest_lock           = 4,
+  ompt_mutex_critical                 = 5,
+  ompt_mutex_atomic                   = 6,
+  ompt_mutex_ordered                  = 7
+} ompt_mutex_t;
+
+typedef enum ompt_native_mon_flag_t { /* each enumerator is a distinct single bit; together they cover bits 0..7 */
+  ompt_native_data_motion_explicit    = 0x01,
+  ompt_native_data_motion_implicit    = 0x02,
+  ompt_native_kernel_invocation       = 0x04,
+  ompt_native_kernel_execution        = 0x08,
+  ompt_native_driver                  = 0x10,
+  ompt_native_runtime                 = 0x20,
+  ompt_native_overhead                = 0x40,
+  ompt_native_idleness                = 0x80
+} ompt_native_mon_flag_t;
+
+typedef enum ompt_task_flag_t { /* each enumerator is a distinct single bit: bits 0..3 and bits 27..31 */
+  ompt_task_initial                   = 0x00000001,
+  ompt_task_implicit                  = 0x00000002,
+  ompt_task_explicit                  = 0x00000004,
+  ompt_task_target                    = 0x00000008,
+  ompt_task_undeferred                = 0x08000000,
+  ompt_task_untied                    = 0x10000000,
+  ompt_task_final                     = 0x20000000,
+  ompt_task_mergeable                 = 0x40000000,
+  ompt_task_merged                    = 0x80000000 /* NOTE(review): not representable in a 32-bit signed int; ISO C before C23 requires enumerator values to fit in int -- confirm supported compilers accept this */
+} ompt_task_flag_t;
+
+typedef enum ompt_task_status_t { /* task status codes, numbered consecutively from 1 */
+  ompt_task_complete      = 1,
+  ompt_task_yield         = 2,
+  ompt_task_cancel        = 3,
+  ompt_task_detach        = 4,
+  ompt_task_early_fulfill = 5,
+  ompt_task_late_fulfill  = 6,
+  ompt_task_switch        = 7
+} ompt_task_status_t;
+
+typedef enum ompt_target_t { /* target construct kinds, numbered consecutively from 1; note the enumerator ompt_target shares its stem with the type name */
+  ompt_target                         = 1,
+  ompt_target_enter_data              = 2,
+  ompt_target_exit_data               = 3,
+  ompt_target_update                  = 4
+} ompt_target_t;
+
+typedef enum ompt_parallel_flag_t { /* each enumerator is a distinct single bit: bits 0, 1, 30 and 31 */
+  ompt_parallel_invoker_program = 0x00000001,
+  ompt_parallel_invoker_runtime = 0x00000002,
+  ompt_parallel_league          = 0x40000000,
+  ompt_parallel_team            = 0x80000000 /* NOTE(review): not representable in a 32-bit signed int; ISO C before C23 requires enumerator values to fit in int -- confirm supported compilers accept this */
+} ompt_parallel_flag_t;
+
+typedef enum ompt_target_map_flag_t { /* each enumerator is a distinct single bit (bits 0..5) */
+  ompt_target_map_flag_to             = 0x01,
+  ompt_target_map_flag_from           = 0x02,
+  ompt_target_map_flag_alloc          = 0x04,
+  ompt_target_map_flag_release        = 0x08,
+  ompt_target_map_flag_delete         = 0x10,
+  ompt_target_map_flag_implicit       = 0x20
+} ompt_target_map_flag_t;
+
+typedef enum ompt_dependence_type_t { /* dependence types, numbered consecutively from 1 */
+  ompt_dependence_type_in              = 1,
+  ompt_dependence_type_out             = 2,
+  ompt_dependence_type_inout           = 3,
+  ompt_dependence_type_mutexinoutset   = 4,
+  ompt_dependence_type_source          = 5,
+  ompt_dependence_type_sink            = 6
+} ompt_dependence_type_t;
+
+typedef enum ompt_cancel_flag_t { /* each enumerator is a distinct single bit (bits 0..6) */
+  ompt_cancel_parallel       = 0x01,
+  ompt_cancel_sections       = 0x02,
+  ompt_cancel_loop           = 0x04,
+  ompt_cancel_taskgroup      = 0x08,
+  ompt_cancel_activated      = 0x10,
+  ompt_cancel_detected       = 0x20,
+  ompt_cancel_discarded_task = 0x40
+} ompt_cancel_flag_t;
+
+typedef uint64_t ompt_hwid_t; /* alias of uint64_t */
+
+typedef uint64_t ompt_wait_id_t; /* alias of uint64_t */
+
+typedef enum ompt_frame_flag_t { /* values are not independent bits: 0x30 equals 0x10 | 0x20, and ompt_frame_runtime is 0 */
+  ompt_frame_runtime        = 0x00,
+  ompt_frame_application    = 0x01,
+  ompt_frame_cfa            = 0x10,
+  ompt_frame_framepointer   = 0x20,
+  ompt_frame_stackaddress   = 0x30
+} ompt_frame_flag_t;
+
+typedef enum ompt_state_t { /* thread states; same names and values as the entries of FOREACH_OMPT_STATE */
+  ompt_state_work_serial                      = 0x000,
+  ompt_state_work_parallel                    = 0x001,
+  ompt_state_work_reduction                   = 0x002,
+
+  ompt_state_wait_barrier                     = 0x010,
+  ompt_state_wait_barrier_implicit_parallel   = 0x011,
+  ompt_state_wait_barrier_implicit_workshare  = 0x012,
+  ompt_state_wait_barrier_implicit            = 0x013,
+  ompt_state_wait_barrier_explicit            = 0x014,
+
+  ompt_state_wait_taskwait                    = 0x020,
+  ompt_state_wait_taskgroup                   = 0x021,
+
+  ompt_state_wait_mutex                       = 0x040,
+  ompt_state_wait_lock                        = 0x041,
+  ompt_state_wait_critical                    = 0x042,
+  ompt_state_wait_atomic                      = 0x043,
+  ompt_state_wait_ordered                     = 0x044,
+
+  ompt_state_wait_target                      = 0x080,
+  ompt_state_wait_target_map                  = 0x081,
+  ompt_state_wait_target_update               = 0x082,
+
+  ompt_state_idle                             = 0x100,
+  ompt_state_overhead                         = 0x101,
+  ompt_state_undefined                        = 0x102
+} ompt_state_t;
+
+typedef uint64_t (*ompt_get_unique_id_t) (void);
+
+typedef uint64_t ompd_size_t;
+
+typedef uint64_t ompd_wait_id_t;
+
+typedef uint64_t ompd_addr_t;
+typedef int64_t  ompd_word_t;
+typedef uint64_t ompd_seg_t;
+
+typedef uint64_t ompd_device_t;
+
+typedef uint64_t ompd_thread_id_t;
+
+typedef enum ompd_scope_t { /* consecutive scope codes 1..6 (no zero value is defined) */
+  ompd_scope_global = 1,
+  ompd_scope_address_space = 2,
+  ompd_scope_thread = 3,
+  ompd_scope_parallel = 4,
+  ompd_scope_implicit_task = 5,
+  ompd_scope_task = 6
+} ompd_scope_t;
+
+typedef uint64_t ompd_icv_id_t;
+
+typedef enum ompd_rc_t { /* OMPD return codes: ompd_rc_ok is 0, the remaining codes are numbered 1..10 */
+  ompd_rc_ok = 0,
+  ompd_rc_unavailable = 1,
+  ompd_rc_stale_handle = 2,
+  ompd_rc_bad_input = 3,
+  ompd_rc_error = 4,
+  ompd_rc_unsupported = 5,
+  ompd_rc_needs_state_tracking = 6,
+  ompd_rc_incompatible = 7,
+  ompd_rc_device_read_error = 8,
+  ompd_rc_device_write_error = 9,
+  ompd_rc_nomem = 10 /* no trailing comma: keeps the header clean for C90/C++03 -pedantic users and matches the other enums in this file */
+} ompd_rc_t;
+
+typedef void (*ompt_interface_fn_t) (void);
+
+typedef ompt_interface_fn_t (*ompt_function_lookup_t) (
+  const char *interface_function_name
+);
+
+typedef union ompt_data_t { /* one storage slot viewed either as a 64-bit integer or as a pointer */
+  uint64_t value;
+  void *ptr;
+} ompt_data_t;
+
+typedef struct ompt_frame_t { /* two frame slots (exit, enter), each with its own int flags word */
+  ompt_data_t exit_frame;
+  ompt_data_t enter_frame;
+  int exit_frame_flags; /* presumably a combination of ompt_frame_flag_t values — confirm */
+  int enter_frame_flags;
+} ompt_frame_t;
+
+typedef void (*ompt_callback_t) (void);
+
+typedef void ompt_device_t;
+
+typedef void ompt_buffer_t;
+
+typedef void (*ompt_callback_buffer_request_t) (
+  int device_num,
+  ompt_buffer_t **buffer,
+  size_t *bytes
+);
+
+typedef void (*ompt_callback_buffer_complete_t) (
+  int device_num,
+  ompt_buffer_t *buffer,
+  size_t bytes,
+  ompt_buffer_cursor_t begin,
+  int buffer_owned
+);
+
+typedef void (*ompt_finalize_t) (
+  ompt_data_t *tool_data
+);
+
+typedef int (*ompt_initialize_t) (
+  ompt_function_lookup_t lookup,
+  int initial_device_num,
+  ompt_data_t *tool_data
+);
+
+typedef struct ompt_start_tool_result_t { /* bundles a tool's initialize/finalize function pointers with its data slot */
+  ompt_initialize_t initialize;
+  ompt_finalize_t finalize;
+  ompt_data_t tool_data; /* both ompt_initialize_t and ompt_finalize_t take an `ompt_data_t *tool_data` argument */
+} ompt_start_tool_result_t;
+
+typedef struct ompt_record_abstract_t { /* pointed to by the return value of ompt_get_record_abstract_t, which takes a `void *native_record` */
+  ompt_record_native_t rclass;
+  const char *type;
+  ompt_device_time_t start_time;
+  ompt_device_time_t end_time;
+  ompt_hwid_t hwid; /* ompt_hwid_t is a uint64_t */
+} ompt_record_abstract_t;
+
+typedef struct ompt_dependence_t { /* one dependence: a data slot plus its ompt_dependence_type_t kind */
+  ompt_data_t variable;
+  ompt_dependence_type_t dependence_type;
+} ompt_dependence_t;
+
+typedef int (*ompt_enumerate_states_t) (
+  int current_state,
+  int *next_state,
+  const char **next_state_name
+);
+
+typedef int (*ompt_enumerate_mutex_impls_t) (
+  int current_impl,
+  int *next_impl,
+  const char **next_impl_name
+);
+
+typedef ompt_set_result_t (*ompt_set_callback_t) (
+  ompt_callbacks_t event,
+  ompt_callback_t callback
+);
+
+typedef int (*ompt_get_callback_t) (
+  ompt_callbacks_t event,
+  ompt_callback_t *callback
+);
+
+typedef ompt_data_t *(*ompt_get_thread_data_t) (void);
+
+typedef int (*ompt_get_num_procs_t) (void);
+
+typedef int (*ompt_get_num_places_t) (void);
+
+typedef int (*ompt_get_place_proc_ids_t) (
+  int place_num,
+  int ids_size,
+  int *ids
+);
+
+typedef int (*ompt_get_place_num_t) (void);
+
+typedef int (*ompt_get_partition_place_nums_t) (
+  int place_nums_size,
+  int *place_nums
+);
+
+typedef int (*ompt_get_proc_id_t) (void);
+
+typedef int (*ompt_get_state_t) (
+  ompt_wait_id_t *wait_id
+);
+
+typedef int (*ompt_get_parallel_info_t) (
+  int ancestor_level,
+  ompt_data_t **parallel_data,
+  int *team_size
+);
+
+typedef int (*ompt_get_task_info_t) (
+  int ancestor_level,
+  int *flags,
+  ompt_data_t **task_data,
+  ompt_frame_t **task_frame,
+  ompt_data_t **parallel_data,
+  int *thread_num
+);
+
+typedef int (*ompt_get_task_memory_t)(
+  void **addr,
+  size_t *size,
+  int block
+);
+
+typedef int (*ompt_get_target_info_t) (
+  uint64_t *device_num,
+  ompt_id_t *target_id,
+  ompt_id_t *host_op_id
+);
+
+typedef int (*ompt_get_num_devices_t) (void);
+
+typedef void (*ompt_finalize_tool_t) (void);
+
+typedef int (*ompt_get_device_num_procs_t) (
+  ompt_device_t *device
+);
+
+typedef ompt_device_time_t (*ompt_get_device_time_t) (
+  ompt_device_t *device
+);
+
+typedef double (*ompt_translate_time_t) (
+  ompt_device_t *device,
+  ompt_device_time_t time
+);
+
+typedef ompt_set_result_t (*ompt_set_trace_ompt_t) (
+  ompt_device_t *device,
+  unsigned int enable,
+  unsigned int etype
+);
+
+typedef ompt_set_result_t (*ompt_set_trace_native_t) (
+  ompt_device_t *device,
+  int enable,
+  int flags
+);
+
+typedef int (*ompt_start_trace_t) (
+  ompt_device_t *device,
+  ompt_callback_buffer_request_t request,
+  ompt_callback_buffer_complete_t complete
+);
+
+typedef int (*ompt_pause_trace_t) (
+  ompt_device_t *device,
+  int begin_pause
+);
+
+typedef int (*ompt_flush_trace_t) (
+  ompt_device_t *device
+);
+
+typedef int (*ompt_stop_trace_t) (
+  ompt_device_t *device
+);
+
+typedef int (*ompt_advance_buffer_cursor_t) (
+  ompt_device_t *device,
+  ompt_buffer_t *buffer,
+  size_t size,
+  ompt_buffer_cursor_t current,
+  ompt_buffer_cursor_t *next
+);
+
+typedef ompt_record_t (*ompt_get_record_type_t) (
+  ompt_buffer_t *buffer,
+  ompt_buffer_cursor_t current
+);
+
+typedef void *(*ompt_get_record_native_t) (
+  ompt_buffer_t *buffer,
+  ompt_buffer_cursor_t current,
+  ompt_id_t *host_op_id
+);
+
+typedef ompt_record_abstract_t *
+(*ompt_get_record_abstract_t) (
+  void *native_record
+);
+
+typedef void (*ompt_callback_thread_begin_t) (
+  ompt_thread_t thread_type,
+  ompt_data_t *thread_data
+);
+
+typedef struct ompt_record_thread_begin_t {
+  ompt_thread_t thread_type;
+} ompt_record_thread_begin_t;
+
+typedef void (*ompt_callback_thread_end_t) (
+  ompt_data_t *thread_data
+);
+
+typedef void (*ompt_callback_parallel_begin_t) (
+  ompt_data_t *encountering_task_data,
+  const ompt_frame_t *encountering_task_frame,
+  ompt_data_t *parallel_data,
+  unsigned int requested_parallelism,
+  int flags,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_parallel_begin_t {
+  ompt_id_t encountering_task_id;
+  ompt_id_t parallel_id;
+  unsigned int requested_parallelism;
+  int flags;
+  const void *codeptr_ra;
+} ompt_record_parallel_begin_t;
+
+typedef void (*ompt_callback_parallel_end_t) (
+  ompt_data_t *parallel_data,
+  ompt_data_t *encountering_task_data,
+  int flags,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_parallel_end_t {
+  ompt_id_t parallel_id;
+  ompt_id_t encountering_task_id;
+  int flags;
+  const void *codeptr_ra;
+} ompt_record_parallel_end_t;
+
+typedef void (*ompt_callback_work_t) (
+  ompt_work_t wstype,
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  uint64_t count,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_work_t {
+  ompt_work_t wstype;
+  ompt_scope_endpoint_t endpoint;
+  ompt_id_t parallel_id;
+  ompt_id_t task_id;
+  uint64_t count;
+  const void *codeptr_ra;
+} ompt_record_work_t;
+
+typedef void (*ompt_callback_dispatch_t) (
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  ompt_dispatch_t kind,
+  ompt_data_t instance
+);
+
+typedef struct ompt_record_dispatch_t {
+  ompt_id_t parallel_id;
+  ompt_id_t task_id;
+  ompt_dispatch_t kind;
+  ompt_data_t instance;
+} ompt_record_dispatch_t;
+
+typedef void (*ompt_callback_task_create_t) (
+  ompt_data_t *encountering_task_data,
+  const ompt_frame_t *encountering_task_frame,
+  ompt_data_t *new_task_data,
+  int flags,
+  int has_dependences,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_task_create_t {
+  ompt_id_t encountering_task_id;
+  ompt_id_t new_task_id;
+  int flags;
+  int has_dependences;
+  const void *codeptr_ra;
+} ompt_record_task_create_t;
+
+typedef void (*ompt_callback_dependences_t) (
+  ompt_data_t *task_data,
+  const ompt_dependence_t *deps,
+  int ndeps
+);
+
+typedef struct ompt_record_dependences_t { /* record form of ompt_callback_dependences_t; the task is named by ompt_id_t rather than ompt_data_t* */
+  ompt_id_t task_id;
+  ompt_dependence_t dep; /* NOTE(review): a single dependence by value, whereas the callback takes `const ompt_dependence_t *deps` — confirm how ndeps > 1 is recorded */
+  int ndeps;
+} ompt_record_dependences_t;
+
+typedef void (*ompt_callback_task_dependence_t) (
+  ompt_data_t *src_task_data,
+  ompt_data_t *sink_task_data
+);
+
+typedef struct ompt_record_task_dependence_t {
+  ompt_id_t src_task_id;
+  ompt_id_t sink_task_id;
+} ompt_record_task_dependence_t;
+
+typedef void (*ompt_callback_task_schedule_t) (
+  ompt_data_t *prior_task_data,
+  ompt_task_status_t prior_task_status,
+  ompt_data_t *next_task_data
+);
+
+typedef struct ompt_record_task_schedule_t {
+  ompt_id_t prior_task_id;
+  ompt_task_status_t prior_task_status;
+  ompt_id_t next_task_id;
+} ompt_record_task_schedule_t;
+
+typedef void (*ompt_callback_implicit_task_t) (
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  unsigned int actual_parallelism,
+  unsigned int index,
+  int flags
+);
+
+typedef struct ompt_record_implicit_task_t {
+  ompt_scope_endpoint_t endpoint;
+  ompt_id_t parallel_id;
+  ompt_id_t task_id;
+  unsigned int actual_parallelism;
+  unsigned int index;
+  int flags;
+} ompt_record_implicit_task_t;
+
+typedef void (*ompt_callback_master_t) (
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_master_t {
+  ompt_scope_endpoint_t endpoint;
+  ompt_id_t parallel_id;
+  ompt_id_t task_id;
+  const void *codeptr_ra;
+} ompt_record_master_t;
+
+typedef void (*ompt_callback_sync_region_t) (
+  ompt_sync_region_t kind,
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_sync_region_t {
+  ompt_sync_region_t kind;
+  ompt_scope_endpoint_t endpoint;
+  ompt_id_t parallel_id;
+  ompt_id_t task_id;
+  const void *codeptr_ra;
+} ompt_record_sync_region_t;
+
+typedef void (*ompt_callback_mutex_acquire_t) (
+  ompt_mutex_t kind,
+  unsigned int hint,
+  unsigned int impl,
+  ompt_wait_id_t wait_id,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_mutex_acquire_t {
+  ompt_mutex_t kind;
+  unsigned int hint;
+  unsigned int impl;
+  ompt_wait_id_t wait_id;
+  const void *codeptr_ra;
+} ompt_record_mutex_acquire_t;
+
+typedef void (*ompt_callback_mutex_t) (
+  ompt_mutex_t kind,
+  ompt_wait_id_t wait_id,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_mutex_t {
+  ompt_mutex_t kind;
+  ompt_wait_id_t wait_id;
+  const void *codeptr_ra;
+} ompt_record_mutex_t;
+
+typedef void (*ompt_callback_nest_lock_t) (
+  ompt_scope_endpoint_t endpoint,
+  ompt_wait_id_t wait_id,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_nest_lock_t {
+  ompt_scope_endpoint_t endpoint;
+  ompt_wait_id_t wait_id;
+  const void *codeptr_ra;
+} ompt_record_nest_lock_t;
+
+typedef void (*ompt_callback_flush_t) (
+  ompt_data_t *thread_data,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_flush_t { /* record form of ompt_callback_flush_t; carries only codeptr_ra (the callback's thread_data has no field here) */
+  const void *codeptr_ra;
+} ompt_record_flush_t;
+
+typedef void (*ompt_callback_cancel_t) (
+  ompt_data_t *task_data,
+  int flags,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_cancel_t {
+  ompt_id_t task_id;
+  int flags;
+  const void *codeptr_ra;
+} ompt_record_cancel_t;
+
+typedef void (*ompt_callback_device_initialize_t) (
+  int device_num,
+  const char *type,
+  ompt_device_t *device,
+  ompt_function_lookup_t lookup,
+  const char *documentation
+);
+
+typedef void (*ompt_callback_device_finalize_t) (
+  int device_num
+);
+
+typedef void (*ompt_callback_device_load_t) (
+  int device_num,
+  const char *filename,
+  int64_t offset_in_file,
+  void *vma_in_file,
+  size_t bytes,
+  void *host_addr,
+  void *device_addr,
+  uint64_t module_id
+);
+
+typedef void (*ompt_callback_device_unload_t) (
+  int device_num,
+  uint64_t module_id
+);
+
+typedef void (*ompt_callback_target_data_op_t) (
+  ompt_id_t target_id,
+  ompt_id_t host_op_id,
+  ompt_target_data_op_t optype,
+  void *src_addr,
+  int src_device_num,
+  void *dest_addr,
+  int dest_device_num,
+  size_t bytes,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_target_data_op_t { /* record form of ompt_callback_target_data_op_t: no target_id field, plus an extra end_time */
+  ompt_id_t host_op_id;
+  ompt_target_data_op_t optype;
+  void *src_addr;
+  int src_device_num;
+  void *dest_addr;
+  int dest_device_num;
+  size_t bytes;
+  ompt_device_time_t end_time; /* not present in the callback signature */
+  const void *codeptr_ra;
+} ompt_record_target_data_op_t;
+
+typedef void (*ompt_callback_target_t) (
+  ompt_target_t kind,
+  ompt_scope_endpoint_t endpoint,
+  int device_num,
+  ompt_data_t *task_data,
+  ompt_id_t target_id,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_target_t {
+  ompt_target_t kind;
+  ompt_scope_endpoint_t endpoint;
+  int device_num;
+  ompt_id_t task_id;
+  ompt_id_t target_id;
+  const void *codeptr_ra;
+} ompt_record_target_t;
+
+typedef void (*ompt_callback_target_map_t) (
+  ompt_id_t target_id,
+  unsigned int nitems,
+  void **host_addr,
+  void **device_addr,
+  size_t *bytes,
+  unsigned int *mapping_flags,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_target_map_t {
+  ompt_id_t target_id;
+  unsigned int nitems;
+  void **host_addr;
+  void **device_addr;
+  size_t *bytes;
+  unsigned int *mapping_flags;
+  const void *codeptr_ra;
+} ompt_record_target_map_t;
+
+typedef void (*ompt_callback_target_submit_t) (
+  ompt_id_t target_id,
+  ompt_id_t host_op_id,
+  unsigned int requested_num_teams
+);
+
+typedef struct ompt_record_target_kernel_t { /* presumably the record for ompt_callback_target_submit_t (shares host_op_id and requested_num_teams) — confirm */
+  ompt_id_t host_op_id;
+  unsigned int requested_num_teams;
+  unsigned int granted_num_teams; /* record-only field; the callback has no counterpart */
+  ompt_device_time_t end_time;    /* record-only field */
+} ompt_record_target_kernel_t;
+
+typedef int (*ompt_callback_control_tool_t) (
+  uint64_t command,
+  uint64_t modifier,
+  void *arg,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_control_tool_t { /* record form of ompt_callback_control_tool_t; the callback's `void *arg` is not stored */
+  uint64_t command;
+  uint64_t modifier;
+  const void *codeptr_ra;
+} ompt_record_control_tool_t;
+
+typedef struct ompd_address_t { /* a (segment, address) pair; both members are uint64_t typedefs */
+  ompd_seg_t segment; /* ompd_segment_none (0) is defined at the end of this header */
+  ompd_addr_t address;
+} ompd_address_t;
+
+typedef struct ompd_frame_info_t { /* a frame address together with a flag word */
+  ompd_address_t frame_address;
+  ompd_word_t frame_flag; /* ompd_word_t is an int64_t */
+} ompd_frame_info_t;
+
+typedef struct _ompd_aspace_handle ompd_address_space_handle_t;
+typedef struct _ompd_thread_handle ompd_thread_handle_t;
+typedef struct _ompd_parallel_handle ompd_parallel_handle_t;
+typedef struct _ompd_task_handle ompd_task_handle_t;
+
+typedef struct _ompd_aspace_cont ompd_address_space_context_t;
+typedef struct _ompd_thread_cont ompd_thread_context_t;
+
+typedef struct ompd_device_type_sizes_t { /* one uint8_t per basic type; presumably that type's size in bytes on the target device — confirm */
+  uint8_t sizeof_char;
+  uint8_t sizeof_short;
+  uint8_t sizeof_int;
+  uint8_t sizeof_long;
+  uint8_t sizeof_long_long;
+  uint8_t sizeof_pointer;
+} ompd_device_type_sizes_t;
+
+typedef struct ompt_record_ompt_t { /* common header fields followed by a union of all ompt_record_*_t payloads */
+  ompt_callbacks_t type; /* presumably selects which `record` member is valid — confirm */
+  ompt_device_time_t time;
+  ompt_id_t thread_id;
+  ompt_id_t target_id;
+  union { /* one member per event-specific record struct declared earlier in this header */
+    ompt_record_thread_begin_t thread_begin;
+    ompt_record_parallel_begin_t parallel_begin;
+    ompt_record_parallel_end_t parallel_end;
+    ompt_record_work_t work;
+    ompt_record_dispatch_t dispatch;
+    ompt_record_task_create_t task_create;
+    ompt_record_dependences_t dependences;
+    ompt_record_task_dependence_t task_dependence;
+    ompt_record_task_schedule_t task_schedule;
+    ompt_record_implicit_task_t implicit_task;
+    ompt_record_master_t master;
+    ompt_record_sync_region_t sync_region;
+    ompt_record_mutex_acquire_t mutex_acquire;
+    ompt_record_mutex_t mutex;
+    ompt_record_nest_lock_t nest_lock;
+    ompt_record_flush_t flush;
+    ompt_record_cancel_t cancel;
+    ompt_record_target_t target;
+    ompt_record_target_data_op_t target_data_op;
+    ompt_record_target_map_t target_map;
+    ompt_record_target_kernel_t target_kernel;
+    ompt_record_control_tool_t control_tool;
+  } record;
+} ompt_record_ompt_t;
+
+typedef ompt_record_ompt_t *(*ompt_get_record_ompt_t) (
+  ompt_buffer_t *buffer,
+  ompt_buffer_cursor_t current
+);
+
+#define ompt_id_none 0 /* "none" sentinels for the OMPT types; all are 0 except ompt_data_none and ompt_addr_none */
+#define ompt_data_none {0} /* brace initializer: usable only where an initializer is allowed, not as an expression */
+#define ompt_time_none 0
+#define ompt_hwid_none 0
+#define ompt_addr_none ~0 /* NOTE(review): replacement list is not parenthesized; `~0` has type int */
+#define ompt_mutex_impl_none 0
+#define ompt_wait_id_none 0
+
+#define ompd_segment_none 0
+
+#endif /* __OMPT__ */
diff --git a/final/runtime/src/include/omp.h.var b/final/runtime/src/include/omp.h.var
new file mode 100644
index 0000000..2246e70
--- /dev/null
+++ b/final/runtime/src/include/omp.h.var
@@ -0,0 +1,371 @@
+/*
+ * include/omp.h.var
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef __OMP_H
+#   define __OMP_H
+
+#   include <stdlib.h>
+#   include <stdint.h>
+
+#   define KMP_VERSION_MAJOR    @LIBOMP_VERSION_MAJOR@
+#   define KMP_VERSION_MINOR    @LIBOMP_VERSION_MINOR@
+#   define KMP_VERSION_BUILD    @LIBOMP_VERSION_BUILD@
+#   define KMP_BUILD_DATE       "@LIBOMP_BUILD_DATE@"
+
+#   ifdef __cplusplus
+    extern "C" {
+#   endif
+
+#   define omp_set_affinity_format   ompc_set_affinity_format
+#   define omp_get_affinity_format   ompc_get_affinity_format
+#   define omp_display_affinity      ompc_display_affinity
+#   define omp_capture_affinity      ompc_capture_affinity
+
+#   if defined(_WIN32) /* Windows: declarations use __cdecl; __KMP_IMP defaults to dllimport */
+#       define __KAI_KMPC_CONVENTION __cdecl
+#       ifndef __KMP_IMP /* a definition made before including this header takes precedence */
+#           define __KMP_IMP __declspec(dllimport)
+#       endif
+#   else /* elsewhere both macros expand to nothing (unless __KMP_IMP was predefined) */
+#       define __KAI_KMPC_CONVENTION
+#       ifndef __KMP_IMP
+#           define __KMP_IMP
+#       endif
+#   endif /* both macros are #undef'd again near the end of this header */
+
+    /* schedule kind constants */
+    typedef enum omp_sched_t { /* kinds are numbered 1..4; omp_sched_monotonic is a separate high bit */
+        omp_sched_static  = 1,
+        omp_sched_dynamic = 2,
+        omp_sched_guided  = 3,
+        omp_sched_auto    = 4,
+        omp_sched_monotonic = 0x80000000 /* bit 31; NOTE(review): does not fit in a signed 32-bit int — confirm compilers accept it */
+    } omp_sched_t;
+
+    /* set API functions */
+    extern void   __KAI_KMPC_CONVENTION  omp_set_num_threads (int);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_dynamic     (int);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_nested      (int);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_max_active_levels (int);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_schedule          (omp_sched_t, int);
+
+    /* query API functions */
+    extern int    __KAI_KMPC_CONVENTION  omp_get_num_threads  (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_dynamic      (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_nested       (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_max_threads  (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_thread_num   (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_num_procs    (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_in_parallel      (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_in_final         (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_active_level        (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_level               (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_ancestor_thread_num (int);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_team_size           (int);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_thread_limit        (void);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_max_active_levels   (void);
+    extern void   __KAI_KMPC_CONVENTION  omp_get_schedule            (omp_sched_t *, int *);
+    extern int    __KAI_KMPC_CONVENTION  omp_get_max_task_priority   (void);
+
+    /* lock API functions */
+    typedef struct omp_lock_t { /* lock object: a single pointer-sized slot; passed by address to the omp_*_lock functions */
+        void * _lk;
+    } omp_lock_t;
+
+    extern void   __KAI_KMPC_CONVENTION  omp_init_lock    (omp_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_lock     (omp_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_unset_lock   (omp_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_destroy_lock (omp_lock_t *);
+    extern int    __KAI_KMPC_CONVENTION  omp_test_lock    (omp_lock_t *);
+
+    /* nested lock API functions */
+    typedef struct omp_nest_lock_t { /* same layout as omp_lock_t but a distinct type; used by the omp_*_nest_lock functions */
+        void * _lk;
+    } omp_nest_lock_t;
+
+    extern void   __KAI_KMPC_CONVENTION  omp_init_nest_lock    (omp_nest_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_set_nest_lock     (omp_nest_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_unset_nest_lock   (omp_nest_lock_t *);
+    extern void   __KAI_KMPC_CONVENTION  omp_destroy_nest_lock (omp_nest_lock_t *);
+    extern int    __KAI_KMPC_CONVENTION  omp_test_nest_lock    (omp_nest_lock_t *);
+
+    /* OpenMP 5.0 Synchronization hints */
+    typedef enum omp_sync_hint_t { /* every omp_lock_hint_* name is an alias with the same value as the omp_sync_hint_* name above it */
+        omp_sync_hint_none           = 0,
+        omp_lock_hint_none           = omp_sync_hint_none,
+        omp_sync_hint_uncontended    = 1,
+        omp_lock_hint_uncontended    = omp_sync_hint_uncontended,
+        omp_sync_hint_contended      = (1<<1),
+        omp_lock_hint_contended      = omp_sync_hint_contended,
+        omp_sync_hint_nonspeculative = (1<<2),
+        omp_lock_hint_nonspeculative = omp_sync_hint_nonspeculative,
+        omp_sync_hint_speculative    = (1<<3),
+        omp_lock_hint_speculative    = omp_sync_hint_speculative,
+        kmp_lock_hint_hle            = (1<<16), /* kmp_-prefixed hints occupy bits 16..18, apart from the omp_ hints in bits 0..3 */
+        kmp_lock_hint_rtm            = (1<<17),
+        kmp_lock_hint_adaptive       = (1<<18)
+    } omp_sync_hint_t;
+
+    /* lock hint type for dynamic user lock */
+    typedef omp_sync_hint_t omp_lock_hint_t;
+
+    /* hinted lock initializers */
+    extern void __KAI_KMPC_CONVENTION omp_init_lock_with_hint(omp_lock_t *, omp_lock_hint_t);
+    extern void __KAI_KMPC_CONVENTION omp_init_nest_lock_with_hint(omp_nest_lock_t *, omp_lock_hint_t);
+
+    /* time API functions */
+    extern double __KAI_KMPC_CONVENTION  omp_get_wtime (void);
+    extern double __KAI_KMPC_CONVENTION  omp_get_wtick (void);
+
+    /* OpenMP 4.0 */
+    extern int  __KAI_KMPC_CONVENTION  omp_get_default_device (void);
+    extern void __KAI_KMPC_CONVENTION  omp_set_default_device (int);
+    extern int  __KAI_KMPC_CONVENTION  omp_is_initial_device (void);
+    extern int  __KAI_KMPC_CONVENTION  omp_get_num_devices (void);
+    extern int  __KAI_KMPC_CONVENTION  omp_get_num_teams (void);
+    extern int  __KAI_KMPC_CONVENTION  omp_get_team_num (void);
+    extern int  __KAI_KMPC_CONVENTION  omp_get_cancellation (void);
+
+    /* OpenMP 4.5 */
+    extern int   __KAI_KMPC_CONVENTION  omp_get_initial_device (void);
+    extern void* __KAI_KMPC_CONVENTION  omp_target_alloc(size_t, int);
+    extern void  __KAI_KMPC_CONVENTION  omp_target_free(void *, int);
+    extern int   __KAI_KMPC_CONVENTION  omp_target_is_present(void *, int);
+    extern int   __KAI_KMPC_CONVENTION  omp_target_memcpy(void *, void *, size_t, size_t, size_t, int, int);
+    extern int   __KAI_KMPC_CONVENTION  omp_target_memcpy_rect(void *, void *, size_t, int, const size_t *,
+                                            const size_t *, const size_t *, const size_t *, const size_t *, int, int);
+    extern int   __KAI_KMPC_CONVENTION  omp_target_associate_ptr(void *, void *, size_t, size_t, int);
+    extern int   __KAI_KMPC_CONVENTION  omp_target_disassociate_ptr(void *, int);
+
+    /* OpenMP 5.0 */
+    extern int   __KAI_KMPC_CONVENTION  omp_get_device_num (void);
+    typedef void * omp_depend_t;
+
+    /* kmp API functions */
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_stacksize          (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_stacksize          (int);
+    extern size_t __KAI_KMPC_CONVENTION  kmp_get_stacksize_s        (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_stacksize_s        (size_t);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_blocktime          (void);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_library            (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_blocktime          (int);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library            (int);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library_serial     (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library_turnaround (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_library_throughput (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_defaults           (char const *);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_disp_num_buffers   (int);
+
+    /* Intel affinity API */
+    typedef void * kmp_affinity_mask_t;
+
+    extern int    __KAI_KMPC_CONVENTION  kmp_set_affinity             (kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_affinity             (kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_affinity_max_proc    (void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_create_affinity_mask     (kmp_affinity_mask_t *);
+    extern void   __KAI_KMPC_CONVENTION  kmp_destroy_affinity_mask    (kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_set_affinity_mask_proc   (int, kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_unset_affinity_mask_proc (int, kmp_affinity_mask_t *);
+    extern int    __KAI_KMPC_CONVENTION  kmp_get_affinity_mask_proc   (int, kmp_affinity_mask_t *);
+
+    /* OpenMP 4.0 affinity API */
+    typedef enum omp_proc_bind_t { /* consecutive codes 0..4; return type of omp_get_proc_bind() */
+        omp_proc_bind_false = 0,
+        omp_proc_bind_true = 1,
+        omp_proc_bind_master = 2,
+        omp_proc_bind_close = 3,
+        omp_proc_bind_spread = 4
+    } omp_proc_bind_t;
+
+    extern omp_proc_bind_t __KAI_KMPC_CONVENTION omp_get_proc_bind (void);
+
+    /* OpenMP 4.5 affinity API */
+    extern int  __KAI_KMPC_CONVENTION omp_get_num_places (void);
+    extern int  __KAI_KMPC_CONVENTION omp_get_place_num_procs (int);
+    extern void __KAI_KMPC_CONVENTION omp_get_place_proc_ids (int, int *);
+    extern int  __KAI_KMPC_CONVENTION omp_get_place_num (void);
+    extern int  __KAI_KMPC_CONVENTION omp_get_partition_num_places (void);
+    extern void __KAI_KMPC_CONVENTION omp_get_partition_place_nums (int *);
+
+    extern void * __KAI_KMPC_CONVENTION  kmp_malloc  (size_t);
+    extern void * __KAI_KMPC_CONVENTION  kmp_aligned_malloc  (size_t, size_t);
+    extern void * __KAI_KMPC_CONVENTION  kmp_calloc  (size_t, size_t);
+    extern void * __KAI_KMPC_CONVENTION  kmp_realloc (void *, size_t);
+    extern void   __KAI_KMPC_CONVENTION  kmp_free    (void *);
+
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_warnings_on(void);
+    extern void   __KAI_KMPC_CONVENTION  kmp_set_warnings_off(void);
+
+    /* OpenMP 5.0 Tool Control */
+    typedef enum omp_control_tool_result_t { /* negative: notool / nocallback; 0: success; 1: ignored */
+        omp_control_tool_notool = -2,
+        omp_control_tool_nocallback = -1,
+        omp_control_tool_success = 0,
+        omp_control_tool_ignored = 1
+    } omp_control_tool_result_t; /* NOTE(review): omp_control_tool() is declared returning int, not this enum */
+
+    typedef enum omp_control_tool_t { /* consecutive command codes 1..4; presumably the first (int) argument of omp_control_tool() — confirm */
+        omp_control_tool_start = 1,
+        omp_control_tool_pause = 2,
+        omp_control_tool_flush = 3,
+        omp_control_tool_end = 4
+    } omp_control_tool_t;
+
+    extern int __KAI_KMPC_CONVENTION omp_control_tool(int, int, void*);
+
+    /* OpenMP 5.0 Memory Management */
+    typedef uintptr_t omp_uintptr_t;
+
+    typedef enum {
+        OMP_ATK_THREADMODEL = 1,
+        OMP_ATK_ALIGNMENT = 2,
+        OMP_ATK_ACCESS = 3,
+        OMP_ATK_POOL_SIZE = 4,
+        OMP_ATK_FALLBACK = 5,
+        OMP_ATK_FB_DATA = 6,
+        OMP_ATK_PINNED = 7,
+        OMP_ATK_PARTITION = 8
+    } omp_alloctrait_key_t;
+
+    typedef enum {
+        OMP_ATV_FALSE = 0,
+        OMP_ATV_TRUE = 1,
+        OMP_ATV_DEFAULT = 2,
+        OMP_ATV_CONTENDED = 3,
+        OMP_ATV_UNCONTENDED = 4,
+        OMP_ATV_SEQUENTIAL = 5,
+        OMP_ATV_PRIVATE = 6,
+        OMP_ATV_ALL = 7,
+        OMP_ATV_THREAD = 8,
+        OMP_ATV_PTEAM = 9,
+        OMP_ATV_CGROUP = 10,
+        OMP_ATV_DEFAULT_MEM_FB = 11,
+        OMP_ATV_NULL_FB = 12,
+        OMP_ATV_ABORT_FB = 13,
+        OMP_ATV_ALLOCATOR_FB = 14,
+        OMP_ATV_ENVIRONMENT = 15,
+        OMP_ATV_NEAREST = 16,
+        OMP_ATV_BLOCKED = 17,
+        OMP_ATV_INTERLEAVED = 18
+    } omp_alloctrait_value_t;
+
+    typedef struct { /* one allocator trait as a key/value pair; omp_init_allocator() takes an array of these plus a count */
+        omp_alloctrait_key_t key;
+        omp_uintptr_t value; /* pointer-sized integer; presumably holds an omp_alloctrait_value_t or a numeric value, depending on key — confirm */
+    } omp_alloctrait_t;
+
+#   if defined(_WIN32) /* Windows: handles are integers and the predefined ones are imported constants */
+    // On Windows cl and icl do not support 64-bit enum, let's use integer then.
+    typedef omp_uintptr_t omp_allocator_handle_t;
+    extern __KMP_IMP omp_allocator_handle_t const omp_null_allocator;
+    extern __KMP_IMP omp_allocator_handle_t const omp_default_mem_alloc;
+    extern __KMP_IMP omp_allocator_handle_t const omp_large_cap_mem_alloc;
+    extern __KMP_IMP omp_allocator_handle_t const omp_const_mem_alloc;
+    extern __KMP_IMP omp_allocator_handle_t const omp_high_bw_mem_alloc;
+    extern __KMP_IMP omp_allocator_handle_t const omp_low_lat_mem_alloc;
+    extern __KMP_IMP omp_allocator_handle_t const omp_cgroup_mem_alloc;
+    extern __KMP_IMP omp_allocator_handle_t const omp_pteam_mem_alloc;
+    extern __KMP_IMP omp_allocator_handle_t const omp_thread_mem_alloc;
+    typedef omp_uintptr_t omp_memspace_handle_t;
+    extern __KMP_IMP omp_memspace_handle_t const omp_default_mem_space;
+    extern __KMP_IMP omp_memspace_handle_t const omp_large_cap_mem_space;
+    extern __KMP_IMP omp_memspace_handle_t const omp_const_mem_space;
+    extern __KMP_IMP omp_memspace_handle_t const omp_high_bw_mem_space;
+    extern __KMP_IMP omp_memspace_handle_t const omp_low_lat_mem_space;
+#   else /* other platforms: handles are enums and the predefined ones are enumerators */
+#       if __cplusplus >= 201103 /* C++11 and later: fix the underlying type to omp_uintptr_t */
+    typedef enum omp_allocator_handle_t : omp_uintptr_t
+#       else
+    typedef enum omp_allocator_handle_t
+#       endif
+    {
+      omp_null_allocator = 0,
+      omp_default_mem_alloc = 1,
+      omp_large_cap_mem_alloc = 2,
+      omp_const_mem_alloc = 3,
+      omp_high_bw_mem_alloc = 4,
+      omp_low_lat_mem_alloc = 5,
+      omp_cgroup_mem_alloc = 6,
+      omp_pteam_mem_alloc = 7,
+      omp_thread_mem_alloc = 8,
+      KMP_ALLOCATOR_MAX_HANDLE = UINTPTR_MAX /* presumably forces the enum to be wide enough for any uintptr_t value — confirm */
+    } omp_allocator_handle_t;
+#       if __cplusplus >= 201103 /* same fixed-underlying-type treatment for the memory-space handle */
+    typedef enum omp_memspace_handle_t : omp_uintptr_t
+#       else
+    typedef enum omp_memspace_handle_t
+#       endif
+    {
+      omp_default_mem_space = 0,
+      omp_large_cap_mem_space = 1,
+      omp_const_mem_space = 2,
+      omp_high_bw_mem_space = 3,
+      omp_low_lat_mem_space = 4,
+      KMP_MEMSPACE_MAX_HANDLE = UINTPTR_MAX /* presumably the same width-forcing sentinel as KMP_ALLOCATOR_MAX_HANDLE — confirm */
+    } omp_memspace_handle_t;
+#   endif
+    extern omp_allocator_handle_t __KAI_KMPC_CONVENTION omp_init_allocator(omp_memspace_handle_t m,
+                                                       int ntraits, omp_alloctrait_t traits[]);
+    extern void __KAI_KMPC_CONVENTION omp_destroy_allocator(omp_allocator_handle_t allocator);
+
+    extern void __KAI_KMPC_CONVENTION omp_set_default_allocator(omp_allocator_handle_t a);
+    extern omp_allocator_handle_t __KAI_KMPC_CONVENTION omp_get_default_allocator(void);
+#   ifdef __cplusplus /* C++: the allocator argument defaults to omp_null_allocator */
+    extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a = omp_null_allocator);
+    extern void __KAI_KMPC_CONVENTION omp_free(void * ptr, omp_allocator_handle_t a = omp_null_allocator);
+#   else /* C has no default arguments, so the allocator must always be passed */
+    extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a);
+    extern void __KAI_KMPC_CONVENTION omp_free(void *ptr, omp_allocator_handle_t a);
+#   endif
+
+    /* OpenMP 5.0 Affinity Format */
+    extern void __KAI_KMPC_CONVENTION omp_set_affinity_format(char const *);
+    extern size_t __KAI_KMPC_CONVENTION omp_get_affinity_format(char *, size_t);
+    extern void __KAI_KMPC_CONVENTION omp_display_affinity(char const *);
+    extern size_t __KAI_KMPC_CONVENTION omp_capture_affinity(char *, size_t, char const *);
+
+    /* OpenMP 5.0 events */
+#   if defined(_WIN32) /* Windows: plain pointer-sized integer handle */
+    // On Windows cl and icl do not support 64-bit enum, let's use integer then.
+    typedef omp_uintptr_t omp_event_handle_t;
+#   else /* elsewhere: an enum whose only enumerator is UINTPTR_MAX (presumably to force a pointer-sized range — confirm) */
+    typedef enum omp_event_handle_t { KMP_EVENT_MAX_HANDLE = UINTPTR_MAX } omp_event_handle_t;
+#   endif
+    extern void __KAI_KMPC_CONVENTION omp_fulfill_event ( omp_event_handle_t event );
+
+    /* OpenMP 5.0 Pause Resources */
+    typedef enum omp_pause_resource_t { /* consecutive codes 0..2; first argument of omp_pause_resource() and omp_pause_resource_all() */
+      omp_pause_resume = 0,
+      omp_pause_soft = 1,
+      omp_pause_hard = 2
+    } omp_pause_resource_t;
+    extern int __KAI_KMPC_CONVENTION omp_pause_resource(omp_pause_resource_t, int);
+    extern int __KAI_KMPC_CONVENTION omp_pause_resource_all(omp_pause_resource_t);
+
+    extern int __KAI_KMPC_CONVENTION omp_get_supported_active_levels(void);
+
+#   undef __KAI_KMPC_CONVENTION
+#   undef __KMP_IMP
+
+    /* Warning:
+       The following typedefs are non-standard and deprecated; they will be removed in a future release.
+    */
+    typedef int     omp_int_t;
+    typedef double  omp_wtime_t;
+
+#   ifdef __cplusplus
+    }
+#   endif
+
+#endif /* __OMP_H */
diff --git a/final/runtime/src/include/omp_lib.f.var b/final/runtime/src/include/omp_lib.f.var
new file mode 100644
index 0000000..19f14d7
--- /dev/null
+++ b/final/runtime/src/include/omp_lib.f.var
@@ -0,0 +1,1047 @@
+! include/omp_lib.f.var
+
+!
+!//===----------------------------------------------------------------------===//
+!//
+!// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+!// See https://llvm.org/LICENSE.txt for license information.
+!// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+!//
+!//===----------------------------------------------------------------------===//
+!
+
+!***
+!*** Some of the declarations and directives in this file extend past column 72,
+!*** so process this file in 132-column mode.
+!***
+
+!dec$ fixedformlinesize:132
+
+      module omp_lib_kinds
+!       Base kinds, followed by the kinds that are aliases of omp_integer_kind.
+        integer, parameter :: omp_integer_kind             = 4
+        integer, parameter :: omp_logical_kind             = 4
+        integer, parameter :: omp_real_kind                = 4
+        integer, parameter :: omp_sched_kind               = omp_integer_kind
+        integer, parameter :: omp_proc_bind_kind           = omp_integer_kind
+        integer, parameter :: kmp_cancel_kind              = omp_integer_kind
+        integer, parameter :: omp_lock_hint_kind           = omp_integer_kind
+        integer, parameter :: omp_control_tool_kind        = omp_integer_kind
+        integer, parameter :: omp_control_tool_result_kind = omp_integer_kind
+        integer, parameter :: omp_alloctrait_key_kind      = omp_integer_kind
+        integer, parameter :: omp_pause_resource_kind      = omp_integer_kind
+!       Kinds sized by int_ptr_kind().
+        integer, parameter :: omp_lock_kind                = int_ptr_kind()
+        integer, parameter :: omp_nest_lock_kind           = int_ptr_kind()
+        integer, parameter :: kmp_pointer_kind             = int_ptr_kind()
+        integer, parameter :: kmp_size_t_kind              = int_ptr_kind()
+        integer, parameter :: kmp_affinity_mask_kind       = int_ptr_kind()
+        integer, parameter :: omp_allocator_handle_kind    = int_ptr_kind()
+        integer, parameter :: omp_memspace_handle_kind     = int_ptr_kind()
+        integer, parameter :: omp_alloctrait_val_kind      = int_ptr_kind()
+        integer, parameter :: omp_depend_kind              = int_ptr_kind()
+        integer, parameter :: omp_event_handle_kind        = int_ptr_kind()
+!       Allocator trait record: a key of omp_alloctrait_key_kind and a value of omp_alloctrait_val_kind.
+        type omp_alloctrait
+          integer (kind=omp_alloctrait_key_kind) :: key
+          integer (kind=omp_alloctrait_val_kind) :: value
+        end type omp_alloctrait
+
+      end module omp_lib_kinds
+
+      module omp_lib
+
+        use omp_lib_kinds
+
+        integer (kind=omp_integer_kind), parameter :: kmp_version_major = @LIBOMP_VERSION_MAJOR@
+        integer (kind=omp_integer_kind), parameter :: kmp_version_minor = @LIBOMP_VERSION_MINOR@
+        integer (kind=omp_integer_kind), parameter :: kmp_version_build = @LIBOMP_VERSION_BUILD@
+        character(*), parameter :: kmp_build_date    = '@LIBOMP_BUILD_DATE@'
+        integer (kind=omp_integer_kind), parameter :: openmp_version    = @LIBOMP_OMP_YEAR_MONTH@
+
+        integer(kind=omp_sched_kind), parameter :: omp_sched_static  = 1
+        integer(kind=omp_sched_kind), parameter :: omp_sched_dynamic = 2
+        integer(kind=omp_sched_kind), parameter :: omp_sched_guided  = 3
+        integer(kind=omp_sched_kind), parameter :: omp_sched_auto    = 4
+        integer(kind=omp_sched_kind), parameter :: omp_sched_monotonic = int(Z'80000000', kind=omp_sched_kind) ! BOZ via INT()
+
+        integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_false = 0
+        integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_true = 1
+        integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_master = 2
+        integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_close = 3
+        integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_spread = 4
+
+        integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_parallel = 1
+        integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_loop = 2
+        integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_sections = 3
+        integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_taskgroup = 4
+
+        integer (kind=omp_lock_hint_kind), parameter :: omp_lock_hint_none           = 0
+        integer (kind=omp_lock_hint_kind), parameter :: omp_lock_hint_uncontended    = 1
+        integer (kind=omp_lock_hint_kind), parameter :: omp_lock_hint_contended      = 2
+        integer (kind=omp_lock_hint_kind), parameter :: omp_lock_hint_nonspeculative = 4
+        integer (kind=omp_lock_hint_kind), parameter :: omp_lock_hint_speculative    = 8
+        integer (kind=omp_lock_hint_kind), parameter :: kmp_lock_hint_hle            = 65536
+        integer (kind=omp_lock_hint_kind), parameter :: kmp_lock_hint_rtm            = 131072
+        integer (kind=omp_lock_hint_kind), parameter :: kmp_lock_hint_adaptive       = 262144
+
+        integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_threadmodel = 1
+        integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_alignment = 2
+        integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_access = 3
+        integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_pool_size = 4
+        integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_fallback = 5
+        integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_fb_data = 6
+        integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_pinned = 7
+        integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_partition = 8
+
+        integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_false = 0
+        integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_true = 1
+        integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_default = 2
+        integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_contended = 3
+        integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_uncontended = 4
+        integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_sequential = 5
+        integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_private = 6
+        integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_all = 7
+        integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_thread = 8
+        integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_pteam = 9
+        integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_cgroup = 10
+        integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_default_mem_fb = 11
+        integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_null_fb = 12
+        integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_abort_fb = 13
+        integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_allocator_fb = 14
+        integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_environment = 15
+        integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_nearest = 16
+        integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_blocked = 17
+        integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_interleaved = 18
+
+        integer (kind=omp_allocator_handle_kind), parameter :: omp_null_allocator = 0
+        integer (kind=omp_allocator_handle_kind), parameter :: omp_default_mem_alloc = 1
+        integer (kind=omp_allocator_handle_kind), parameter :: omp_large_cap_mem_alloc = 2
+        integer (kind=omp_allocator_handle_kind), parameter :: omp_const_mem_alloc = 3
+        integer (kind=omp_allocator_handle_kind), parameter :: omp_high_bw_mem_alloc = 4
+        integer (kind=omp_allocator_handle_kind), parameter :: omp_low_lat_mem_alloc = 5
+        integer (kind=omp_allocator_handle_kind), parameter :: omp_cgroup_mem_alloc = 6
+        integer (kind=omp_allocator_handle_kind), parameter :: omp_pteam_mem_alloc = 7
+        integer (kind=omp_allocator_handle_kind), parameter :: omp_thread_mem_alloc = 8
+
+        integer (kind=omp_memspace_handle_kind), parameter :: omp_default_mem_space = 0
+        integer (kind=omp_memspace_handle_kind), parameter :: omp_large_cap_mem_space = 1
+        integer (kind=omp_memspace_handle_kind), parameter :: omp_const_mem_space = 2
+        integer (kind=omp_memspace_handle_kind), parameter :: omp_high_bw_mem_space = 3
+        integer (kind=omp_memspace_handle_kind), parameter :: omp_low_lat_mem_space = 4
+
+        integer (kind=omp_pause_resource_kind), parameter :: omp_pause_resume = 0
+        integer (kind=omp_pause_resource_kind), parameter :: omp_pause_soft = 1
+        integer (kind=omp_pause_resource_kind), parameter :: omp_pause_hard = 2
+
+        interface
+
+!         ***
+!         *** omp_* entry points
+!         ***
+
+          subroutine omp_set_num_threads(num_threads)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) num_threads
+          end subroutine omp_set_num_threads
+
+          subroutine omp_set_dynamic(dynamic_threads)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) dynamic_threads
+          end subroutine omp_set_dynamic
+
+          subroutine omp_set_nested(nested)
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) nested
+          end subroutine omp_set_nested
+
+          function omp_get_num_threads()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_num_threads
+          end function omp_get_num_threads
+
+          function omp_get_max_threads()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_max_threads
+          end function omp_get_max_threads
+
+          function omp_get_thread_num()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_thread_num
+          end function omp_get_thread_num
+
+          function omp_get_num_procs()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_num_procs
+          end function omp_get_num_procs
+
+          function omp_in_parallel()
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_in_parallel
+          end function omp_in_parallel
+
+          function omp_in_final()
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_in_final
+          end function omp_in_final
+
+          function omp_get_dynamic()
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_get_dynamic
+          end function omp_get_dynamic
+
+          function omp_get_nested()
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_get_nested
+          end function omp_get_nested
+
+          function omp_get_thread_limit()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_thread_limit
+          end function omp_get_thread_limit
+
+          subroutine omp_set_max_active_levels(max_levels)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) max_levels
+          end subroutine omp_set_max_active_levels
+
+          function omp_get_max_active_levels()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_max_active_levels
+          end function omp_get_max_active_levels
+
+          function omp_get_level()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_level
+          end function omp_get_level
+
+          function omp_get_active_level()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_active_level
+          end function omp_get_active_level
+
+          function omp_get_ancestor_thread_num(level)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) level
+            integer (kind=omp_integer_kind) omp_get_ancestor_thread_num
+          end function omp_get_ancestor_thread_num
+
+          function omp_get_team_size(level)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) level
+            integer (kind=omp_integer_kind) omp_get_team_size
+          end function omp_get_team_size
+
+          subroutine omp_set_schedule(kind, chunk_size)
+            use omp_lib_kinds
+            integer (kind=omp_sched_kind) kind
+            integer (kind=omp_integer_kind) chunk_size
+          end subroutine omp_set_schedule
+
+          subroutine omp_get_schedule(kind, chunk_size)
+            use omp_lib_kinds
+            integer (kind=omp_sched_kind) kind
+            integer (kind=omp_integer_kind) chunk_size
+          end subroutine omp_get_schedule
+
+          function omp_get_proc_bind()
+            use omp_lib_kinds
+            integer (kind=omp_proc_bind_kind) omp_get_proc_bind
+          end function omp_get_proc_bind
+
+          function omp_get_num_places()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_num_places
+          end function omp_get_num_places
+
+          function omp_get_place_num_procs(place_num)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) place_num
+            integer (kind=omp_integer_kind) omp_get_place_num_procs
+          end function omp_get_place_num_procs
+
+          subroutine omp_get_place_proc_ids(place_num, ids)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) place_num
+            integer (kind=omp_integer_kind) ids(*)
+          end subroutine omp_get_place_proc_ids
+
+          function omp_get_place_num()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_place_num
+          end function omp_get_place_num
+
+          function omp_get_partition_num_places()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_partition_num_places
+          end function omp_get_partition_num_places
+
+          subroutine omp_get_partition_place_nums(place_nums)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) place_nums(*)
+          end subroutine omp_get_partition_place_nums
+
+          function omp_get_wtime()
+            double precision omp_get_wtime
+          end function omp_get_wtime
+
+          function omp_get_wtick ()
+            double precision omp_get_wtick
+          end function omp_get_wtick
+
+          function omp_get_default_device()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_default_device
+          end function omp_get_default_device
+
+          subroutine omp_set_default_device(device_num)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) device_num
+          end subroutine omp_set_default_device
+
+          function omp_get_num_devices()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_num_devices
+          end function omp_get_num_devices
+
+          function omp_get_num_teams()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_num_teams
+          end function omp_get_num_teams
+
+          function omp_get_team_num()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_team_num
+          end function omp_get_team_num
+
+          function omp_get_cancellation()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_cancellation
+          end function omp_get_cancellation
+
+          function omp_is_initial_device()
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_is_initial_device
+          end function omp_is_initial_device
+
+          function omp_get_initial_device()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_initial_device
+          end function omp_get_initial_device
+
+          function omp_get_device_num()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_device_num
+          end function omp_get_device_num
+
+          function omp_pause_resource(kind, device_num)
+            use omp_lib_kinds
+            integer (kind=omp_pause_resource_kind) kind
+            integer (kind=omp_integer_kind) device_num
+            integer (kind=omp_integer_kind) omp_pause_resource
+          end function omp_pause_resource
+
+          function omp_pause_resource_all(kind)
+            use omp_lib_kinds
+            integer (kind=omp_pause_resource_kind) kind
+            integer (kind=omp_integer_kind) omp_pause_resource_all
+          end function omp_pause_resource_all
+
+          function omp_get_supported_active_levels()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_supported_active_levels
+          end function omp_get_supported_active_levels
+
+          subroutine omp_fulfill_event(event)
+            use omp_lib_kinds
+            integer (kind=omp_event_handle_kind) event
+          end subroutine omp_fulfill_event
+
+          subroutine omp_init_lock(svar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_init_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) svar
+          end subroutine omp_init_lock
+
+          subroutine omp_destroy_lock(svar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_destroy_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) svar
+          end subroutine omp_destroy_lock
+
+          subroutine omp_set_lock(svar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_set_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) svar
+          end subroutine omp_set_lock
+
+          subroutine omp_unset_lock(svar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_unset_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) svar
+          end subroutine omp_unset_lock
+
+          function omp_test_lock(svar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_test_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            logical (kind=omp_logical_kind) omp_test_lock
+            integer (kind=omp_lock_kind) svar
+          end function omp_test_lock
+
+          subroutine omp_init_nest_lock(nvar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_init_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) nvar
+          end subroutine omp_init_nest_lock
+
+          subroutine omp_destroy_nest_lock(nvar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_destroy_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) nvar
+          end subroutine omp_destroy_nest_lock
+
+          subroutine omp_set_nest_lock(nvar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_set_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) nvar
+          end subroutine omp_set_nest_lock
+
+          subroutine omp_unset_nest_lock(nvar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_unset_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) nvar
+          end subroutine omp_unset_nest_lock
+
+          function omp_test_nest_lock(nvar)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_test_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_test_nest_lock
+            integer (kind=omp_nest_lock_kind) nvar
+          end function omp_test_nest_lock
+
+          function omp_get_max_task_priority()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_max_task_priority
+          end function omp_get_max_task_priority
+
+          function omp_init_allocator(memspace, ntraits, traits)
+            use omp_lib_kinds
+            integer (kind=omp_memspace_handle_kind) :: memspace
+            integer (kind=omp_integer_kind) :: ntraits
+            type (omp_alloctrait), dimension(*), intent(in) :: traits
+            integer (kind=omp_allocator_handle_kind) :: omp_init_allocator
+          end function omp_init_allocator
+
+          subroutine omp_destroy_allocator(allocator) bind(c)
+            use omp_lib_kinds, only : omp_allocator_handle_kind
+            integer (kind=omp_allocator_handle_kind), value :: allocator
+          end subroutine omp_destroy_allocator
+
+          subroutine omp_set_default_allocator(allocator) bind(c)
+            use omp_lib_kinds
+            integer (omp_allocator_handle_kind), value :: allocator   ! pass the handle by value, as omp_destroy_allocator does
+          end subroutine omp_set_default_allocator
+
+          function omp_get_default_allocator() bind(c)
+            use omp_lib_kinds, only : omp_allocator_handle_kind
+            integer (kind=omp_allocator_handle_kind) :: omp_get_default_allocator
+          end function omp_get_default_allocator
+
+          subroutine omp_set_affinity_format(format)
+            character (len=*) :: format   ! assumed-length string argument
+          end subroutine omp_set_affinity_format
+
+          function omp_get_affinity_format(buffer)
+            use omp_lib_kinds, only : kmp_size_t_kind
+            character (len=*) :: buffer
+            integer (kind=kmp_size_t_kind) :: omp_get_affinity_format
+          end function omp_get_affinity_format
+
+          subroutine omp_display_affinity(format)
+            character (len=*) :: format   ! assumed-length string argument
+          end subroutine omp_display_affinity
+
+          function omp_capture_affinity(buffer, format)
+            use omp_lib_kinds, only : kmp_size_t_kind
+            character (len=*) :: buffer
+            character (len=*) :: format
+            integer (kind=kmp_size_t_kind) :: omp_capture_affinity
+          end function omp_capture_affinity
+
+!         ***
+!         *** kmp_* entry points
+!         ***
+
+          subroutine kmp_set_stacksize(size)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) size
+          end subroutine kmp_set_stacksize
+
+          subroutine kmp_set_stacksize_s(size)
+            use omp_lib_kinds
+            integer (kind=kmp_size_t_kind) size
+          end subroutine kmp_set_stacksize_s
+
+          subroutine kmp_set_blocktime(msec)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) msec
+          end subroutine kmp_set_blocktime
+
+          subroutine kmp_set_library_serial()
+          end subroutine kmp_set_library_serial
+
+          subroutine kmp_set_library_turnaround()
+          end subroutine kmp_set_library_turnaround
+
+          subroutine kmp_set_library_throughput()
+          end subroutine kmp_set_library_throughput
+
+          subroutine kmp_set_library(libnum)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) libnum
+          end subroutine kmp_set_library
+
+          subroutine kmp_set_defaults(string)
+            character (len=*) string   ! assumed-length; replaces the obsolescent character*(*) spelling
+          end subroutine kmp_set_defaults
+
+          function kmp_get_stacksize()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_stacksize
+          end function kmp_get_stacksize
+
+          function kmp_get_stacksize_s()
+            use omp_lib_kinds
+            integer (kind=kmp_size_t_kind) kmp_get_stacksize_s
+          end function kmp_get_stacksize_s
+
+          function kmp_get_blocktime()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_blocktime
+          end function kmp_get_blocktime
+
+          function kmp_get_library()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_library
+          end function kmp_get_library
+
+          subroutine kmp_set_disp_num_buffers(num)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) num
+          end subroutine kmp_set_disp_num_buffers
+
+          function kmp_set_affinity(mask)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_set_affinity
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_set_affinity
+
+          function kmp_get_affinity(mask)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_affinity
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_get_affinity
+
+          function kmp_get_affinity_max_proc()
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_affinity_max_proc
+          end function kmp_get_affinity_max_proc
+
+          subroutine kmp_create_affinity_mask(mask)
+            use omp_lib_kinds
+            integer (kind=kmp_affinity_mask_kind) mask
+          end subroutine kmp_create_affinity_mask
+
+          subroutine kmp_destroy_affinity_mask(mask)
+            use omp_lib_kinds
+            integer (kind=kmp_affinity_mask_kind) mask
+          end subroutine kmp_destroy_affinity_mask
+
+          function kmp_set_affinity_mask_proc(proc, mask)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_set_affinity_mask_proc
+            integer (kind=omp_integer_kind) proc
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_set_affinity_mask_proc
+
+          function kmp_unset_affinity_mask_proc(proc, mask)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_unset_affinity_mask_proc
+            integer (kind=omp_integer_kind) proc
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_unset_affinity_mask_proc
+
+          function kmp_get_affinity_mask_proc(proc, mask)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) kmp_get_affinity_mask_proc
+            integer (kind=omp_integer_kind) proc
+            integer (kind=kmp_affinity_mask_kind) mask
+          end function kmp_get_affinity_mask_proc
+
+          function kmp_malloc(size)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) kmp_malloc
+            integer (kind=kmp_size_t_kind) size
+          end function kmp_malloc
+
+          function kmp_aligned_malloc(size, alignment)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) kmp_aligned_malloc
+            integer (kind=kmp_size_t_kind) size
+            integer (kind=kmp_size_t_kind) alignment
+          end function kmp_aligned_malloc
+
+          function kmp_calloc(nelem, elsize)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) kmp_calloc
+            integer (kind=kmp_size_t_kind) nelem
+            integer (kind=kmp_size_t_kind) elsize
+          end function kmp_calloc
+
+          function kmp_realloc(ptr, size)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) kmp_realloc
+            integer (kind=kmp_pointer_kind) ptr
+            integer (kind=kmp_size_t_kind) size
+          end function kmp_realloc
+
+          subroutine kmp_free(ptr)
+            use omp_lib_kinds
+            integer (kind=kmp_pointer_kind) ptr
+          end subroutine kmp_free
+
+          subroutine kmp_set_warnings_on()
+          end subroutine kmp_set_warnings_on
+
+          subroutine kmp_set_warnings_off()
+          end subroutine kmp_set_warnings_off
+
+          function kmp_get_cancellation_status(cancelkind)
+            use omp_lib_kinds
+            integer (kind=kmp_cancel_kind) cancelkind
+            logical (kind=omp_logical_kind) kmp_get_cancellation_status
+          end function kmp_get_cancellation_status
+
+          subroutine omp_init_lock_with_hint(svar, hint)
+            use omp_lib_kinds
+            integer (kind=omp_lock_kind) svar
+            integer (kind=omp_lock_hint_kind) hint
+          end subroutine omp_init_lock_with_hint
+
+          subroutine omp_init_nest_lock_with_hint(nvar, hint)
+            use omp_lib_kinds
+            integer (kind=omp_nest_lock_kind) nvar
+            integer (kind=omp_lock_hint_kind) hint
+          end subroutine omp_init_nest_lock_with_hint
+
+          function omp_control_tool(command, modifier)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_control_tool
+            integer (kind=omp_control_tool_kind) command
+            integer (kind=omp_control_tool_kind) modifier
+          end function omp_control_tool
+
+        end interface
+
+!dec$ if defined(_WIN32)
+!dec$   if defined(_WIN64) .or. defined(_M_AMD64)
+
+!***
+!*** The Fortran entry points must be in uppercase, even if the /Qlowercase
+!*** option is specified.  The alias attribute ensures that the specified
+!*** string is used as the entry point.
+!***
+!*** On the Windows* OS IA-32 architecture, the Fortran entry points have an
+!*** underscore prepended.  On the Windows* OS Intel(R) 64
+!*** architecture, no underscore is prepended.
+!***
+
+!dec$ attributes alias:'OMP_SET_NUM_THREADS' :: omp_set_num_threads
+!dec$ attributes alias:'OMP_SET_DYNAMIC' :: omp_set_dynamic
+!dec$ attributes alias:'OMP_SET_NESTED' :: omp_set_nested
+!dec$ attributes alias:'OMP_GET_NUM_THREADS' :: omp_get_num_threads
+!dec$ attributes alias:'OMP_GET_MAX_THREADS' :: omp_get_max_threads
+!dec$ attributes alias:'OMP_GET_THREAD_NUM' :: omp_get_thread_num
+!dec$ attributes alias:'OMP_GET_NUM_PROCS' :: omp_get_num_procs
+!dec$ attributes alias:'OMP_IN_PARALLEL' :: omp_in_parallel
+!dec$ attributes alias:'OMP_GET_DYNAMIC' :: omp_get_dynamic
+!dec$ attributes alias:'OMP_GET_NESTED' :: omp_get_nested
+!dec$ attributes alias:'OMP_GET_THREAD_LIMIT' :: omp_get_thread_limit
+!dec$ attributes alias:'OMP_SET_MAX_ACTIVE_LEVELS' :: omp_set_max_active_levels
+!dec$ attributes alias:'OMP_GET_MAX_ACTIVE_LEVELS' :: omp_get_max_active_levels
+!dec$ attributes alias:'OMP_GET_LEVEL' :: omp_get_level
+!dec$ attributes alias:'OMP_GET_ACTIVE_LEVEL' :: omp_get_active_level
+!dec$ attributes alias:'OMP_GET_ANCESTOR_THREAD_NUM' :: omp_get_ancestor_thread_num
+!dec$ attributes alias:'OMP_GET_TEAM_SIZE' :: omp_get_team_size
+!dec$ attributes alias:'OMP_SET_SCHEDULE' :: omp_set_schedule
+!dec$ attributes alias:'OMP_GET_SCHEDULE' :: omp_get_schedule
+!dec$ attributes alias:'OMP_GET_PROC_BIND' :: omp_get_proc_bind
+!dec$ attributes alias:'OMP_GET_WTIME' :: omp_get_wtime
+!dec$ attributes alias:'OMP_GET_WTICK' :: omp_get_wtick
+!dec$ attributes alias:'OMP_GET_DEFAULT_DEVICE' :: omp_get_default_device
+!dec$ attributes alias:'OMP_SET_DEFAULT_DEVICE' :: omp_set_default_device
+!dec$ attributes alias:'OMP_GET_NUM_DEVICES' :: omp_get_num_devices
+!dec$ attributes alias:'OMP_GET_NUM_TEAMS' :: omp_get_num_teams
+!dec$ attributes alias:'OMP_GET_TEAM_NUM' :: omp_get_team_num
+!dec$ attributes alias:'OMP_GET_CANCELLATION' :: omp_get_cancellation
+!dec$ attributes alias:'OMP_IS_INITIAL_DEVICE' :: omp_is_initial_device
+!dec$ attributes alias:'OMP_GET_INITIAL_DEVICE' :: omp_get_initial_device
+!dec$ attributes alias:'OMP_GET_MAX_TASK_PRIORITY' :: omp_get_max_task_priority
+!dec$ attributes alias:'OMP_GET_DEVICE_NUM' :: omp_get_device_num
+!dec$ attributes alias:'OMP_PAUSE_RESOURCE' :: omp_pause_resource
+!dec$ attributes alias:'OMP_PAUSE_RESOURCE_ALL' :: omp_pause_resource_all
+!dec$ attributes alias:'OMP_GET_SUPPORTED_ACTIVE_LEVELS' :: omp_get_supported_active_levels
+!dec$ attributes alias:'OMP_FULFILL_EVENT' :: omp_fulfill_event
+
+!dec$ attributes alias:'OMP_CONTROL_TOOL' :: omp_control_tool
+!dec$ attributes alias:'OMP_SET_AFFINITY_FORMAT' :: omp_set_affinity_format
+!dec$ attributes alias:'OMP_GET_AFFINITY_FORMAT' :: omp_get_affinity_format
+!dec$ attributes alias:'OMP_DISPLAY_AFFINITY' :: omp_display_affinity
+!dec$ attributes alias:'OMP_CAPTURE_AFFINITY' :: omp_capture_affinity
+
+!dec$ attributes alias:'omp_init_lock' :: omp_init_lock
+!dec$ attributes alias:'omp_init_lock_with_hint' :: omp_init_lock_with_hint
+!dec$ attributes alias:'omp_destroy_lock' :: omp_destroy_lock
+!dec$ attributes alias:'omp_set_lock' :: omp_set_lock
+!dec$ attributes alias:'omp_unset_lock' :: omp_unset_lock
+!dec$ attributes alias:'omp_test_lock' :: omp_test_lock
+!dec$ attributes alias:'omp_init_nest_lock' :: omp_init_nest_lock
+!dec$ attributes alias:'omp_init_nest_lock_with_hint' :: omp_init_nest_lock_with_hint
+!dec$ attributes alias:'omp_destroy_nest_lock' :: omp_destroy_nest_lock
+!dec$ attributes alias:'omp_set_nest_lock' :: omp_set_nest_lock
+!dec$ attributes alias:'omp_unset_nest_lock' :: omp_unset_nest_lock
+!dec$ attributes alias:'omp_test_nest_lock' :: omp_test_nest_lock
+
+!dec$ attributes alias:'KMP_SET_STACKSIZE'::kmp_set_stacksize
+!dec$ attributes alias:'KMP_SET_STACKSIZE_S'::kmp_set_stacksize_s
+!dec$ attributes alias:'KMP_SET_BLOCKTIME'::kmp_set_blocktime
+!dec$ attributes alias:'KMP_SET_LIBRARY_SERIAL'::kmp_set_library_serial
+!dec$ attributes alias:'KMP_SET_LIBRARY_TURNAROUND'::kmp_set_library_turnaround
+!dec$ attributes alias:'KMP_SET_LIBRARY_THROUGHPUT'::kmp_set_library_throughput
+!dec$ attributes alias:'KMP_SET_LIBRARY'::kmp_set_library
+!dec$ attributes alias:'KMP_GET_STACKSIZE'::kmp_get_stacksize
+!dec$ attributes alias:'KMP_GET_STACKSIZE_S'::kmp_get_stacksize_s
+!dec$ attributes alias:'KMP_GET_BLOCKTIME'::kmp_get_blocktime
+!dec$ attributes alias:'KMP_GET_LIBRARY'::kmp_get_library
+!dec$ attributes alias:'KMP_SET_AFFINITY'::kmp_set_affinity
+!dec$ attributes alias:'KMP_GET_AFFINITY'::kmp_get_affinity
+!dec$ attributes alias:'KMP_GET_AFFINITY_MAX_PROC'::kmp_get_affinity_max_proc
+!dec$ attributes alias:'KMP_CREATE_AFFINITY_MASK'::kmp_create_affinity_mask
+!dec$ attributes alias:'KMP_DESTROY_AFFINITY_MASK'::kmp_destroy_affinity_mask
+!dec$ attributes alias:'KMP_SET_AFFINITY_MASK_PROC'::kmp_set_affinity_mask_proc
+!dec$ attributes alias:'KMP_UNSET_AFFINITY_MASK_PROC'::kmp_unset_affinity_mask_proc
+!dec$ attributes alias:'KMP_GET_AFFINITY_MASK_PROC'::kmp_get_affinity_mask_proc
+!dec$ attributes alias:'KMP_MALLOC'::kmp_malloc
+!dec$ attributes alias:'KMP_ALIGNED_MALLOC'::kmp_aligned_malloc
+!dec$ attributes alias:'KMP_CALLOC'::kmp_calloc
+!dec$ attributes alias:'KMP_REALLOC'::kmp_realloc
+!dec$ attributes alias:'KMP_FREE'::kmp_free
+
+!dec$ attributes alias:'KMP_SET_WARNINGS_ON'::kmp_set_warnings_on
+!dec$ attributes alias:'KMP_SET_WARNINGS_OFF'::kmp_set_warnings_off
+
+!dec$ attributes alias:'KMP_GET_CANCELLATION_STATUS' :: kmp_get_cancellation_status
+
+!dec$   else
+
+!***
+!*** On Windows* OS IA-32 architecture, the Fortran entry points have an underscore prepended.
+!*** Apart from that prefix, each alias must spell the entry name exactly as the preceding branch does.
+!***
+!dec$ attributes alias:'_OMP_SET_NUM_THREADS' :: omp_set_num_threads
+!dec$ attributes alias:'_OMP_SET_DYNAMIC' :: omp_set_dynamic
+!dec$ attributes alias:'_OMP_SET_NESTED' :: omp_set_nested
+!dec$ attributes alias:'_OMP_GET_NUM_THREADS' :: omp_get_num_threads
+!dec$ attributes alias:'_OMP_GET_MAX_THREADS' :: omp_get_max_threads
+!dec$ attributes alias:'_OMP_GET_THREAD_NUM' :: omp_get_thread_num
+!dec$ attributes alias:'_OMP_GET_NUM_PROCS' :: omp_get_num_procs
+!dec$ attributes alias:'_OMP_IN_PARALLEL' :: omp_in_parallel
+!dec$ attributes alias:'_OMP_GET_DYNAMIC' :: omp_get_dynamic
+!dec$ attributes alias:'_OMP_GET_NESTED' :: omp_get_nested
+!dec$ attributes alias:'_OMP_GET_THREAD_LIMIT' :: omp_get_thread_limit
+!dec$ attributes alias:'_OMP_SET_MAX_ACTIVE_LEVELS' :: omp_set_max_active_levels
+!dec$ attributes alias:'_OMP_GET_MAX_ACTIVE_LEVELS' :: omp_get_max_active_levels
+!dec$ attributes alias:'_OMP_GET_LEVEL' :: omp_get_level
+!dec$ attributes alias:'_OMP_GET_ACTIVE_LEVEL' :: omp_get_active_level
+!dec$ attributes alias:'_OMP_GET_ANCESTOR_THREAD_NUM' :: omp_get_ancestor_thread_num
+!dec$ attributes alias:'_OMP_GET_TEAM_SIZE' :: omp_get_team_size
+!dec$ attributes alias:'_OMP_SET_SCHEDULE' :: omp_set_schedule
+!dec$ attributes alias:'_OMP_GET_SCHEDULE' :: omp_get_schedule
+!dec$ attributes alias:'_OMP_GET_PROC_BIND' :: omp_get_proc_bind
+!dec$ attributes alias:'_OMP_GET_WTIME' :: omp_get_wtime
+!dec$ attributes alias:'_OMP_GET_WTICK' :: omp_get_wtick
+!dec$ attributes alias:'_OMP_GET_DEFAULT_DEVICE' :: omp_get_default_device
+!dec$ attributes alias:'_OMP_SET_DEFAULT_DEVICE' :: omp_set_default_device
+!dec$ attributes alias:'_OMP_GET_NUM_DEVICES' :: omp_get_num_devices
+!dec$ attributes alias:'_OMP_GET_NUM_TEAMS' :: omp_get_num_teams
+!dec$ attributes alias:'_OMP_GET_TEAM_NUM' :: omp_get_team_num
+!dec$ attributes alias:'_OMP_GET_CANCELLATION' :: omp_get_cancellation
+!dec$ attributes alias:'_OMP_IS_INITIAL_DEVICE' :: omp_is_initial_device
+!dec$ attributes alias:'_OMP_GET_INITIAL_DEVICE' :: omp_get_initial_device
+!dec$ attributes alias:'_OMP_GET_MAX_TASK_PRIORITY' :: omp_get_max_task_priority
+!dec$ attributes alias:'_OMP_GET_DEVICE_NUM' :: omp_get_device_num
+!dec$ attributes alias:'_OMP_PAUSE_RESOURCE' :: omp_pause_resource
+!dec$ attributes alias:'_OMP_PAUSE_RESOURCE_ALL' :: omp_pause_resource_all
+!dec$ attributes alias:'_OMP_GET_SUPPORTED_ACTIVE_LEVELS' :: omp_get_supported_active_levels
+!dec$ attributes alias:'_OMP_FULFILL_EVENT' :: omp_fulfill_event
+
+!dec$ attributes alias:'_OMP_CONTROL_TOOL' :: omp_control_tool
+!dec$ attributes alias:'_OMP_SET_AFFINITY_FORMAT' :: omp_set_affinity_format
+!dec$ attributes alias:'_OMP_GET_AFFINITY_FORMAT' :: omp_get_affinity_format
+!dec$ attributes alias:'_OMP_DISPLAY_AFFINITY' :: omp_display_affinity
+!dec$ attributes alias:'_OMP_CAPTURE_AFFINITY' :: omp_capture_affinity
+
+!dec$ attributes alias:'_omp_init_lock' :: omp_init_lock
+!dec$ attributes alias:'_omp_init_lock_with_hint' :: omp_init_lock_with_hint
+!dec$ attributes alias:'_omp_destroy_lock' :: omp_destroy_lock
+!dec$ attributes alias:'_omp_set_lock' :: omp_set_lock
+!dec$ attributes alias:'_omp_unset_lock' :: omp_unset_lock
+!dec$ attributes alias:'_omp_test_lock' :: omp_test_lock
+!dec$ attributes alias:'_omp_init_nest_lock' :: omp_init_nest_lock
+!dec$ attributes alias:'_omp_init_nest_lock_with_hint' :: omp_init_nest_lock_with_hint
+!dec$ attributes alias:'_omp_destroy_nest_lock' :: omp_destroy_nest_lock
+!dec$ attributes alias:'_omp_set_nest_lock' :: omp_set_nest_lock
+!dec$ attributes alias:'_omp_unset_nest_lock' :: omp_unset_nest_lock
+!dec$ attributes alias:'_omp_test_nest_lock' :: omp_test_nest_lock
+
+!dec$ attributes alias:'_KMP_SET_STACKSIZE'::kmp_set_stacksize
+!dec$ attributes alias:'_KMP_SET_STACKSIZE_S'::kmp_set_stacksize_s
+!dec$ attributes alias:'_KMP_SET_BLOCKTIME'::kmp_set_blocktime
+!dec$ attributes alias:'_KMP_SET_LIBRARY_SERIAL'::kmp_set_library_serial
+!dec$ attributes alias:'_KMP_SET_LIBRARY_TURNAROUND'::kmp_set_library_turnaround
+!dec$ attributes alias:'_KMP_SET_LIBRARY_THROUGHPUT'::kmp_set_library_throughput
+!dec$ attributes alias:'_KMP_SET_LIBRARY'::kmp_set_library
+!dec$ attributes alias:'_KMP_GET_STACKSIZE'::kmp_get_stacksize
+!dec$ attributes alias:'_KMP_GET_STACKSIZE_S'::kmp_get_stacksize_s
+!dec$ attributes alias:'_KMP_GET_BLOCKTIME'::kmp_get_blocktime
+!dec$ attributes alias:'_KMP_GET_LIBRARY'::kmp_get_library
+!dec$ attributes alias:'_KMP_SET_AFFINITY'::kmp_set_affinity
+!dec$ attributes alias:'_KMP_GET_AFFINITY'::kmp_get_affinity
+!dec$ attributes alias:'_KMP_GET_AFFINITY_MAX_PROC'::kmp_get_affinity_max_proc
+!dec$ attributes alias:'_KMP_CREATE_AFFINITY_MASK'::kmp_create_affinity_mask
+!dec$ attributes alias:'_KMP_DESTROY_AFFINITY_MASK'::kmp_destroy_affinity_mask
+!dec$ attributes alias:'_KMP_SET_AFFINITY_MASK_PROC'::kmp_set_affinity_mask_proc
+!dec$ attributes alias:'_KMP_UNSET_AFFINITY_MASK_PROC'::kmp_unset_affinity_mask_proc
+!dec$ attributes alias:'_KMP_GET_AFFINITY_MASK_PROC'::kmp_get_affinity_mask_proc
+!dec$ attributes alias:'_KMP_MALLOC'::kmp_malloc
+!dec$ attributes alias:'_KMP_ALIGNED_MALLOC'::kmp_aligned_malloc
+!dec$ attributes alias:'_KMP_CALLOC'::kmp_calloc
+!dec$ attributes alias:'_KMP_REALLOC'::kmp_realloc
+!dec$ attributes alias:'_KMP_FREE'::kmp_free
+
+!dec$ attributes alias:'_KMP_SET_WARNINGS_ON'::kmp_set_warnings_on
+!dec$ attributes alias:'_KMP_SET_WARNINGS_OFF'::kmp_set_warnings_off
+
+!dec$ attributes alias:'_KMP_GET_CANCELLATION_STATUS' :: kmp_get_cancellation_status
+
+!dec$   endif
+!dec$ endif
+
+!dec$ if defined(__linux)
+
+!***
+!*** The Linux* OS entry points are in lowercase, with an underscore appended.
+!***
+
+!dec$ attributes alias:'omp_set_num_threads_'::omp_set_num_threads
+!dec$ attributes alias:'omp_set_dynamic_'::omp_set_dynamic
+!dec$ attributes alias:'omp_set_nested_'::omp_set_nested
+!dec$ attributes alias:'omp_get_num_threads_'::omp_get_num_threads
+!dec$ attributes alias:'omp_get_max_threads_'::omp_get_max_threads
+!dec$ attributes alias:'omp_get_thread_num_'::omp_get_thread_num
+!dec$ attributes alias:'omp_get_num_procs_'::omp_get_num_procs
+!dec$ attributes alias:'omp_in_parallel_'::omp_in_parallel
+!dec$ attributes alias:'omp_get_dynamic_'::omp_get_dynamic
+!dec$ attributes alias:'omp_get_nested_'::omp_get_nested
+!dec$ attributes alias:'omp_get_thread_limit_'::omp_get_thread_limit
+!dec$ attributes alias:'omp_set_max_active_levels_'::omp_set_max_active_levels
+!dec$ attributes alias:'omp_get_max_active_levels_'::omp_get_max_active_levels
+!dec$ attributes alias:'omp_get_level_'::omp_get_level
+!dec$ attributes alias:'omp_get_active_level_'::omp_get_active_level
+!dec$ attributes alias:'omp_get_ancestor_thread_num_'::omp_get_ancestor_thread_num
+!dec$ attributes alias:'omp_get_team_size_'::omp_get_team_size
+!dec$ attributes alias:'omp_set_schedule_'::omp_set_schedule
+!dec$ attributes alias:'omp_get_schedule_'::omp_get_schedule
+!dec$ attributes alias:'omp_get_proc_bind_' :: omp_get_proc_bind
+!dec$ attributes alias:'omp_get_wtime_'::omp_get_wtime
+!dec$ attributes alias:'omp_get_wtick_'::omp_get_wtick
+!dec$ attributes alias:'omp_get_default_device_'::omp_get_default_device
+!dec$ attributes alias:'omp_set_default_device_'::omp_set_default_device
+!dec$ attributes alias:'omp_get_num_devices_'::omp_get_num_devices
+!dec$ attributes alias:'omp_get_num_teams_'::omp_get_num_teams
+!dec$ attributes alias:'omp_get_team_num_'::omp_get_team_num
+!dec$ attributes alias:'omp_get_cancellation_'::omp_get_cancellation
+!dec$ attributes alias:'omp_is_initial_device_'::omp_is_initial_device
+!dec$ attributes alias:'omp_get_initial_device_'::omp_get_initial_device
+!dec$ attributes alias:'omp_get_max_task_priority_'::omp_get_max_task_priority
+!dec$ attributes alias:'omp_get_device_num_'::omp_get_device_num
+!dec$ attributes alias:'omp_pause_resource_' :: omp_pause_resource
+!dec$ attributes alias:'omp_pause_resource_all_' :: omp_pause_resource_all
+!dec$ attributes alias:'omp_get_supported_active_levels_' :: omp_get_supported_active_levels
+!dec$ attributes alias:'omp_fulfill_event_' :: omp_fulfill_event
+
+!dec$ attributes alias:'omp_set_affinity_format_' :: omp_set_affinity_format
+!dec$ attributes alias:'omp_get_affinity_format_' :: omp_get_affinity_format
+!dec$ attributes alias:'omp_display_affinity_' :: omp_display_affinity
+!dec$ attributes alias:'omp_capture_affinity_' :: omp_capture_affinity
+
+!dec$ attributes alias:'omp_init_lock_'::omp_init_lock
+!dec$ attributes alias:'omp_init_lock_with_hint_'::omp_init_lock_with_hint
+!dec$ attributes alias:'omp_destroy_lock_'::omp_destroy_lock
+!dec$ attributes alias:'omp_set_lock_'::omp_set_lock
+!dec$ attributes alias:'omp_unset_lock_'::omp_unset_lock
+!dec$ attributes alias:'omp_test_lock_'::omp_test_lock
+!dec$ attributes alias:'omp_init_nest_lock_'::omp_init_nest_lock
+!dec$ attributes alias:'omp_init_nest_lock_with_hint_'::omp_init_nest_lock_with_hint
+!dec$ attributes alias:'omp_destroy_nest_lock_'::omp_destroy_nest_lock
+!dec$ attributes alias:'omp_set_nest_lock_'::omp_set_nest_lock
+!dec$ attributes alias:'omp_unset_nest_lock_'::omp_unset_nest_lock
+!dec$ attributes alias:'omp_test_nest_lock_'::omp_test_nest_lock
+!dec$ attributes alias:'omp_control_tool_'::omp_control_tool
+
+!dec$ attributes alias:'kmp_set_stacksize_'::kmp_set_stacksize
+!dec$ attributes alias:'kmp_set_stacksize_s_'::kmp_set_stacksize_s
+!dec$ attributes alias:'kmp_set_blocktime_'::kmp_set_blocktime
+!dec$ attributes alias:'kmp_set_library_serial_'::kmp_set_library_serial
+!dec$ attributes alias:'kmp_set_library_turnaround_'::kmp_set_library_turnaround
+!dec$ attributes alias:'kmp_set_library_throughput_'::kmp_set_library_throughput
+!dec$ attributes alias:'kmp_set_library_'::kmp_set_library
+!dec$ attributes alias:'kmp_get_stacksize_'::kmp_get_stacksize
+!dec$ attributes alias:'kmp_get_stacksize_s_'::kmp_get_stacksize_s
+!dec$ attributes alias:'kmp_get_blocktime_'::kmp_get_blocktime
+!dec$ attributes alias:'kmp_get_library_'::kmp_get_library
+!dec$ attributes alias:'kmp_set_affinity_'::kmp_set_affinity
+!dec$ attributes alias:'kmp_get_affinity_'::kmp_get_affinity
+!dec$ attributes alias:'kmp_get_affinity_max_proc_'::kmp_get_affinity_max_proc
+!dec$ attributes alias:'kmp_create_affinity_mask_'::kmp_create_affinity_mask
+!dec$ attributes alias:'kmp_destroy_affinity_mask_'::kmp_destroy_affinity_mask
+!dec$ attributes alias:'kmp_set_affinity_mask_proc_'::kmp_set_affinity_mask_proc
+!dec$ attributes alias:'kmp_unset_affinity_mask_proc_'::kmp_unset_affinity_mask_proc
+!dec$ attributes alias:'kmp_get_affinity_mask_proc_'::kmp_get_affinity_mask_proc
+!dec$ attributes alias:'kmp_malloc_'::kmp_malloc
+!dec$ attributes alias:'kmp_aligned_malloc_'::kmp_aligned_malloc
+!dec$ attributes alias:'kmp_calloc_'::kmp_calloc
+!dec$ attributes alias:'kmp_realloc_'::kmp_realloc
+!dec$ attributes alias:'kmp_free_'::kmp_free
+
+!dec$ attributes alias:'kmp_set_warnings_on_'::kmp_set_warnings_on
+!dec$ attributes alias:'kmp_set_warnings_off_'::kmp_set_warnings_off
+!dec$ attributes alias:'kmp_get_cancellation_status_'::kmp_get_cancellation_status
+
+!dec$ endif
+
+!dec$ if defined(__APPLE__)
+
+!***
+!*** The Mac entry points are in lowercase, with both an underscore
+!*** appended and an underscore prepended.
+!***
+
+!dec$ attributes alias:'_omp_set_num_threads_'::omp_set_num_threads
+!dec$ attributes alias:'_omp_set_dynamic_'::omp_set_dynamic
+!dec$ attributes alias:'_omp_set_nested_'::omp_set_nested
+!dec$ attributes alias:'_omp_get_num_threads_'::omp_get_num_threads
+!dec$ attributes alias:'_omp_get_max_threads_'::omp_get_max_threads
+!dec$ attributes alias:'_omp_get_thread_num_'::omp_get_thread_num
+!dec$ attributes alias:'_omp_get_num_procs_'::omp_get_num_procs
+!dec$ attributes alias:'_omp_in_parallel_'::omp_in_parallel
+!dec$ attributes alias:'_omp_get_dynamic_'::omp_get_dynamic
+!dec$ attributes alias:'_omp_get_nested_'::omp_get_nested
+!dec$ attributes alias:'_omp_get_thread_limit_'::omp_get_thread_limit
+!dec$ attributes alias:'_omp_set_max_active_levels_'::omp_set_max_active_levels
+!dec$ attributes alias:'_omp_get_max_active_levels_'::omp_get_max_active_levels
+!dec$ attributes alias:'_omp_get_level_'::omp_get_level
+!dec$ attributes alias:'_omp_get_active_level_'::omp_get_active_level
+!dec$ attributes alias:'_omp_get_ancestor_thread_num_'::omp_get_ancestor_thread_num
+!dec$ attributes alias:'_omp_get_team_size_'::omp_get_team_size
+!dec$ attributes alias:'_omp_set_schedule_'::omp_set_schedule
+!dec$ attributes alias:'_omp_get_schedule_'::omp_get_schedule
+!dec$ attributes alias:'_omp_get_proc_bind_' :: omp_get_proc_bind
+!dec$ attributes alias:'_omp_get_wtime_'::omp_get_wtime
+!dec$ attributes alias:'_omp_get_wtick_'::omp_get_wtick
+!dec$ attributes alias:'_omp_get_default_device_'::omp_get_default_device
+!dec$ attributes alias:'_omp_set_default_device_'::omp_set_default_device
+!dec$ attributes alias:'_omp_get_num_devices_'::omp_get_num_devices
+!dec$ attributes alias:'_omp_get_num_teams_'::omp_get_num_teams
+!dec$ attributes alias:'_omp_get_team_num_'::omp_get_team_num
+!dec$ attributes alias:'_omp_get_cancellation_'::omp_get_cancellation
+!dec$ attributes alias:'_omp_is_initial_device_'::omp_is_initial_device
+!dec$ attributes alias:'_omp_get_initial_device_'::omp_get_initial_device
+!dec$ attributes alias:'_omp_get_max_task_priority_'::omp_get_max_task_priority
+!dec$ attributes alias:'_omp_get_device_num_'::omp_get_device_num
+!dec$ attributes alias:'_omp_pause_resource_' :: omp_pause_resource
+!dec$ attributes alias:'_omp_pause_resource_all_' :: omp_pause_resource_all
+!dec$ attributes alias:'_omp_get_supported_active_levels_' :: omp_get_supported_active_levels
+!dec$ attributes alias:'_omp_fulfill_event_' :: omp_fulfill_event
+
+!dec$ attributes alias:'_omp_init_lock_'::omp_init_lock
+!dec$ attributes alias:'_omp_init_lock_with_hint_'::omp_init_lock_with_hint
+!dec$ attributes alias:'_omp_destroy_lock_'::omp_destroy_lock
+!dec$ attributes alias:'_omp_set_lock_'::omp_set_lock
+!dec$ attributes alias:'_omp_unset_lock_'::omp_unset_lock
+!dec$ attributes alias:'_omp_test_lock_'::omp_test_lock
+!dec$ attributes alias:'_omp_init_nest_lock_'::omp_init_nest_lock
+!dec$ attributes alias:'_omp_init_nest_lock_with_hint_'::omp_init_nest_lock_with_hint
+!dec$ attributes alias:'_omp_destroy_nest_lock_'::omp_destroy_nest_lock
+!dec$ attributes alias:'_omp_set_nest_lock_'::omp_set_nest_lock
+!dec$ attributes alias:'_omp_unset_nest_lock_'::omp_unset_nest_lock
+!dec$ attributes alias:'_omp_test_nest_lock_'::omp_test_nest_lock
+!dec$ attributes alias:'_omp_control_tool_'::omp_control_tool
+!dec$ attributes alias:'_omp_set_affinity_format_' :: omp_set_affinity_format
+!dec$ attributes alias:'_omp_get_affinity_format_' :: omp_get_affinity_format
+!dec$ attributes alias:'_omp_display_affinity_' :: omp_display_affinity
+!dec$ attributes alias:'_omp_capture_affinity_' :: omp_capture_affinity
+
+!dec$ attributes alias:'_kmp_set_stacksize_'::kmp_set_stacksize
+!dec$ attributes alias:'_kmp_set_stacksize_s_'::kmp_set_stacksize_s
+!dec$ attributes alias:'_kmp_set_blocktime_'::kmp_set_blocktime
+!dec$ attributes alias:'_kmp_set_library_serial_'::kmp_set_library_serial
+!dec$ attributes alias:'_kmp_set_library_turnaround_'::kmp_set_library_turnaround
+!dec$ attributes alias:'_kmp_set_library_throughput_'::kmp_set_library_throughput
+!dec$ attributes alias:'_kmp_set_library_'::kmp_set_library
+!dec$ attributes alias:'_kmp_get_stacksize_'::kmp_get_stacksize
+!dec$ attributes alias:'_kmp_get_stacksize_s_'::kmp_get_stacksize_s
+!dec$ attributes alias:'_kmp_get_blocktime_'::kmp_get_blocktime
+!dec$ attributes alias:'_kmp_get_library_'::kmp_get_library
+!dec$ attributes alias:'_kmp_set_affinity_'::kmp_set_affinity
+!dec$ attributes alias:'_kmp_get_affinity_'::kmp_get_affinity
+!dec$ attributes alias:'_kmp_get_affinity_max_proc_'::kmp_get_affinity_max_proc
+!dec$ attributes alias:'_kmp_create_affinity_mask_'::kmp_create_affinity_mask
+!dec$ attributes alias:'_kmp_destroy_affinity_mask_'::kmp_destroy_affinity_mask
+!dec$ attributes alias:'_kmp_set_affinity_mask_proc_'::kmp_set_affinity_mask_proc
+!dec$ attributes alias:'_kmp_unset_affinity_mask_proc_'::kmp_unset_affinity_mask_proc
+!dec$ attributes alias:'_kmp_get_affinity_mask_proc_'::kmp_get_affinity_mask_proc
+!dec$ attributes alias:'_kmp_malloc_'::kmp_malloc
+!dec$ attributes alias:'_kmp_aligned_malloc_'::kmp_aligned_malloc
+!dec$ attributes alias:'_kmp_calloc_'::kmp_calloc
+!dec$ attributes alias:'_kmp_realloc_'::kmp_realloc
+!dec$ attributes alias:'_kmp_free_'::kmp_free
+
+!dec$ attributes alias:'_kmp_set_warnings_on_'::kmp_set_warnings_on
+!dec$ attributes alias:'_kmp_set_warnings_off_'::kmp_set_warnings_off
+
+!dec$ attributes alias:'_kmp_get_cancellation_status_'::kmp_get_cancellation_status
+
+!dec$ endif
+
+      end module omp_lib
diff --git a/final/runtime/src/include/omp_lib.f90.var b/final/runtime/src/include/omp_lib.f90.var
new file mode 100644
index 0000000..ac56848
--- /dev/null
+++ b/final/runtime/src/include/omp_lib.f90.var
@@ -0,0 +1,683 @@
+! include/omp_lib.f90.var
+
+!
+!//===----------------------------------------------------------------------===//
+!//
+!// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+!// See https://llvm.org/LICENSE.txt for license information.
+!// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+!//
+!//===----------------------------------------------------------------------===//
+!
+
+      module omp_lib_kinds
+        ! Kind parameters and the omp_alloctrait derived type used by module omp_lib.
+        use, intrinsic :: iso_c_binding
+
+        integer, parameter :: omp_integer_kind             = c_int
+        integer, parameter :: omp_logical_kind             = 4
+        integer, parameter :: omp_real_kind                = c_float
+        integer, parameter :: kmp_double_kind              = c_double
+        integer, parameter :: omp_lock_kind                = c_intptr_t
+        integer, parameter :: omp_nest_lock_kind           = c_intptr_t
+        integer, parameter :: omp_sched_kind               = omp_integer_kind
+        integer, parameter :: omp_proc_bind_kind           = omp_integer_kind
+        integer, parameter :: kmp_pointer_kind             = c_intptr_t
+        integer, parameter :: kmp_size_t_kind              = c_size_t
+        integer, parameter :: kmp_affinity_mask_kind       = c_intptr_t
+        integer, parameter :: kmp_cancel_kind              = omp_integer_kind
+        integer, parameter :: omp_sync_hint_kind           = omp_integer_kind
+        integer, parameter :: omp_lock_hint_kind           = omp_sync_hint_kind
+        integer, parameter :: omp_control_tool_kind        = omp_integer_kind
+        integer, parameter :: omp_control_tool_result_kind = omp_integer_kind
+        integer, parameter :: omp_allocator_handle_kind    = c_intptr_t
+        integer, parameter :: omp_memspace_handle_kind     = c_intptr_t
+        integer, parameter :: omp_alloctrait_key_kind      = omp_integer_kind
+        integer, parameter :: omp_alloctrait_val_kind      = c_intptr_t
+        ! Key/value pair whose components use the two alloctrait kinds declared just above.
+        type omp_alloctrait
+          integer(kind=omp_alloctrait_key_kind) :: key
+          integer(kind=omp_alloctrait_val_kind) :: value
+        end type omp_alloctrait
+
+        integer, parameter :: omp_pause_resource_kind      = omp_integer_kind
+        integer, parameter :: omp_depend_kind              = c_intptr_t
+        integer, parameter :: omp_event_handle_kind        = c_intptr_t
+
+      end module omp_lib_kinds
+
+      module omp_lib
+
+        use omp_lib_kinds
+
+        integer(kind=omp_integer_kind), parameter :: openmp_version    = @LIBOMP_OMP_YEAR_MONTH@
+        integer(kind=omp_integer_kind), parameter :: kmp_version_major = @LIBOMP_VERSION_MAJOR@
+        integer(kind=omp_integer_kind), parameter :: kmp_version_minor = @LIBOMP_VERSION_MINOR@
+        integer(kind=omp_integer_kind), parameter :: kmp_version_build = @LIBOMP_VERSION_BUILD@
+        ! The @...@ tokens here are template placeholders (presumably substituted at configure time).
+        character(len=*), parameter :: kmp_build_date = '@LIBOMP_BUILD_DATE@'
+        ! omp_sched_kind constants; omp_sched_monotonic is the bit pattern Z'80000000' converted to that kind.
+        integer(kind=omp_sched_kind), parameter :: omp_sched_static    = 1
+        integer(kind=omp_sched_kind), parameter :: omp_sched_dynamic   = 2
+        integer(kind=omp_sched_kind), parameter :: omp_sched_guided    = 3
+        integer(kind=omp_sched_kind), parameter :: omp_sched_auto      = 4
+        integer(kind=omp_sched_kind), parameter :: omp_sched_monotonic = int(Z'80000000', kind=omp_sched_kind)
+        ! omp_proc_bind_kind constants (the result kind of omp_get_proc_bind).
+        integer(kind=omp_proc_bind_kind), parameter :: omp_proc_bind_false = 0
+        integer(kind=omp_proc_bind_kind), parameter :: omp_proc_bind_true = 1
+        integer(kind=omp_proc_bind_kind), parameter :: omp_proc_bind_master = 2
+        integer(kind=omp_proc_bind_kind), parameter :: omp_proc_bind_close = 3
+        integer(kind=omp_proc_bind_kind), parameter :: omp_proc_bind_spread = 4
+        ! kmp_cancel_kind constants.
+        integer(kind=kmp_cancel_kind), parameter :: kmp_cancel_parallel = 1
+        integer(kind=kmp_cancel_kind), parameter :: kmp_cancel_loop = 2
+        integer(kind=kmp_cancel_kind), parameter :: kmp_cancel_sections = 3
+        integer(kind=kmp_cancel_kind), parameter :: kmp_cancel_taskgroup = 4
+        ! Sync hints; each omp_lock_hint_* equals its omp_sync_hint_* counterpart.
+        integer(kind=omp_sync_hint_kind), parameter :: omp_sync_hint_none           = 0
+        integer(kind=omp_sync_hint_kind), parameter :: omp_sync_hint_uncontended    = 1
+        integer(kind=omp_sync_hint_kind), parameter :: omp_sync_hint_contended      = 2
+        integer(kind=omp_sync_hint_kind), parameter :: omp_sync_hint_nonspeculative = 4
+        integer(kind=omp_sync_hint_kind), parameter :: omp_sync_hint_speculative    = 8
+        integer(kind=omp_lock_hint_kind), parameter :: omp_lock_hint_none = omp_sync_hint_none
+        integer(kind=omp_lock_hint_kind), parameter :: omp_lock_hint_uncontended = omp_sync_hint_uncontended
+        integer(kind=omp_lock_hint_kind), parameter :: omp_lock_hint_contended = omp_sync_hint_contended
+        integer(kind=omp_lock_hint_kind), parameter :: omp_lock_hint_nonspeculative = omp_sync_hint_nonspeculative
+        integer(kind=omp_lock_hint_kind), parameter :: omp_lock_hint_speculative = omp_sync_hint_speculative
+        integer(kind=omp_lock_hint_kind), parameter :: kmp_lock_hint_hle         = 65536
+        integer(kind=omp_lock_hint_kind), parameter :: kmp_lock_hint_rtm         = 131072
+        integer(kind=omp_lock_hint_kind), parameter :: kmp_lock_hint_adaptive    = 262144
+        ! omp_control_tool_kind constants.
+        integer(kind=omp_control_tool_kind), parameter :: omp_control_tool_start = 1
+        integer(kind=omp_control_tool_kind), parameter :: omp_control_tool_pause = 2
+        integer(kind=omp_control_tool_kind), parameter :: omp_control_tool_flush = 3
+        integer(kind=omp_control_tool_kind), parameter :: omp_control_tool_end = 4
+        ! omp_control_tool_result_kind constants.
+        integer(kind=omp_control_tool_result_kind), parameter :: omp_control_tool_notool = -2
+        integer(kind=omp_control_tool_result_kind), parameter :: omp_control_tool_nocallback = -1
+        integer(kind=omp_control_tool_result_kind), parameter :: omp_control_tool_success = 0
+        integer(kind=omp_control_tool_result_kind), parameter :: omp_control_tool_ignored = 1
+        ! omp_alloctrait_key_kind constants (same kind as the key component of omp_alloctrait).
+        integer(kind=omp_alloctrait_key_kind), parameter :: omp_atk_threadmodel = 1
+        integer(kind=omp_alloctrait_key_kind), parameter :: omp_atk_alignment = 2
+        integer(kind=omp_alloctrait_key_kind), parameter :: omp_atk_access = 3
+        integer(kind=omp_alloctrait_key_kind), parameter :: omp_atk_pool_size = 4
+        integer(kind=omp_alloctrait_key_kind), parameter :: omp_atk_fallback = 5
+        integer(kind=omp_alloctrait_key_kind), parameter :: omp_atk_fb_data = 6
+        integer(kind=omp_alloctrait_key_kind), parameter :: omp_atk_pinned = 7
+        integer(kind=omp_alloctrait_key_kind), parameter :: omp_atk_partition = 8
+        ! omp_alloctrait_val_kind constants (same kind as the value component of omp_alloctrait).
+        integer(kind=omp_alloctrait_val_kind), parameter :: omp_atv_false = 0
+        integer(kind=omp_alloctrait_val_kind), parameter :: omp_atv_true = 1
+        integer(kind=omp_alloctrait_val_kind), parameter :: omp_atv_default = 2
+        integer(kind=omp_alloctrait_val_kind), parameter :: omp_atv_contended = 3
+        integer(kind=omp_alloctrait_val_kind), parameter :: omp_atv_uncontended = 4
+        integer(kind=omp_alloctrait_val_kind), parameter :: omp_atv_sequential = 5
+        integer(kind=omp_alloctrait_val_kind), parameter :: omp_atv_private = 6
+        integer(kind=omp_alloctrait_val_kind), parameter :: omp_atv_all = 7
+        integer(kind=omp_alloctrait_val_kind), parameter :: omp_atv_thread = 8
+        integer(kind=omp_alloctrait_val_kind), parameter :: omp_atv_pteam = 9
+        integer(kind=omp_alloctrait_val_kind), parameter :: omp_atv_cgroup = 10
+        integer(kind=omp_alloctrait_val_kind), parameter :: omp_atv_default_mem_fb = 11
+        integer(kind=omp_alloctrait_val_kind), parameter :: omp_atv_null_fb = 12
+        integer(kind=omp_alloctrait_val_kind), parameter :: omp_atv_abort_fb = 13
+        integer(kind=omp_alloctrait_val_kind), parameter :: omp_atv_allocator_fb = 14
+        integer(kind=omp_alloctrait_val_kind), parameter :: omp_atv_environment = 15
+        integer(kind=omp_alloctrait_val_kind), parameter :: omp_atv_nearest = 16
+        integer(kind=omp_alloctrait_val_kind), parameter :: omp_atv_blocked = 17
+        integer(kind=omp_alloctrait_val_kind), parameter :: omp_atv_interleaved = 18
+        ! omp_allocator_handle_kind constants.
+        integer(kind=omp_allocator_handle_kind), parameter :: omp_null_allocator = 0
+        integer(kind=omp_allocator_handle_kind), parameter :: omp_default_mem_alloc = 1
+        integer(kind=omp_allocator_handle_kind), parameter :: omp_large_cap_mem_alloc = 2
+        integer(kind=omp_allocator_handle_kind), parameter :: omp_const_mem_alloc = 3
+        integer(kind=omp_allocator_handle_kind), parameter :: omp_high_bw_mem_alloc = 4
+        integer(kind=omp_allocator_handle_kind), parameter :: omp_low_lat_mem_alloc = 5
+        integer(kind=omp_allocator_handle_kind), parameter :: omp_cgroup_mem_alloc = 6
+        integer(kind=omp_allocator_handle_kind), parameter :: omp_pteam_mem_alloc = 7
+        integer(kind=omp_allocator_handle_kind), parameter :: omp_thread_mem_alloc = 8
+        ! omp_memspace_handle_kind constants.
+        integer(kind=omp_memspace_handle_kind), parameter :: omp_default_mem_space = 0
+        integer(kind=omp_memspace_handle_kind), parameter :: omp_large_cap_mem_space = 1
+        integer(kind=omp_memspace_handle_kind), parameter :: omp_const_mem_space = 2
+        integer(kind=omp_memspace_handle_kind), parameter :: omp_high_bw_mem_space = 3
+        integer(kind=omp_memspace_handle_kind), parameter :: omp_low_lat_mem_space = 4
+        ! omp_pause_resource_kind constants.
+        integer(kind=omp_pause_resource_kind), parameter :: omp_pause_resume = 0
+        integer(kind=omp_pause_resource_kind), parameter :: omp_pause_soft = 1
+        integer(kind=omp_pause_resource_kind), parameter :: omp_pause_hard = 2
+
+        interface
+
+!         ***
+!         *** omp_* entry points
+!         ***
+
+          subroutine omp_set_num_threads(num_threads) bind(c)  ! C-interoperable interface
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind), value :: num_threads  ! passed by value
+          end subroutine omp_set_num_threads
+
+          subroutine omp_set_dynamic(dynamic_threads) bind(c)  ! C-interoperable interface
+            use omp_lib_kinds, only : omp_logical_kind
+            logical(kind=omp_logical_kind), value :: dynamic_threads  ! passed by value
+          end subroutine omp_set_dynamic
+
+          subroutine omp_set_nested(nested) bind(c)  ! C-interoperable interface
+            use omp_lib_kinds, only : omp_logical_kind
+            logical(kind=omp_logical_kind), value :: nested  ! passed by value
+          end subroutine omp_set_nested
+
+          function omp_get_num_threads() bind(c)  ! C-interoperable interface, no arguments
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind) :: omp_get_num_threads  ! function result
+          end function omp_get_num_threads
+
+          function omp_get_max_threads() bind(c)  ! C-interoperable interface, no arguments
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind) :: omp_get_max_threads  ! function result
+          end function omp_get_max_threads
+
+          function omp_get_thread_num() bind(c)  ! C-interoperable interface, no arguments
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind) :: omp_get_thread_num  ! function result
+          end function omp_get_thread_num
+
+          function omp_get_num_procs() bind(c)  ! C-interoperable interface, no arguments
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind) :: omp_get_num_procs  ! function result
+          end function omp_get_num_procs
+
+          function omp_in_parallel() bind(c)  ! C-interoperable interface, no arguments
+            use omp_lib_kinds, only : omp_logical_kind
+            logical(kind=omp_logical_kind) :: omp_in_parallel  ! function result
+          end function omp_in_parallel
+
+          function omp_in_final() bind(c)  ! C-interoperable interface, no arguments
+            use omp_lib_kinds, only : omp_logical_kind
+            logical(kind=omp_logical_kind) :: omp_in_final  ! function result
+          end function omp_in_final
+
+          function omp_get_dynamic() bind(c)  ! C-interoperable interface, no arguments
+            use omp_lib_kinds, only : omp_logical_kind
+            logical(kind=omp_logical_kind) :: omp_get_dynamic  ! function result
+          end function omp_get_dynamic
+
+          function omp_get_nested() bind(c)  ! C-interoperable interface, no arguments
+            use omp_lib_kinds, only : omp_logical_kind
+            logical(kind=omp_logical_kind) :: omp_get_nested  ! function result
+          end function omp_get_nested
+
+          function omp_get_thread_limit() bind(c)  ! C-interoperable interface, no arguments
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind) :: omp_get_thread_limit  ! function result
+          end function omp_get_thread_limit
+
+          subroutine omp_set_max_active_levels(max_levels) bind(c)  ! C-interoperable interface
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind), value :: max_levels  ! passed by value
+          end subroutine omp_set_max_active_levels
+
+          function omp_get_max_active_levels() bind(c)  ! C-interoperable interface, no arguments
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind) :: omp_get_max_active_levels  ! function result
+          end function omp_get_max_active_levels
+
+          function omp_get_level() bind(c)  ! C-interoperable interface, no arguments
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind) :: omp_get_level  ! function result
+          end function omp_get_level
+
+          function omp_get_active_level() bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_active_level
+          end function omp_get_active_level
+
+          function omp_get_ancestor_thread_num(level) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_ancestor_thread_num
+            integer (kind=omp_integer_kind), value :: level
+          end function omp_get_ancestor_thread_num
+
+          function omp_get_team_size(level) bind(c)
+            use omp_lib_kinds
+            integer (kind=omp_integer_kind) omp_get_team_size
+            integer (kind=omp_integer_kind), value :: level
+          end function omp_get_team_size
+
+          subroutine omp_set_schedule(kind, chunk_size) bind(c)  ! both arguments passed by VALUE
+            use omp_lib_kinds, only : omp_sched_kind, omp_integer_kind
+            integer(kind=omp_sched_kind), value :: kind
+            integer(kind=omp_integer_kind), value :: chunk_size
+          end subroutine omp_set_schedule
+
+          subroutine omp_get_schedule(kind, chunk_size) bind(c)  ! no VALUE: both passed by reference
+            use omp_lib_kinds, only : omp_sched_kind, omp_integer_kind
+            integer(kind=omp_sched_kind) :: kind
+            integer(kind=omp_integer_kind) :: chunk_size
+          end subroutine omp_get_schedule
+
+          function omp_get_proc_bind() bind(c)  ! result kind is omp_proc_bind_kind
+            use omp_lib_kinds, only : omp_proc_bind_kind
+            integer(kind=omp_proc_bind_kind) :: omp_get_proc_bind
+          end function omp_get_proc_bind
+
+          function omp_get_num_places() bind(c)
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind) :: omp_get_num_places
+          end function omp_get_num_places
+
+          function omp_get_place_num_procs(place_num) bind(c)  ! place_num is passed by VALUE
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind), value :: place_num
+            integer(kind=omp_integer_kind) :: omp_get_place_num_procs
+          end function omp_get_place_num_procs
+
+          subroutine omp_get_place_proc_ids(place_num, ids) bind(c)
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind), value :: place_num
+            integer(kind=omp_integer_kind) :: ids(*)  ! assumed-size array; caller provides the storage
+          end subroutine omp_get_place_proc_ids
+
+          function omp_get_place_num() bind(c)
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind) :: omp_get_place_num
+          end function omp_get_place_num
+
+          function omp_get_partition_num_places() bind(c)
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind) :: omp_get_partition_num_places
+          end function omp_get_partition_num_places
+
+          subroutine omp_get_partition_place_nums(place_nums) bind(c)
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind) :: place_nums(*)  ! assumed-size array; caller provides the storage
+          end subroutine omp_get_partition_place_nums
+
+          function omp_get_wtime() bind(c)  ! REAL result of kind kmp_double_kind
+            use omp_lib_kinds, only : kmp_double_kind
+            real(kind=kmp_double_kind) :: omp_get_wtime
+          end function omp_get_wtime
+
+          function omp_get_wtick() bind(c)  ! REAL result of kind kmp_double_kind
+            use omp_lib_kinds, only : kmp_double_kind
+            real(kind=kmp_double_kind) :: omp_get_wtick
+          end function omp_get_wtick
+
+          function omp_get_default_device() bind(c)
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind) :: omp_get_default_device
+          end function omp_get_default_device
+
+          subroutine omp_set_default_device(device_num) bind(c)  ! device_num is passed by VALUE
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind), value :: device_num
+          end subroutine omp_set_default_device
+
+          function omp_get_num_devices() bind(c)
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind) :: omp_get_num_devices
+          end function omp_get_num_devices
+
+          function omp_get_num_teams() bind(c)
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind) :: omp_get_num_teams
+          end function omp_get_num_teams
+
+          function omp_get_team_num() bind(c)
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind) :: omp_get_team_num
+          end function omp_get_team_num
+
+          function omp_get_cancellation() bind(c)  ! NOTE(review): INTEGER here; OpenMP spec lists LOGICAL - confirm
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind) :: omp_get_cancellation
+          end function omp_get_cancellation
+
+          function omp_is_initial_device() bind(c)  ! LOGICAL result
+            use omp_lib_kinds, only : omp_logical_kind
+            logical(kind=omp_logical_kind) :: omp_is_initial_device
+          end function omp_is_initial_device
+
+          function omp_get_initial_device() bind(c)
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind) :: omp_get_initial_device
+          end function omp_get_initial_device
+
+          function omp_get_device_num() bind(c)
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind) :: omp_get_device_num
+          end function omp_get_device_num
+
+          function omp_pause_resource(kind, device_num) bind(c)  ! both arguments passed by VALUE
+            use omp_lib_kinds, only : omp_pause_resource_kind, omp_integer_kind
+            integer(kind=omp_pause_resource_kind), value :: kind
+            integer(kind=omp_integer_kind), value :: device_num
+            integer(kind=omp_integer_kind) :: omp_pause_resource
+          end function omp_pause_resource
+
+          function omp_pause_resource_all(kind) bind(c)  ! kind is passed by VALUE
+            use omp_lib_kinds, only : omp_pause_resource_kind, omp_integer_kind
+            integer(kind=omp_pause_resource_kind), value :: kind
+            integer(kind=omp_integer_kind) :: omp_pause_resource_all
+          end function omp_pause_resource_all
+
+          function omp_get_supported_active_levels() bind(c)
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind) :: omp_get_supported_active_levels
+          end function omp_get_supported_active_levels
+
+          subroutine omp_fulfill_event(event) bind(c)  ! event handle is passed by VALUE
+            use omp_lib_kinds, only : omp_event_handle_kind
+            integer(kind=omp_event_handle_kind), value :: event
+          end subroutine omp_fulfill_event
+
+          subroutine omp_init_lock(svar) bind(c)  ! svar has no VALUE attribute: passed by reference
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_init_lock
+!DIR$ ENDIF
+            use omp_lib_kinds, only : omp_lock_kind
+            integer(kind=omp_lock_kind) :: svar
+          end subroutine omp_init_lock
+
+          subroutine omp_destroy_lock(svar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_destroy_lock
+!DIR$ ENDIF
+            use omp_lib_kinds, only : omp_lock_kind
+            integer(kind=omp_lock_kind) :: svar
+          end subroutine omp_destroy_lock
+
+          subroutine omp_set_lock(svar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_set_lock
+!DIR$ ENDIF
+            use omp_lib_kinds, only : omp_lock_kind
+            integer(kind=omp_lock_kind) :: svar
+          end subroutine omp_set_lock
+
+          subroutine omp_unset_lock(svar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_unset_lock
+!DIR$ ENDIF
+            use omp_lib_kinds, only : omp_lock_kind
+            integer(kind=omp_lock_kind) :: svar
+          end subroutine omp_unset_lock
+
+          function omp_test_lock(svar) bind(c)  ! LOGICAL result; svar passed by reference
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_test_lock
+!DIR$ ENDIF
+            use omp_lib_kinds, only : omp_logical_kind, omp_lock_kind
+            logical(kind=omp_logical_kind) :: omp_test_lock
+            integer(kind=omp_lock_kind) :: svar
+          end function omp_test_lock
+
+          subroutine omp_init_nest_lock(nvar) bind(c)  ! nvar has no VALUE attribute: passed by reference
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_init_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds, only : omp_nest_lock_kind
+            integer(kind=omp_nest_lock_kind) :: nvar
+          end subroutine omp_init_nest_lock
+
+          subroutine omp_destroy_nest_lock(nvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_destroy_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds, only : omp_nest_lock_kind
+            integer(kind=omp_nest_lock_kind) :: nvar
+          end subroutine omp_destroy_nest_lock
+
+          subroutine omp_set_nest_lock(nvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_set_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds, only : omp_nest_lock_kind
+            integer(kind=omp_nest_lock_kind) :: nvar
+          end subroutine omp_set_nest_lock
+
+          subroutine omp_unset_nest_lock(nvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_unset_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds, only : omp_nest_lock_kind
+            integer(kind=omp_nest_lock_kind) :: nvar
+          end subroutine omp_unset_nest_lock
+
+          function omp_test_nest_lock(nvar) bind(c)  ! INTEGER result, unlike the LOGICAL omp_test_lock
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_test_nest_lock
+!DIR$ ENDIF
+            use omp_lib_kinds, only : omp_integer_kind, omp_nest_lock_kind
+            integer(kind=omp_integer_kind) :: omp_test_nest_lock
+            integer(kind=omp_nest_lock_kind) :: nvar
+          end function omp_test_nest_lock
+
+          function omp_get_max_task_priority() bind(c)
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind) :: omp_get_max_task_priority
+          end function omp_get_max_task_priority
+
+          function omp_init_allocator(memspace, ntraits, traits)  ! no bind(c); no VALUE: all arguments by reference
+            use omp_lib_kinds, only : omp_allocator_handle_kind, omp_memspace_handle_kind, omp_integer_kind, omp_alloctrait
+            integer(kind=omp_allocator_handle_kind) :: omp_init_allocator
+            integer(kind=omp_memspace_handle_kind) :: memspace
+            integer(kind=omp_integer_kind) :: ntraits
+            type(omp_alloctrait), intent(in) :: traits(*)
+          end function omp_init_allocator
+
+          subroutine omp_destroy_allocator(allocator) bind(c)  ! handle is passed by VALUE
+            use omp_lib_kinds, only : omp_allocator_handle_kind
+            integer(kind=omp_allocator_handle_kind), value :: allocator
+          end subroutine omp_destroy_allocator
+
+          subroutine omp_set_default_allocator(allocator) bind(c)  ! handle is passed by VALUE
+            use omp_lib_kinds, only : omp_allocator_handle_kind
+            integer(kind=omp_allocator_handle_kind), value :: allocator
+          end subroutine omp_set_default_allocator
+
+          function omp_get_default_allocator() bind(c)
+            use omp_lib_kinds, only : omp_allocator_handle_kind
+            integer(kind=omp_allocator_handle_kind) :: omp_get_default_allocator
+          end function omp_get_default_allocator
+
+          subroutine omp_set_affinity_format(format)  ! no bind(c): takes an assumed-length character
+            character(len=*) :: format
+          end subroutine omp_set_affinity_format
+
+          function omp_get_affinity_format(buffer)  ! result kind is kmp_size_t_kind
+            use omp_lib_kinds, only : kmp_size_t_kind
+            character(len=*) :: buffer
+            integer(kind=kmp_size_t_kind) :: omp_get_affinity_format
+          end function omp_get_affinity_format
+
+          subroutine omp_display_affinity(format)  ! no bind(c): takes an assumed-length character
+            character(len=*) :: format
+          end subroutine omp_display_affinity
+
+          function omp_capture_affinity(buffer, format)  ! result kind is kmp_size_t_kind
+            use omp_lib_kinds, only : kmp_size_t_kind
+            character(len=*) :: format
+            character(len=*) :: buffer
+            integer(kind=kmp_size_t_kind) :: omp_capture_affinity
+          end function omp_capture_affinity
+
+!         ***
+!         *** kmp_* entry points
+!         ***
+
+          subroutine kmp_set_stacksize(size) bind(c)  ! size is an omp_integer_kind integer, by VALUE
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind), value :: size
+          end subroutine kmp_set_stacksize
+
+          subroutine kmp_set_stacksize_s(size) bind(c)  ! same, but size has kind kmp_size_t_kind
+            use omp_lib_kinds, only : kmp_size_t_kind
+            integer(kind=kmp_size_t_kind), value :: size
+          end subroutine kmp_set_stacksize_s
+
+          subroutine kmp_set_blocktime(msec) bind(c)
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind), value :: msec
+          end subroutine kmp_set_blocktime
+
+          subroutine kmp_set_library_serial() bind(c)  ! no arguments
+          end subroutine kmp_set_library_serial
+
+          subroutine kmp_set_library_turnaround() bind(c)  ! no arguments
+          end subroutine kmp_set_library_turnaround
+
+          subroutine kmp_set_library_throughput() bind(c)  ! no arguments
+          end subroutine kmp_set_library_throughput
+
+          subroutine kmp_set_library(libnum) bind(c)
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind), value :: libnum
+          end subroutine kmp_set_library
+
+          subroutine kmp_set_defaults(string) bind(c)  ! string: assumed-size array of C characters
+            use, intrinsic :: iso_c_binding, only : c_char
+            character(kind=c_char) :: string(*)
+          end subroutine kmp_set_defaults
+
+          function kmp_get_stacksize() bind(c)
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind) :: kmp_get_stacksize
+          end function kmp_get_stacksize
+
+          function kmp_get_stacksize_s() bind(c)  ! result kind is kmp_size_t_kind
+            use omp_lib_kinds, only : kmp_size_t_kind
+            integer(kind=kmp_size_t_kind) :: kmp_get_stacksize_s
+          end function kmp_get_stacksize_s
+
+          function kmp_get_blocktime() bind(c)
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind) :: kmp_get_blocktime
+          end function kmp_get_blocktime
+
+          function kmp_get_library() bind(c)
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind) :: kmp_get_library
+          end function kmp_get_library
+
+          subroutine kmp_set_disp_num_buffers(num) bind(c)
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind), value :: num
+          end subroutine kmp_set_disp_num_buffers
+
+          function kmp_set_affinity(mask) bind(c)  ! mask has no VALUE attribute: passed by reference
+            use omp_lib_kinds, only : omp_integer_kind, kmp_affinity_mask_kind
+            integer(kind=omp_integer_kind) :: kmp_set_affinity
+            integer(kind=kmp_affinity_mask_kind) :: mask
+          end function kmp_set_affinity
+
+          function kmp_get_affinity(mask) bind(c)
+            use omp_lib_kinds, only : omp_integer_kind, kmp_affinity_mask_kind
+            integer(kind=omp_integer_kind) :: kmp_get_affinity
+            integer(kind=kmp_affinity_mask_kind) :: mask
+          end function kmp_get_affinity
+
+          function kmp_get_affinity_max_proc() bind(c)
+            use omp_lib_kinds, only : omp_integer_kind
+            integer(kind=omp_integer_kind) :: kmp_get_affinity_max_proc
+          end function kmp_get_affinity_max_proc
+
+          subroutine kmp_create_affinity_mask(mask) bind(c)
+            use omp_lib_kinds, only : kmp_affinity_mask_kind
+            integer(kind=kmp_affinity_mask_kind) :: mask
+          end subroutine kmp_create_affinity_mask
+
+          subroutine kmp_destroy_affinity_mask(mask) bind(c)
+            use omp_lib_kinds, only : kmp_affinity_mask_kind
+            integer(kind=kmp_affinity_mask_kind) :: mask
+          end subroutine kmp_destroy_affinity_mask
+
+          function kmp_set_affinity_mask_proc(proc, mask) bind(c)  ! proc by VALUE, mask by reference
+            use omp_lib_kinds, only : omp_integer_kind, kmp_affinity_mask_kind
+            integer(kind=omp_integer_kind) :: kmp_set_affinity_mask_proc
+            integer(kind=omp_integer_kind), value :: proc
+            integer(kind=kmp_affinity_mask_kind) :: mask
+          end function kmp_set_affinity_mask_proc
+
+          function kmp_unset_affinity_mask_proc(proc, mask) bind(c)  ! proc by VALUE, mask by reference
+            use omp_lib_kinds, only : omp_integer_kind, kmp_affinity_mask_kind
+            integer(kind=omp_integer_kind) :: kmp_unset_affinity_mask_proc
+            integer(kind=omp_integer_kind), value :: proc
+            integer(kind=kmp_affinity_mask_kind) :: mask
+          end function kmp_unset_affinity_mask_proc
+
+          function kmp_get_affinity_mask_proc(proc, mask) bind(c)  ! proc by VALUE, mask by reference
+            use omp_lib_kinds, only : omp_integer_kind, kmp_affinity_mask_kind
+            integer(kind=omp_integer_kind) :: kmp_get_affinity_mask_proc
+            integer(kind=omp_integer_kind), value :: proc
+            integer(kind=kmp_affinity_mask_kind) :: mask
+          end function kmp_get_affinity_mask_proc
+
+          function kmp_malloc(size) bind(c)  ! result is an integer of kind kmp_pointer_kind
+            use omp_lib_kinds, only : kmp_pointer_kind, kmp_size_t_kind
+            integer(kind=kmp_pointer_kind) :: kmp_malloc
+            integer(kind=kmp_size_t_kind), value :: size
+          end function kmp_malloc
+
+          function kmp_aligned_malloc(size, alignment) bind(c)
+            use omp_lib_kinds, only : kmp_pointer_kind, kmp_size_t_kind
+            integer(kind=kmp_pointer_kind) :: kmp_aligned_malloc
+            integer(kind=kmp_size_t_kind), value :: size
+            integer(kind=kmp_size_t_kind), value :: alignment
+          end function kmp_aligned_malloc
+
+          function kmp_calloc(nelem, elsize) bind(c)
+            use omp_lib_kinds, only : kmp_pointer_kind, kmp_size_t_kind
+            integer(kind=kmp_pointer_kind) :: kmp_calloc
+            integer(kind=kmp_size_t_kind), value :: nelem
+            integer(kind=kmp_size_t_kind), value :: elsize
+          end function kmp_calloc
+
+          function kmp_realloc(ptr, size) bind(c)  ! ptr is an integer of kind kmp_pointer_kind, by VALUE
+            use omp_lib_kinds, only : kmp_pointer_kind, kmp_size_t_kind
+            integer(kind=kmp_pointer_kind) :: kmp_realloc
+            integer(kind=kmp_pointer_kind), value :: ptr
+            integer(kind=kmp_size_t_kind), value :: size
+          end function kmp_realloc
+
+          subroutine kmp_free(ptr) bind(c)
+            use omp_lib_kinds, only : kmp_pointer_kind
+            integer(kind=kmp_pointer_kind), value :: ptr
+          end subroutine kmp_free
+
+          subroutine kmp_set_warnings_on() bind(c)  ! no arguments
+          end subroutine kmp_set_warnings_on
+
+          subroutine kmp_set_warnings_off() bind(c)  ! no arguments
+          end subroutine kmp_set_warnings_off
+
+          function kmp_get_cancellation_status(cancelkind) bind(c)  ! LOGICAL result; cancelkind by VALUE
+            use omp_lib_kinds, only : kmp_cancel_kind, omp_logical_kind
+            integer(kind=kmp_cancel_kind), value :: cancelkind
+            logical(kind=omp_logical_kind) :: kmp_get_cancellation_status
+          end function kmp_get_cancellation_status
+
+          subroutine omp_init_lock_with_hint(svar, hint) bind(c)  ! svar by reference, hint by VALUE
+            use omp_lib_kinds, only : omp_lock_kind, omp_lock_hint_kind
+            integer(kind=omp_lock_kind) :: svar
+            integer(kind=omp_lock_hint_kind), value :: hint
+          end subroutine omp_init_lock_with_hint
+
+          subroutine omp_init_nest_lock_with_hint(nvar, hint) bind(c)  ! nvar by reference, hint by VALUE
+            use omp_lib_kinds, only : omp_nest_lock_kind, omp_lock_hint_kind
+            integer(kind=omp_nest_lock_kind) :: nvar
+            integer(kind=omp_lock_hint_kind), value :: hint
+          end subroutine omp_init_nest_lock_with_hint
+
+          function omp_control_tool(command, modifier, arg) bind(c)  ! arg is OPTIONAL and passed by reference
+            use omp_lib_kinds, only : omp_integer_kind, omp_control_tool_kind, kmp_pointer_kind
+            integer(kind=omp_integer_kind) :: omp_control_tool
+            integer(kind=omp_control_tool_kind), value :: command
+            integer(kind=omp_control_tool_kind), value :: modifier
+            integer(kind=kmp_pointer_kind), optional :: arg
+          end function omp_control_tool
+
+        end interface
+
+      end module omp_lib
diff --git a/final/runtime/src/include/omp_lib.h.var b/final/runtime/src/include/omp_lib.h.var
new file mode 100644
index 0000000..8775128
--- /dev/null
+++ b/final/runtime/src/include/omp_lib.h.var
@@ -0,0 +1,920 @@
+! include/omp_lib.h.var
+
+!
+!//===----------------------------------------------------------------------===//
+!//
+!// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+!// See https://llvm.org/LICENSE.txt for license information.
+!// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+!//
+!//===----------------------------------------------------------------------===//
+!
+
+      integer omp_integer_kind  ! each constant is a type statement followed by a PARAMETER statement
+      parameter(omp_integer_kind=4)
+      integer omp_logical_kind
+      parameter(omp_logical_kind=4)
+      integer omp_real_kind
+      parameter(omp_real_kind=4)
+      integer omp_lock_kind  ! int_ptr_kind() is a non-standard intrinsic; presumably the pointer-sized kind
+      parameter(omp_lock_kind=int_ptr_kind())
+      integer omp_nest_lock_kind
+      parameter(omp_nest_lock_kind=int_ptr_kind())
+      integer omp_sched_kind  ! same kind as omp_integer_kind
+      parameter(omp_sched_kind=omp_integer_kind)
+      integer omp_proc_bind_kind  ! same kind as omp_integer_kind
+      parameter(omp_proc_bind_kind=omp_integer_kind)
+      integer kmp_pointer_kind
+      parameter(kmp_pointer_kind=int_ptr_kind())
+      integer kmp_size_t_kind
+      parameter(kmp_size_t_kind=int_ptr_kind())
+      integer kmp_affinity_mask_kind
+      parameter(kmp_affinity_mask_kind=int_ptr_kind())
+      integer omp_sync_hint_kind
+      parameter(omp_sync_hint_kind=omp_integer_kind)
+      integer omp_lock_hint_kind  ! alias of omp_sync_hint_kind
+      parameter(omp_lock_hint_kind=omp_sync_hint_kind)
+      integer omp_control_tool_kind
+      parameter(omp_control_tool_kind=omp_integer_kind)
+      integer omp_control_tool_result_kind
+      parameter(omp_control_tool_result_kind=omp_integer_kind)
+      integer omp_allocator_handle_kind  ! handle kinds below all use int_ptr_kind()
+      parameter(omp_allocator_handle_kind=int_ptr_kind())
+      integer omp_memspace_handle_kind
+      parameter(omp_memspace_handle_kind=int_ptr_kind())
+      integer omp_alloctrait_key_kind  ! trait keys use the integer kind, trait values use int_ptr_kind()
+      parameter(omp_alloctrait_key_kind=omp_integer_kind)
+      integer omp_alloctrait_val_kind
+      parameter(omp_alloctrait_val_kind=int_ptr_kind())
+      integer omp_pause_resource_kind
+      parameter(omp_pause_resource_kind=omp_integer_kind)
+      integer omp_depend_kind
+      parameter(omp_depend_kind=int_ptr_kind())
+      integer omp_event_handle_kind
+      parameter(omp_event_handle_kind=int_ptr_kind())
+
+      integer(kind=omp_integer_kind)openmp_version  ! @NAME@ tokens are template placeholders
+      parameter(openmp_version=@LIBOMP_OMP_YEAR_MONTH@)
+      integer(kind=omp_integer_kind)kmp_version_major  ! presumably filled in at configure time - confirm
+      parameter(kmp_version_major=@LIBOMP_VERSION_MAJOR@)
+      integer(kind=omp_integer_kind)kmp_version_minor
+      parameter(kmp_version_minor=@LIBOMP_VERSION_MINOR@)
+      integer(kind=omp_integer_kind)kmp_version_build
+      parameter(kmp_version_build=@LIBOMP_VERSION_BUILD@)
+      character(*)kmp_build_date  ! length is taken from the substituted string constant
+      parameter(kmp_build_date='@LIBOMP_BUILD_DATE@')
+
+      integer(kind=omp_sched_kind)omp_sched_static  ! base schedule kinds are numbered 1..4
+      parameter(omp_sched_static=1)
+      integer(kind=omp_sched_kind)omp_sched_dynamic
+      parameter(omp_sched_dynamic=2)
+      integer(kind=omp_sched_kind)omp_sched_guided
+      parameter(omp_sched_guided=3)
+      integer(kind=omp_sched_kind)omp_sched_auto
+      parameter(omp_sched_auto=4)
+      integer(kind=omp_sched_kind)omp_sched_monotonic  ! only bit 31 set; BOZ must go through INT()
+      parameter(omp_sched_monotonic=int(Z'80000000', omp_sched_kind))
+
+      integer(kind=omp_proc_bind_kind)omp_proc_bind_false  ! proc_bind values are numbered 0..4
+      parameter(omp_proc_bind_false=0)
+      integer(kind=omp_proc_bind_kind)omp_proc_bind_true
+      parameter(omp_proc_bind_true=1)
+      integer(kind=omp_proc_bind_kind)omp_proc_bind_master
+      parameter(omp_proc_bind_master=2)
+      integer(kind=omp_proc_bind_kind)omp_proc_bind_close
+      parameter(omp_proc_bind_close=3)
+      integer(kind=omp_proc_bind_kind)omp_proc_bind_spread
+      parameter(omp_proc_bind_spread=4)
+
+      integer(kind=omp_sync_hint_kind)omp_sync_hint_none  ! sync hints: 0, then 1, 2, 4, 8
+      parameter(omp_sync_hint_none=0)
+      integer(kind=omp_sync_hint_kind)omp_sync_hint_uncontended
+      parameter(omp_sync_hint_uncontended=1)
+      integer(kind=omp_sync_hint_kind)omp_sync_hint_contended
+      parameter(omp_sync_hint_contended=2)
+      integer(kind=omp_sync_hint_kind)omp_sync_hint_nonspeculative
+      parameter(omp_sync_hint_nonspeculative=4)
+      integer(kind=omp_sync_hint_kind)omp_sync_hint_speculative
+      parameter(omp_sync_hint_speculative=8)
+      integer(kind=omp_lock_hint_kind)omp_lock_hint_none  ! omp_lock_hint_* repeat the omp_sync_hint_* values
+      parameter(omp_lock_hint_none=omp_sync_hint_none)
+      integer(kind=omp_lock_hint_kind)omp_lock_hint_uncontended
+      parameter(omp_lock_hint_uncontended=omp_sync_hint_uncontended)
+      integer(kind=omp_lock_hint_kind)omp_lock_hint_contended
+      parameter(omp_lock_hint_contended=omp_sync_hint_contended)
+      integer(kind=omp_lock_hint_kind)omp_lock_hint_nonspeculative
+      parameter(omp_lock_hint_nonspeculative=4)  ! literal: the named form would be 74 columns (limit 72)
+      integer(kind=omp_lock_hint_kind)omp_lock_hint_speculative
+      parameter(omp_lock_hint_speculative=omp_sync_hint_speculative)
+      integer(kind=omp_lock_hint_kind)kmp_lock_hint_hle  ! kmp_* hints: 2**16, 2**17, 2**18
+      parameter(kmp_lock_hint_hle=65536)
+      integer(kind=omp_lock_hint_kind)kmp_lock_hint_rtm
+      parameter(kmp_lock_hint_rtm=131072)
+      integer(kind=omp_lock_hint_kind)kmp_lock_hint_adaptive
+      parameter(kmp_lock_hint_adaptive=262144)
+
+      integer(kind=omp_control_tool_kind)omp_control_tool_start  ! commands are numbered 1..4
+      parameter(omp_control_tool_start=1)
+      integer(kind=omp_control_tool_kind)omp_control_tool_pause
+      parameter(omp_control_tool_pause=2)
+      integer(kind=omp_control_tool_kind)omp_control_tool_flush
+      parameter(omp_control_tool_flush=3)
+      integer(kind=omp_control_tool_kind)omp_control_tool_end
+      parameter(omp_control_tool_end=4)
+
+      integer(omp_control_tool_result_kind)omp_control_tool_notool
+      parameter(omp_control_tool_notool=-2)  ! result codes run from -2 to 1
+      integer(omp_control_tool_result_kind)omp_control_tool_nocallback
+      parameter(omp_control_tool_nocallback=-1)  ! positional kind above; "kind=" would pass column 72
+      integer(omp_control_tool_result_kind)omp_control_tool_success
+      parameter(omp_control_tool_success=0)
+      integer(omp_control_tool_result_kind)omp_control_tool_ignored
+      parameter(omp_control_tool_ignored=1)
+
+      integer(kind=omp_alloctrait_key_kind)omp_atk_threadmodel  ! trait keys are numbered 1..8
+      parameter(omp_atk_threadmodel=1)
+      integer(kind=omp_alloctrait_key_kind)omp_atk_alignment
+      parameter(omp_atk_alignment=2)
+      integer(kind=omp_alloctrait_key_kind)omp_atk_access
+      parameter(omp_atk_access=3)
+      integer(kind=omp_alloctrait_key_kind)omp_atk_pool_size
+      parameter(omp_atk_pool_size=4)
+      integer(kind=omp_alloctrait_key_kind)omp_atk_fallback
+      parameter(omp_atk_fallback=5)
+      integer(kind=omp_alloctrait_key_kind)omp_atk_fb_data
+      parameter(omp_atk_fb_data=6)
+      integer(kind=omp_alloctrait_key_kind)omp_atk_pinned
+      parameter(omp_atk_pinned=7)
+      integer(kind=omp_alloctrait_key_kind)omp_atk_partition
+      parameter(omp_atk_partition=8)
+
+      ! Reserved for future use
+      integer(kind=omp_alloctrait_val_kind)omp_atv_false
+      parameter(omp_atv_false=0)  ! trait values are numbered 0..18
+      ! Reserved for future use
+      integer(kind=omp_alloctrait_val_kind)omp_atv_true
+      parameter(omp_atv_true=1)
+      integer(kind=omp_alloctrait_val_kind)omp_atv_default
+      parameter(omp_atv_default=2)
+      integer(kind=omp_alloctrait_val_kind)omp_atv_contended
+      parameter(omp_atv_contended=3)
+      integer(kind=omp_alloctrait_val_kind)omp_atv_uncontended
+      parameter(omp_atv_uncontended=4)
+      integer(kind=omp_alloctrait_val_kind)omp_atv_sequential
+      parameter(omp_atv_sequential=5)
+      integer(kind=omp_alloctrait_val_kind)omp_atv_private
+      parameter(omp_atv_private=6)
+      integer(kind=omp_alloctrait_val_kind)omp_atv_all
+      parameter(omp_atv_all=7)
+      integer(kind=omp_alloctrait_val_kind)omp_atv_thread
+      parameter(omp_atv_thread=8)
+      integer(kind=omp_alloctrait_val_kind)omp_atv_pteam
+      parameter(omp_atv_pteam=9)
+      integer(kind=omp_alloctrait_val_kind)omp_atv_cgroup
+      parameter(omp_atv_cgroup=10)
+      integer(kind=omp_alloctrait_val_kind)omp_atv_default_mem_fb
+      parameter(omp_atv_default_mem_fb=11)
+      integer(kind=omp_alloctrait_val_kind)omp_atv_null_fb
+      parameter(omp_atv_null_fb=12)
+      integer(kind=omp_alloctrait_val_kind)omp_atv_abort_fb
+      parameter(omp_atv_abort_fb=13)
+      integer(kind=omp_alloctrait_val_kind)omp_atv_allocator_fb
+      parameter(omp_atv_allocator_fb=14)
+      integer(kind=omp_alloctrait_val_kind)omp_atv_environment
+      parameter(omp_atv_environment=15)
+      integer(kind=omp_alloctrait_val_kind)omp_atv_nearest
+      parameter(omp_atv_nearest=16)
+      integer(kind=omp_alloctrait_val_kind)omp_atv_blocked
+      parameter(omp_atv_blocked=17)
+      integer(kind=omp_alloctrait_val_kind)omp_atv_interleaved
+      parameter(omp_atv_interleaved=18)
+
+      type omp_alloctrait  ! one key/value pair
+        integer (kind=omp_alloctrait_key_kind) key
+        integer (kind=omp_alloctrait_val_kind) value
+      end type omp_alloctrait
+
+      integer(kind=omp_allocator_handle_kind)omp_null_allocator  ! allocator handles: 0 (null), then 1..8
+      parameter(omp_null_allocator=0)
+      integer(kind=omp_allocator_handle_kind)omp_default_mem_alloc
+      parameter(omp_default_mem_alloc=1)
+      integer(kind=omp_allocator_handle_kind)omp_large_cap_mem_alloc
+      parameter(omp_large_cap_mem_alloc=2)
+      integer(kind=omp_allocator_handle_kind)omp_const_mem_alloc
+      parameter(omp_const_mem_alloc=3)
+      integer(kind=omp_allocator_handle_kind)omp_high_bw_mem_alloc
+      parameter(omp_high_bw_mem_alloc=4)
+      integer(kind=omp_allocator_handle_kind)omp_low_lat_mem_alloc
+      parameter(omp_low_lat_mem_alloc=5)
+      integer(kind=omp_allocator_handle_kind)omp_cgroup_mem_alloc
+      parameter(omp_cgroup_mem_alloc=6)
+      integer(kind=omp_allocator_handle_kind)omp_pteam_mem_alloc
+      parameter(omp_pteam_mem_alloc=7)
+      integer(kind=omp_allocator_handle_kind)omp_thread_mem_alloc
+      parameter(omp_thread_mem_alloc=8)
+
+      integer(kind=omp_memspace_handle_kind)omp_default_mem_space
+      parameter(omp_default_mem_space=0)  ! memory-space handles are numbered 0..4
+      integer(kind=omp_memspace_handle_kind)omp_large_cap_mem_space
+      parameter(omp_large_cap_mem_space=1)
+      integer(kind=omp_memspace_handle_kind)omp_const_mem_space
+      parameter(omp_const_mem_space=2)
+      integer(kind=omp_memspace_handle_kind)omp_high_bw_mem_space
+      parameter(omp_high_bw_mem_space=3)
+      integer(kind=omp_memspace_handle_kind)omp_low_lat_mem_space
+      parameter(omp_low_lat_mem_space=4)
+
+      integer(kind=omp_pause_resource_kind)omp_pause_resume
+      parameter(omp_pause_resume=0)  ! pause kinds are numbered 0..2
+      integer(kind=omp_pause_resource_kind)omp_pause_soft
+      parameter(omp_pause_soft=1)
+      integer(kind=omp_pause_resource_kind)omp_pause_hard
+      parameter(omp_pause_hard=2)
+
+      interface
+
+!       ***
+!       *** omp_* entry points
+!       ***
+
+        subroutine omp_set_num_threads(num_threads) bind(c)
+          import :: omp_integer_kind
+          integer(kind=omp_integer_kind), value :: num_threads
+        end subroutine omp_set_num_threads
+
+        subroutine omp_set_dynamic(dynamic_threads) bind(c)
+          import :: omp_logical_kind
+          logical(kind=omp_logical_kind), value :: dynamic_threads
+        end subroutine omp_set_dynamic
+
+        subroutine omp_set_nested(nested) bind(c)
+          import :: omp_logical_kind
+          logical(kind=omp_logical_kind), value :: nested
+        end subroutine omp_set_nested
+
+        function omp_get_num_threads() bind(c)
+          import :: omp_integer_kind
+          integer(kind=omp_integer_kind) :: omp_get_num_threads
+        end function omp_get_num_threads
+
+        function omp_get_max_threads() bind(c)
+          import :: omp_integer_kind
+          integer(kind=omp_integer_kind) :: omp_get_max_threads
+        end function omp_get_max_threads
+
+        function omp_get_thread_num() bind(c)
+          import :: omp_integer_kind
+          integer(kind=omp_integer_kind) :: omp_get_thread_num
+        end function omp_get_thread_num
+
+        function omp_get_num_procs() bind(c)
+          import :: omp_integer_kind
+          integer(kind=omp_integer_kind) :: omp_get_num_procs
+        end function omp_get_num_procs
+
+        function omp_in_parallel() bind(c)
+          import :: omp_logical_kind
+          logical(kind=omp_logical_kind) :: omp_in_parallel
+        end function omp_in_parallel
+
+        function omp_in_final() bind(c)
+          import :: omp_logical_kind
+          logical(kind=omp_logical_kind) :: omp_in_final
+        end function omp_in_final
+
+        function omp_get_dynamic() bind(c)
+          import :: omp_logical_kind
+          logical(kind=omp_logical_kind) :: omp_get_dynamic
+        end function omp_get_dynamic
+
+        function omp_get_nested() bind(c)
+          import :: omp_logical_kind
+          logical(kind=omp_logical_kind) :: omp_get_nested
+        end function omp_get_nested
+
+        function omp_get_thread_limit() bind(c)
+          import :: omp_integer_kind
+          integer(kind=omp_integer_kind) :: omp_get_thread_limit
+        end function omp_get_thread_limit
+
+        subroutine omp_set_max_active_levels(max_levels) bind(c)
+          import
+          integer (kind=omp_integer_kind), value :: max_levels
+        end subroutine omp_set_max_active_levels
+
+        function omp_get_max_active_levels() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_max_active_levels
+        end function omp_get_max_active_levels
+
+        function omp_get_level() bind(c)
+          import
+          integer (kind=omp_integer_kind) omp_get_level
+        end function omp_get_level
+
+        function omp_get_active_level() bind(c)
+          import ! no arguments; returns omp_integer_kind integer
+          integer (kind=omp_integer_kind) omp_get_active_level
+        end function omp_get_active_level
+
+        function omp_get_ancestor_thread_num(level) bind(c)
+          import ! level by value; returns omp_integer_kind integer
+          integer (kind=omp_integer_kind) omp_get_ancestor_thread_num
+          integer (kind=omp_integer_kind), value :: level
+        end function omp_get_ancestor_thread_num
+
+        function omp_get_team_size(level) bind(c)
+          import ! level by value; returns omp_integer_kind integer
+          integer (kind=omp_integer_kind) omp_get_team_size
+          integer (kind=omp_integer_kind), value :: level
+        end function omp_get_team_size
+
+        subroutine omp_set_schedule(kind, chunk_size) bind(c)
+          import ! both arguments are passed by value
+          integer (kind=omp_sched_kind), value :: kind
+          integer (kind=omp_integer_kind), value :: chunk_size
+        end subroutine omp_set_schedule
+
+        subroutine omp_get_schedule(kind, chunk_size) bind(c)
+          import ! no value attribute: both passed by reference
+          integer (kind=omp_sched_kind) kind
+          integer (kind=omp_integer_kind) chunk_size
+        end subroutine omp_get_schedule
+
+        function omp_get_proc_bind() bind(c)
+          import ! no arguments; returns omp_proc_bind_kind integer
+          integer (kind=omp_proc_bind_kind) omp_get_proc_bind
+        end function omp_get_proc_bind
+
+        function omp_get_num_places() bind(c)
+          import ! no arguments; returns omp_integer_kind integer
+          integer (kind=omp_integer_kind) omp_get_num_places
+        end function omp_get_num_places
+
+        function omp_get_place_num_procs(place_num) bind(c)
+          import ! place_num by value; omp_integer_kind result
+          integer (kind=omp_integer_kind), value :: place_num
+          integer (kind=omp_integer_kind) omp_get_place_num_procs
+        end function omp_get_place_num_procs
+
+        subroutine omp_get_place_proc_ids(place_num, ids) bind(c)
+          import ! place_num by value; ids is assumed-size array
+          integer (kind=omp_integer_kind), value :: place_num
+          integer (kind=omp_integer_kind) ids(*)
+        end subroutine omp_get_place_proc_ids
+
+        function omp_get_place_num() bind(c)
+          import ! no arguments; returns omp_integer_kind integer
+          integer (kind=omp_integer_kind) omp_get_place_num
+        end function omp_get_place_num
+
+        function omp_get_partition_num_places() bind(c)
+          import ! no arguments; returns omp_integer_kind integer
+          integer (kind=omp_integer_kind) omp_get_partition_num_places
+        end function omp_get_partition_num_places
+
+        subroutine omp_get_partition_place_nums(place_nums) bind(c)
+          import ! place_nums is an assumed-size integer array
+          integer (kind=omp_integer_kind) place_nums(*)
+        end subroutine omp_get_partition_place_nums
+
+        function omp_get_wtime() bind(c)
+          double precision omp_get_wtime ! result; no kinds needed
+        end function omp_get_wtime
+
+        function omp_get_wtick() bind(c)
+          double precision omp_get_wtick ! result; no kinds needed
+        end function omp_get_wtick
+
+        function omp_get_default_device() bind(c)
+          import ! no arguments; returns omp_integer_kind integer
+          integer (kind=omp_integer_kind) omp_get_default_device
+        end function omp_get_default_device
+
+        subroutine omp_set_default_device(device_num) bind(c)
+          import ! host kinds; device_num is passed by value
+          integer (kind=omp_integer_kind), value :: device_num
+        end subroutine omp_set_default_device
+
+        function omp_get_num_devices() bind(c)
+          import ! no arguments; returns omp_integer_kind integer
+          integer (kind=omp_integer_kind) omp_get_num_devices
+        end function omp_get_num_devices
+
+        function omp_get_num_teams() bind(c)
+          import ! no arguments; returns omp_integer_kind integer
+          integer (kind=omp_integer_kind) omp_get_num_teams
+        end function omp_get_num_teams
+
+        function omp_get_team_num() bind(c)
+          import ! no arguments; returns omp_integer_kind integer
+          integer (kind=omp_integer_kind) omp_get_team_num
+        end function omp_get_team_num
+
+        function omp_is_initial_device() bind(c)
+          import ! no arguments; returns omp_logical_kind logical
+          logical (kind=omp_logical_kind) omp_is_initial_device
+        end function omp_is_initial_device
+
+        function omp_get_initial_device() bind(c)
+          import ! no arguments; returns omp_integer_kind integer
+          integer (kind=omp_integer_kind) omp_get_initial_device
+        end function omp_get_initial_device
+
+        function omp_get_device_num() bind(c)
+          import ! no arguments; returns omp_integer_kind integer
+          integer (kind=omp_integer_kind) omp_get_device_num
+        end function omp_get_device_num
+
+        function omp_pause_resource(kind, device_num) bind(c)
+          import ! kind, device_num by value; integer result
+          integer (kind=omp_pause_resource_kind), value :: kind
+          integer (kind=omp_integer_kind), value :: device_num
+          integer (kind=omp_integer_kind) omp_pause_resource
+        end function omp_pause_resource
+
+        function omp_pause_resource_all(kind) bind(c)
+          import ! kind by value; returns omp_integer_kind integer
+          integer (kind=omp_pause_resource_kind), value :: kind
+          integer (kind=omp_integer_kind) omp_pause_resource_all
+        end function omp_pause_resource_all
+
+        function omp_get_supported_active_levels() bind(c)
+          import ! result decl below is packed to fit 72 columns
+          integer(kind=omp_integer_kind) omp_get_supported_active_levels
+        end function omp_get_supported_active_levels
+
+        subroutine omp_fulfill_event(event) bind(c)
+          import ! event handle (integer kind) is passed by value
+          integer (kind=omp_event_handle_kind), value :: event
+        end subroutine omp_fulfill_event
+
+        subroutine omp_init_lock(svar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_init_lock
+!DIR$ ENDIF
+          import ! svar has no value attribute: passed by reference
+          integer (kind=omp_lock_kind) svar
+        end subroutine omp_init_lock
+
+        subroutine omp_destroy_lock(svar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_destroy_lock
+!DIR$ ENDIF
+          import ! svar has no value attribute: passed by reference
+          integer (kind=omp_lock_kind) svar
+        end subroutine omp_destroy_lock
+
+        subroutine omp_set_lock(svar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_set_lock
+!DIR$ ENDIF
+          import ! svar has no value attribute: passed by reference
+          integer (kind=omp_lock_kind) svar
+        end subroutine omp_set_lock
+
+        subroutine omp_unset_lock(svar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_unset_lock
+!DIR$ ENDIF
+          import ! svar has no value attribute: passed by reference
+          integer (kind=omp_lock_kind) svar
+        end subroutine omp_unset_lock
+
+        function omp_test_lock(svar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_test_lock
+!DIR$ ENDIF
+          import ! svar by reference; returns omp_logical_kind logical
+          logical (kind=omp_logical_kind) omp_test_lock
+          integer (kind=omp_lock_kind) svar
+        end function omp_test_lock
+
+        subroutine omp_init_nest_lock(nvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_init_nest_lock
+!DIR$ ENDIF
+          import ! nvar has no value attribute: passed by reference
+          integer (kind=omp_nest_lock_kind) nvar
+        end subroutine omp_init_nest_lock
+
+        subroutine omp_destroy_nest_lock(nvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_destroy_nest_lock
+!DIR$ ENDIF
+          import ! nvar has no value attribute: passed by reference
+          integer (kind=omp_nest_lock_kind) nvar
+        end subroutine omp_destroy_nest_lock
+
+        subroutine omp_set_nest_lock(nvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_set_nest_lock
+!DIR$ ENDIF
+          import ! nvar has no value attribute: passed by reference
+          integer (kind=omp_nest_lock_kind) nvar
+        end subroutine omp_set_nest_lock
+
+        subroutine omp_unset_nest_lock(nvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_unset_nest_lock
+!DIR$ ENDIF
+          import ! nvar has no value attribute: passed by reference
+          integer (kind=omp_nest_lock_kind) nvar
+        end subroutine omp_unset_nest_lock
+
+        function omp_test_nest_lock(nvar) bind(c)
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!DIR$ attributes known_intrinsic :: omp_test_nest_lock
+!DIR$ ENDIF
+          import ! nvar by reference; returns omp_integer_kind integer
+          integer (kind=omp_integer_kind) omp_test_nest_lock
+          integer (kind=omp_nest_lock_kind) nvar
+        end function omp_test_nest_lock
+
+        function omp_get_max_task_priority() bind(c)
+          import ! no arguments; returns omp_integer_kind integer
+          integer (kind=omp_integer_kind) omp_get_max_task_priority
+        end function omp_get_max_task_priority
+
+        function omp_init_allocator(memspace, ntraits, traits)
+          import ! note: this interface has no bind(c) suffix
+          integer (omp_allocator_handle_kind) omp_init_allocator
+          integer (omp_memspace_handle_kind) :: memspace
+          integer (omp_integer_kind) :: ntraits
+          type(omp_alloctrait), intent(in) :: traits(*)
+        end function omp_init_allocator
+
+        subroutine omp_destroy_allocator(allocator) bind(c)
+          import ! allocator handle is passed by value
+          integer (omp_allocator_handle_kind), value :: allocator
+        end subroutine omp_destroy_allocator
+
+        subroutine omp_set_default_allocator(allocator) bind(c)
+          import ! allocator handle is passed by value
+          integer (omp_allocator_handle_kind), value :: allocator
+        end subroutine omp_set_default_allocator
+
+        function omp_get_default_allocator() bind(c)
+          import ! no arguments; returns an allocator-handle integer
+          integer (omp_allocator_handle_kind) omp_get_default_allocator
+        end function omp_get_default_allocator
+
+        subroutine omp_set_affinity_format(format)
+          character (len=*) :: format ! assumed-length string
+        end subroutine omp_set_affinity_format
+
+        function omp_get_affinity_format(buffer)
+          import ! buffer is assumed-length; kmp_size_t_kind result
+          character (len=*) :: buffer
+          integer (kind=kmp_size_t_kind) :: omp_get_affinity_format
+        end function omp_get_affinity_format
+
+        subroutine omp_display_affinity(format)
+          character (len=*) :: format ! assumed-length string
+        end subroutine omp_display_affinity
+
+        function omp_capture_affinity(buffer, format)
+          import ! assumed-length strings; kmp_size_t_kind result
+          character (len=*) :: format
+          character (len=*) :: buffer
+          integer (kind=kmp_size_t_kind) :: omp_capture_affinity
+        end function omp_capture_affinity
+
+!       ***
+!       *** kmp_* entry points
+!       ***
+
+        subroutine kmp_set_stacksize(size) bind(c)
+          import ! host kinds; size is passed by value
+          integer (kind=omp_integer_kind), value :: size
+        end subroutine kmp_set_stacksize
+
+        subroutine kmp_set_stacksize_s(size) bind(c)
+          import ! size (kmp_size_t_kind) is passed by value
+          integer (kind=kmp_size_t_kind), value :: size
+        end subroutine kmp_set_stacksize_s
+
+        subroutine kmp_set_blocktime(msec) bind(c)
+          import ! host kinds; msec is passed by value
+          integer (kind=omp_integer_kind), value :: msec
+        end subroutine kmp_set_blocktime
+
+        subroutine kmp_set_library_serial() bind(c)
+        end subroutine kmp_set_library_serial ! no arguments
+
+        subroutine kmp_set_library_turnaround() bind(c)
+        end subroutine kmp_set_library_turnaround ! no arguments
+
+        subroutine kmp_set_library_throughput() bind(c)
+        end subroutine kmp_set_library_throughput ! no arguments
+
+        subroutine kmp_set_library(libnum) bind(c)
+          import ! host kinds; libnum is passed by value
+          integer (kind=omp_integer_kind), value :: libnum
+        end subroutine kmp_set_library
+
+        subroutine kmp_set_defaults(string) bind(c)
+          character string(*) ! assumed-size character array
+        end subroutine kmp_set_defaults
+
+        function kmp_get_stacksize() bind(c)
+          import ! no arguments; returns omp_integer_kind integer
+          integer (kind=omp_integer_kind) kmp_get_stacksize
+        end function kmp_get_stacksize
+
+        function kmp_get_stacksize_s() bind(c)
+          import ! no arguments; returns kmp_size_t_kind integer
+          integer (kind=kmp_size_t_kind) kmp_get_stacksize_s
+        end function kmp_get_stacksize_s
+
+        function kmp_get_blocktime() bind(c)
+          import ! no arguments; returns omp_integer_kind integer
+          integer (kind=omp_integer_kind) kmp_get_blocktime
+        end function kmp_get_blocktime
+
+        function kmp_get_library() bind(c)
+          import ! no arguments; returns omp_integer_kind integer
+          integer (kind=omp_integer_kind) kmp_get_library
+        end function kmp_get_library
+
+        subroutine kmp_set_disp_num_buffers(num) bind(c)
+          import ! host kinds; num is passed by value
+          integer (kind=omp_integer_kind), value :: num
+        end subroutine kmp_set_disp_num_buffers
+
+        function kmp_set_affinity(mask) bind(c)
+          import ! mask by reference; omp_integer_kind result
+          integer (kind=omp_integer_kind) kmp_set_affinity
+          integer (kind=kmp_affinity_mask_kind) mask
+        end function kmp_set_affinity
+
+        function kmp_get_affinity(mask) bind(c)
+          import ! mask by reference; omp_integer_kind result
+          integer (kind=omp_integer_kind) kmp_get_affinity
+          integer (kind=kmp_affinity_mask_kind) mask
+        end function kmp_get_affinity
+
+        function kmp_get_affinity_max_proc() bind(c)
+          import ! no arguments; returns omp_integer_kind integer
+          integer (kind=omp_integer_kind) kmp_get_affinity_max_proc
+        end function kmp_get_affinity_max_proc
+
+        subroutine kmp_create_affinity_mask(mask) bind(c)
+          import ! mask has no value attribute: by reference
+          integer (kind=kmp_affinity_mask_kind) mask
+        end subroutine kmp_create_affinity_mask
+
+        subroutine kmp_destroy_affinity_mask(mask) bind(c)
+          import ! mask has no value attribute: by reference
+          integer (kind=kmp_affinity_mask_kind) mask
+        end subroutine kmp_destroy_affinity_mask
+
+        function kmp_set_affinity_mask_proc(proc, mask) bind(c)
+          import ! proc by value, mask by reference; integer result
+          integer (kind=omp_integer_kind) kmp_set_affinity_mask_proc
+          integer (kind=omp_integer_kind), value :: proc
+          integer (kind=kmp_affinity_mask_kind) mask
+        end function kmp_set_affinity_mask_proc
+
+        function kmp_unset_affinity_mask_proc(proc, mask) bind(c)
+          import ! proc by value, mask by reference; integer result
+          integer (kind=omp_integer_kind) kmp_unset_affinity_mask_proc
+          integer (kind=omp_integer_kind), value :: proc
+          integer (kind=kmp_affinity_mask_kind) mask
+        end function kmp_unset_affinity_mask_proc
+
+        function kmp_get_affinity_mask_proc(proc, mask) bind(c)
+          import ! proc by value, mask by reference; integer result
+          integer (kind=omp_integer_kind) kmp_get_affinity_mask_proc
+          integer (kind=omp_integer_kind), value :: proc
+          integer (kind=kmp_affinity_mask_kind) mask
+        end function kmp_get_affinity_mask_proc
+
+        function kmp_malloc(size) bind(c)
+          import ! size by value; returns kmp_pointer_kind integer
+          integer (kind=kmp_pointer_kind) kmp_malloc
+          integer (kind=kmp_size_t_kind), value :: size
+        end function kmp_malloc
+
+        function kmp_aligned_malloc(size, alignment) bind(c)
+          import ! size, alignment by value; pointer-kind result
+          integer (kind=kmp_pointer_kind) kmp_aligned_malloc
+          integer (kind=kmp_size_t_kind), value :: size
+          integer (kind=kmp_size_t_kind), value :: alignment
+        end function kmp_aligned_malloc
+
+        function kmp_calloc(nelem, elsize) bind(c)
+          import ! nelem, elsize by value; pointer-kind result
+          integer (kind=kmp_pointer_kind) kmp_calloc
+          integer (kind=kmp_size_t_kind), value :: nelem
+          integer (kind=kmp_size_t_kind), value :: elsize
+        end function kmp_calloc
+
+        function kmp_realloc(ptr, size) bind(c)
+          import ! ptr, size by value; pointer-kind result
+          integer (kind=kmp_pointer_kind) kmp_realloc
+          integer (kind=kmp_pointer_kind), value :: ptr
+          integer (kind=kmp_size_t_kind), value :: size
+        end function kmp_realloc
+
+        subroutine kmp_free(ptr) bind(c)
+          import ! ptr (kmp_pointer_kind integer) is passed by value
+          integer (kind=kmp_pointer_kind), value :: ptr
+        end subroutine kmp_free
+
+        subroutine kmp_set_warnings_on() bind(c)
+        end subroutine kmp_set_warnings_on ! no arguments
+
+        subroutine kmp_set_warnings_off() bind(c)
+        end subroutine kmp_set_warnings_off ! no arguments
+
+        subroutine omp_init_lock_with_hint(svar, hint) bind(c)
+          import ! svar by reference, hint by value
+          integer (kind=omp_lock_kind) svar
+          integer (kind=omp_lock_hint_kind), value :: hint
+        end subroutine omp_init_lock_with_hint
+
+        subroutine omp_init_nest_lock_with_hint(nvar, hint) bind(c)
+          import ! nvar by reference, hint by value
+          integer (kind=omp_nest_lock_kind) nvar
+          integer (kind=omp_lock_hint_kind), value :: hint
+        end subroutine omp_init_nest_lock_with_hint
+
+        function omp_control_tool(command, modifier, arg) bind(c)
+          import ! command, modifier by value; arg is optional
+          integer (kind=omp_integer_kind) omp_control_tool
+          integer (kind=omp_control_tool_kind), value :: command
+          integer (kind=omp_control_tool_kind), value :: modifier
+          integer (kind=kmp_pointer_kind), optional :: arg
+        end function omp_control_tool
+
+      end interface
+
+!DIR$ IF DEFINED (__INTEL_OFFLOAD)
+
+!DIR$ IF(__INTEL_COMPILER.LT.1900)
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_set_num_threads
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_set_dynamic
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_set_nested
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_num_threads
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_max_threads
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_thread_num
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_num_procs
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_in_parallel
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_in_final
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_dynamic
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_nested
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_thread_limit
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_set_max_active_levels
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_max_active_levels
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_level
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_active_level
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_ancestor_thread_num
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_team_size
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_set_schedule
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_schedule
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_proc_bind
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_wtime
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_wtick
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_default_device
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_set_default_device
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_is_initial_device
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_initial_device
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_num_devices
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_device_num
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_pause_resource
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_pause_resource_all
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_supported_active_levels
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_fulfill_event
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_num_teams
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_team_num
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_init_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_destroy_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_set_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_unset_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_test_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_init_nest_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_destroy_nest_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_set_nest_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_unset_nest_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_test_nest_lock
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_max_task_priority
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_set_affinity_format
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_get_affinity_format
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_display_affinity
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_capture_affinity
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_stacksize
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_stacksize_s
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_blocktime
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_library_serial
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_library_turnaround
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_library_throughput
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_library
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_defaults
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_get_stacksize
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_get_stacksize_s
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_get_blocktime
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_get_library
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_disp_num_buffers
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_affinity
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_get_affinity
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_get_affinity_max_proc
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_create_affinity_mask
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_destroy_affinity_mask
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_affinity_mask_proc
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_unset_affinity_mask_proc
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_get_affinity_mask_proc
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_malloc
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_aligned_malloc
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_calloc
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_realloc
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_free
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_warnings_on
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: kmp_set_warnings_off
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_init_lock_with_hint
+!DIR$ ATTRIBUTES OFFLOAD:MIC :: omp_init_nest_lock_with_hint
+!DIR$ ENDIF
+
+!DIR$ IF(__INTEL_COMPILER.GE.1400)
+!$omp declare target(omp_set_num_threads )
+!$omp declare target(omp_set_dynamic )
+!$omp declare target(omp_set_nested )
+!$omp declare target(omp_get_num_threads )
+!$omp declare target(omp_get_max_threads )
+!$omp declare target(omp_get_thread_num )
+!$omp declare target(omp_get_num_procs )
+!$omp declare target(omp_in_parallel )
+!$omp declare target(omp_in_final )
+!$omp declare target(omp_get_dynamic )
+!$omp declare target(omp_get_nested )
+!$omp declare target(omp_get_thread_limit )
+!$omp declare target(omp_set_max_active_levels )
+!$omp declare target(omp_get_max_active_levels )
+!$omp declare target(omp_get_level )
+!$omp declare target(omp_get_active_level )
+!$omp declare target(omp_get_ancestor_thread_num )
+!$omp declare target(omp_get_team_size )
+!$omp declare target(omp_set_schedule )
+!$omp declare target(omp_get_schedule )
+!$omp declare target(omp_get_proc_bind )
+!$omp declare target(omp_get_wtime )
+!$omp declare target(omp_get_wtick )
+!$omp declare target(omp_get_default_device )
+!$omp declare target(omp_set_default_device )
+!$omp declare target(omp_is_initial_device )
+!$omp declare target(omp_get_initial_device )
+!$omp declare target(omp_get_num_devices )
+!$omp declare target(omp_get_device_num )
+!$omp declare target(omp_pause_resource )
+!$omp declare target(omp_pause_resource_all )
+!$omp declare target(omp_get_supported_active_levels )
+!$omp declare target(omp_fulfill_event)
+!$omp declare target(omp_get_num_teams )
+!$omp declare target(omp_get_team_num )
+!$omp declare target(omp_init_lock )
+!$omp declare target(omp_destroy_lock )
+!$omp declare target(omp_set_lock )
+!$omp declare target(omp_unset_lock )
+!$omp declare target(omp_test_lock )
+!$omp declare target(omp_init_nest_lock )
+!$omp declare target(omp_destroy_nest_lock )
+!$omp declare target(omp_set_nest_lock )
+!$omp declare target(omp_unset_nest_lock )
+!$omp declare target(omp_test_nest_lock )
+!$omp declare target(omp_get_max_task_priority )
+!$omp declare target(omp_set_affinity_format )
+!$omp declare target(omp_get_affinity_format )
+!$omp declare target(omp_display_affinity )
+!$omp declare target(omp_capture_affinity )
+!$omp declare target(kmp_set_stacksize )
+!$omp declare target(kmp_set_stacksize_s )
+!$omp declare target(kmp_set_blocktime )
+!$omp declare target(kmp_set_library_serial )
+!$omp declare target(kmp_set_library_turnaround )
+!$omp declare target(kmp_set_library_throughput )
+!$omp declare target(kmp_set_library )
+!$omp declare target(kmp_set_defaults )
+!$omp declare target(kmp_get_stacksize )
+!$omp declare target(kmp_get_stacksize_s )
+!$omp declare target(kmp_get_blocktime )
+!$omp declare target(kmp_get_library )
+!$omp declare target(kmp_set_disp_num_buffers )
+!$omp declare target(kmp_set_affinity )
+!$omp declare target(kmp_get_affinity )
+!$omp declare target(kmp_get_affinity_max_proc )
+!$omp declare target(kmp_create_affinity_mask )
+!$omp declare target(kmp_destroy_affinity_mask )
+!$omp declare target(kmp_set_affinity_mask_proc )
+!$omp declare target(kmp_unset_affinity_mask_proc )
+!$omp declare target(kmp_get_affinity_mask_proc )
+!$omp declare target(kmp_malloc )
+!$omp declare target(kmp_aligned_malloc )
+!$omp declare target(kmp_calloc )
+!$omp declare target(kmp_realloc )
+!$omp declare target(kmp_free )
+!$omp declare target(kmp_set_warnings_on )
+!$omp declare target(kmp_set_warnings_off )
+!$omp declare target(omp_init_lock_with_hint )
+!$omp declare target(omp_init_nest_lock_with_hint )
+!DIR$ ENDIF
+!DIR$ ENDIF
diff --git a/final/runtime/src/kmp.h b/final/runtime/src/kmp.h
new file mode 100644
index 0000000..fdb9dbb
--- /dev/null
+++ b/final/runtime/src/kmp.h
@@ -0,0 +1,3917 @@
+/*! \file */
+/*
+ * kmp.h -- KPTS runtime header file.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_H
+#define KMP_H
+
+#include "kmp_config.h"
+
+/* #define BUILD_PARALLEL_ORDERED 1 */
+
+/* This fix replaces gettimeofday with clock_gettime for better scalability on
+   the Altix.  Requires user code to be linked with -lrt. */
+//#define FIX_SGI_CLOCK
+
+/* Defines for OpenMP 3.0 tasking and auto scheduling */
+
+#ifndef KMP_STATIC_STEAL_ENABLED
+#define KMP_STATIC_STEAL_ENABLED 1
+#endif
+
+#define TASK_CURRENT_NOT_QUEUED 0
+#define TASK_CURRENT_QUEUED 1
+
+#ifdef BUILD_TIED_TASK_STACK
+#define TASK_STACK_EMPTY 0 // entries when the stack is empty
+#define TASK_STACK_BLOCK_BITS 5 // log2 of TASK_STACK_BLOCK_SIZE
+// Number of entries in each task stack array
+#define TASK_STACK_BLOCK_SIZE (1 << TASK_STACK_BLOCK_BITS)
+// Mask for determining index into stack block
+#define TASK_STACK_INDEX_MASK (TASK_STACK_BLOCK_SIZE - 1)
+#endif // BUILD_TIED_TASK_STACK
+
+#define TASK_NOT_PUSHED 1 // note: non-zero means "not pushed"
+#define TASK_SUCCESSFULLY_PUSHED 0
+#define TASK_TIED 1 // paired with TASK_UNTIED (0)
+#define TASK_UNTIED 0
+#define TASK_EXPLICIT 1 // paired with TASK_IMPLICIT (0)
+#define TASK_IMPLICIT 0
+#define TASK_PROXY 1 // paired with TASK_FULL (0)
+#define TASK_FULL 0
+#define TASK_DETACHABLE 1 // paired with TASK_UNDETACHABLE (0)
+#define TASK_UNDETACHABLE 0
+
+#define KMP_CANCEL_THREADS
+#define KMP_THREAD_ATTR
+
+// Android does not have pthread_cancel.  Undefine KMP_CANCEL_THREADS if being
+// built on Android
+#if defined(__ANDROID__)
+#undef KMP_CANCEL_THREADS
+#endif
+
+#include <signal.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+/* Do not include <ctype.h>: it causes problems with /MD on Windows* OS NT due
+   to a bad Microsoft library. Some macros below replace these functions.  */
+#ifndef __ABSOFT_WIN
+#include <sys/types.h>
+#endif
+#include <limits.h>
+#include <time.h>
+
+#include <errno.h>
+
+#include "kmp_os.h"
+
+#include "kmp_safe_c_api.h"
+
+#if KMP_STATS_ENABLED
+class kmp_stats_list;
+#endif
+
+#if KMP_USE_HIER_SCHED
+// Only include hierarchical scheduling if affinity is supported
+#undef KMP_USE_HIER_SCHED
+#define KMP_USE_HIER_SCHED KMP_AFFINITY_SUPPORTED
+#endif
+
+#if KMP_USE_HWLOC && KMP_AFFINITY_SUPPORTED
+#include "hwloc.h"
+#ifndef HWLOC_OBJ_NUMANODE
+#define HWLOC_OBJ_NUMANODE HWLOC_OBJ_NODE
+#endif
+#ifndef HWLOC_OBJ_PACKAGE
+#define HWLOC_OBJ_PACKAGE HWLOC_OBJ_SOCKET
+#endif
+#if HWLOC_API_VERSION >= 0x00020000
+// hwloc 2.0 changed type of depth of object from unsigned to int
+typedef int kmp_hwloc_depth_t;
+#else
+typedef unsigned int kmp_hwloc_depth_t;
+#endif
+#endif
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+#include <xmmintrin.h>
+#endif
+
+#include "kmp_debug.h"
+#include "kmp_lock.h"
+#include "kmp_version.h"
+#if USE_DEBUGGER
+#include "kmp_debugger.h"
+#endif
+#include "kmp_i18n.h"
+
+#define KMP_HANDLE_SIGNALS (KMP_OS_UNIX || KMP_OS_WINDOWS)
+
+#include "kmp_wrapper_malloc.h"
+#if KMP_OS_UNIX
+#include <unistd.h>
+#if !defined NSIG && defined _NSIG
+#define NSIG _NSIG
+#endif
+#endif
+
+#if KMP_OS_LINUX
+#pragma weak clock_gettime
+#endif
+
+#if OMPT_SUPPORT
+#include "ompt-internal.h"
+#endif
+
+// Affinity format function
+#include "kmp_str.h"
+
+// 0 - no fast memory allocation, alignment: 8-byte on x86, 16-byte on x64.
+// 3 - fast allocation using sync, non-sync free lists of any size, non-self
+// free lists of limited size.
+#ifndef USE_FAST_MEMORY
+#define USE_FAST_MEMORY 3
+#endif
+
+#ifndef KMP_NESTED_HOT_TEAMS
+#define KMP_NESTED_HOT_TEAMS 0 // default when not defined before this point
+#define USE_NESTED_HOT_ARG(x) // expands to nothing
+#else
+#if KMP_NESTED_HOT_TEAMS
+#define USE_NESTED_HOT_ARG(x) , x // expands to a comma followed by x
+#else
+#define USE_NESTED_HOT_ARG(x) // expands to nothing
+#endif
+#endif
+
+// Assume using BGET compare_exchange instruction instead of lock by default.
+#ifndef USE_CMP_XCHG_FOR_BGET
+#define USE_CMP_XCHG_FOR_BGET 1
+#endif
+
+// Test to see if queuing lock is better than bootstrap lock for bget
+// #ifndef USE_QUEUING_LOCK_FOR_BGET
+// #define USE_QUEUING_LOCK_FOR_BGET
+// #endif
+
+#define KMP_NSEC_PER_SEC 1000000000L
+#define KMP_USEC_PER_SEC 1000000L
+
+/*!
+@ingroup BASIC_TYPES
+@{
+*/
+
+/*!
+Values for bit flags used in the ident_t to describe the fields.
+*/
+enum {
+  /*! Use trampoline for internal microtasks */
+  KMP_IDENT_IMB = 0x01,
+  /*! Use c-style ident structure */
+  KMP_IDENT_KMPC = 0x02,
+  /* 0x04 is no longer used */
+  /*! Entry point generated by auto-parallelization */
+  KMP_IDENT_AUTOPAR = 0x08,
+  /*! Compiler generates atomic reduction option for kmpc_reduce* */
+  KMP_IDENT_ATOMIC_REDUCE = 0x10,
+  /*! To mark a 'barrier' directive in user code */
+  KMP_IDENT_BARRIER_EXPL = 0x20,
+  /*! To mark implicit barriers. */
+  KMP_IDENT_BARRIER_IMPL = 0x0040,
+  KMP_IDENT_BARRIER_IMPL_MASK = 0x01C0, // bits 6-8 of the flags word
+  KMP_IDENT_BARRIER_IMPL_FOR = 0x0040, // same value as KMP_IDENT_BARRIER_IMPL
+  KMP_IDENT_BARRIER_IMPL_SECTIONS = 0x00C0,
+
+  KMP_IDENT_BARRIER_IMPL_SINGLE = 0x0140,
+  KMP_IDENT_BARRIER_IMPL_WORKSHARE = 0x01C0, // same value as the IMPL mask
+
+  /*! To mark a static loop in OMPT callbacks */
+  KMP_IDENT_WORK_LOOP = 0x200,
+  /*! To mark a sections directive in OMPT callbacks */
+  KMP_IDENT_WORK_SECTIONS = 0x400,
+  /*! To mark a distribute construct in OMPT callbacks */
+  KMP_IDENT_WORK_DISTRIBUTE = 0x800,
+  /*! Atomic hint; bottom four bits as omp_sync_hint_t. Top four reserved and
+      not currently used. If one day we need more bits, then we can use
+      an invalid combination of hints to mean that another, larger field
+      should be used in a different flag. */
+  KMP_IDENT_ATOMIC_HINT_MASK = 0xFF0000, // bits 16-23 of the flags word
+  KMP_IDENT_ATOMIC_HINT_UNCONTENDED = 0x010000,
+  KMP_IDENT_ATOMIC_HINT_CONTENDED = 0x020000,
+  KMP_IDENT_ATOMIC_HINT_NONSPECULATIVE = 0x040000,
+  KMP_IDENT_ATOMIC_HINT_SPECULATIVE = 0x080000,
+};
+
+/*!
+ * The ident structure that describes a source location.
+ */
+typedef struct ident {
+  kmp_int32 reserved_1; /**<  might be used in Fortran; see above  */
+  kmp_int32 flags; /**<  also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC
+                      identifies this union member  */
+  kmp_int32 reserved_2; /**<  not really used in Fortran any more; see above */
+#if USE_ITT_BUILD
+/*  reserved_2 is, however, currently used for storing region-specific ITT */
+/*  contextual information. */
+#endif /* USE_ITT_BUILD */
+  kmp_int32 reserved_3; /**< source[4] in Fortran, do not use for C++  */
+  char const *psource; /**< String describing the source location.
+                       The string is composed of semicolon-separated fields
+                       which describe the source file, the function and a pair
+                       of line numbers that delimit the construct. */
+} ident_t;
+/*!
+@}
+*/
+
+// Forward declarations; the types are completed elsewhere.
+typedef union kmp_team kmp_team_t;
+typedef struct kmp_taskdata kmp_taskdata_t;
+typedef union kmp_task_team kmp_task_team_t;
+typedef union kmp_team kmp_team_p; // second alias of union kmp_team
+typedef union kmp_info kmp_info_p;
+typedef union kmp_root kmp_root_p;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* ------------------------------------------------------------------------ */
+
+/* Pack two 32-bit signed integers into a 64-bit signed integer. LOW_32 is
+   zero-extended so it cannot clobber HIGH_32. ToDo: big-endian word order. */
+#define KMP_PACK_64(HIGH_32, LOW_32)                                           \
+  ((kmp_int64)((((kmp_uint64)(HIGH_32)) << 32) | (kmp_uint32)(LOW_32)))
+
+// String-scanning macros; each advances _x, which must be a char * lvalue.
+#define SKIP_WS(_x)                                                            \
+  {                                                                            \
+    while (*(_x) == ' ' || *(_x) == '\t')                                      \
+      (_x)++;                                                                  \
+  }
+#define SKIP_DIGITS(_x)                                                        \
+  {                                                                            \
+    while (*(_x) >= '0' && *(_x) <= '9')                                       \
+      (_x)++;                                                                  \
+  }
+#define SKIP_TOKEN(_x)                                                         \
+  {                                                                            \
+    while ((*(_x) >= '0' && *(_x) <= '9') || (*(_x) >= 'a' && *(_x) <= 'z') || \
+           (*(_x) >= 'A' && *(_x) <= 'Z') || *(_x) == '_')                     \
+      (_x)++;                                                                  \
+  }
+#define SKIP_TO(_x, _c)                                                        \
+  {                                                                            \
+    while (*(_x) != '\0' && *(_x) != (_c))                                     \
+      (_x)++;                                                                  \
+  }
+
+/* ------------------------------------------------------------------------ */
+
+#define KMP_MAX(x, y) ((x) > (y) ? (x) : (y)) // evaluates one argument twice
+#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y)) // evaluates one argument twice
+
+/* ------------------------------------------------------------------------ */
+/* Enumeration types */
+
+enum kmp_state_timer {
+  ts_stop,
+  ts_start,
+  ts_pause,
+
+  ts_last_state
+};
+
+enum dynamic_mode {
+  dynamic_default,
+#ifdef USE_LOAD_BALANCE
+  dynamic_load_balance,
+#endif /* USE_LOAD_BALANCE */
+  dynamic_random,
+  dynamic_thread_limit,
+  dynamic_max
+};
+
+/* external schedule constants, duplicate enum omp_sched in omp.h in order to
+ * not include it here */
+#ifndef KMP_SCHED_TYPE_DEFINED
+#define KMP_SCHED_TYPE_DEFINED
+typedef enum kmp_sched {
+  kmp_sched_lower = 0, // lower and upper bounds are for routine parameter check
+  // Note: need to adjust __kmp_sch_map global array in case enum is changed
+  kmp_sched_static = 1, // mapped to kmp_sch_static_chunked           (33)
+  kmp_sched_dynamic = 2, // mapped to kmp_sch_dynamic_chunked          (35)
+  kmp_sched_guided = 3, // mapped to kmp_sch_guided_chunked           (36)
+  kmp_sched_auto = 4, // mapped to kmp_sch_auto                     (38)
+  kmp_sched_upper_std = 5, // upper bound for standard schedules
+  kmp_sched_lower_ext = 100, // lower bound of Intel extension schedules
+  kmp_sched_trapezoidal = 101, // mapped to kmp_sch_trapezoidal (39)
+#if KMP_STATIC_STEAL_ENABLED
+  kmp_sched_static_steal = 102, // mapped to kmp_sch_static_steal (44)
+#endif
+  kmp_sched_upper, // one past the last extension schedule listed above
+  kmp_sched_default = kmp_sched_static, // default scheduling
+  kmp_sched_monotonic = 0x80000000 // modifier bit OR-ed onto a schedule kind
+} kmp_sched_t;
+#endif
+
+/*!
+ @ingroup WORK_SHARING
+ * Describes the loop schedule to be used for a parallel for loop.
+ */
+enum sched_type : kmp_int32 {
+  kmp_sch_lower = 32, /**< lower bound for unordered values */
+  kmp_sch_static_chunked = 33,
+  kmp_sch_static = 34, /**< static unspecialized */
+  kmp_sch_dynamic_chunked = 35,
+  kmp_sch_guided_chunked = 36, /**< guided unspecialized */
+  kmp_sch_runtime = 37,
+  kmp_sch_auto = 38, /**< auto */
+  kmp_sch_trapezoidal = 39,
+
+  /* accessible only through KMP_SCHEDULE environment variable */
+  kmp_sch_static_greedy = 40,
+  kmp_sch_static_balanced = 41,
+  /* accessible only through KMP_SCHEDULE environment variable */
+  kmp_sch_guided_iterative_chunked = 42,
+  kmp_sch_guided_analytical_chunked = 43,
+  /* accessible only through KMP_SCHEDULE environment variable */
+  kmp_sch_static_steal = 44,
+
+  /* static with chunk adjustment (e.g., simd) */
+  kmp_sch_static_balanced_chunked = 45,
+  kmp_sch_guided_simd = 46, /**< guided with chunk adjustment */
+  kmp_sch_runtime_simd = 47, /**< runtime with chunk adjustment */
+
+  /* accessible only through KMP_SCHEDULE environment variable */
+  kmp_sch_upper, /**< upper bound for unordered values */
+
+  kmp_ord_lower = 64, /**< lower bound for ordered values, must be power of 2 */
+  kmp_ord_static_chunked = 65,
+  kmp_ord_static = 66, /**< ordered static unspecialized */
+  kmp_ord_dynamic_chunked = 67,
+  kmp_ord_guided_chunked = 68,
+  kmp_ord_runtime = 69,
+  kmp_ord_auto = 70, /**< ordered auto */
+  kmp_ord_trapezoidal = 71,
+  kmp_ord_upper, /**< upper bound for ordered values */
+
+  /* Schedules for Distribute construct */
+  kmp_distribute_static_chunked = 91, /**< distribute static chunked */
+  kmp_distribute_static = 92, /**< distribute static unspecialized */
+
+  /* For the "nomerge" versions, kmp_dispatch_next*() will always return a
+     single iteration/chunk, even if the loop is serialized. For the schedule
+     types listed above, the entire iteration vector is returned if the loop is
+     serialized. This doesn't work for gcc/gcomp sections. */
+  kmp_nm_lower = 160, /**< lower bound for nomerge values */
+
+  kmp_nm_static_chunked =
+      (kmp_sch_static_chunked - kmp_sch_lower + kmp_nm_lower),
+  kmp_nm_static = 162, /**< static unspecialized */
+  kmp_nm_dynamic_chunked = 163,
+  kmp_nm_guided_chunked = 164, /**< guided unspecialized */
+  kmp_nm_runtime = 165,
+  kmp_nm_auto = 166, /**< auto */
+  kmp_nm_trapezoidal = 167,
+
+  /* accessible only through KMP_SCHEDULE environment variable */
+  kmp_nm_static_greedy = 168,
+  kmp_nm_static_balanced = 169,
+  /* accessible only through KMP_SCHEDULE environment variable */
+  kmp_nm_guided_iterative_chunked = 170,
+  kmp_nm_guided_analytical_chunked = 171,
+  kmp_nm_static_steal =
+      172, /* accessible only through OMP_SCHEDULE environment variable */
+
+  kmp_nm_ord_static_chunked = 193,
+  kmp_nm_ord_static = 194, /**< ordered static unspecialized */
+  kmp_nm_ord_dynamic_chunked = 195,
+  kmp_nm_ord_guided_chunked = 196,
+  kmp_nm_ord_runtime = 197,
+  kmp_nm_ord_auto = 198, /**< auto */
+  kmp_nm_ord_trapezoidal = 199,
+  kmp_nm_upper, /**< upper bound for nomerge values */
+
+  /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers. Since
+     we need to distinguish the three possible cases (no modifier, monotonic
+     modifier, nonmonotonic modifier), we need separate bits for each modifier.
+     The absence of monotonic does not imply nonmonotonic, especially since 4.5
+     says that the behaviour of the "no modifier" case is implementation defined
+     in 4.5, but will become "nonmonotonic" in 5.0.
+
+     Since we're passing a full 32 bit value, we can use a couple of high bits
+     for these flags; out of paranoia we avoid the sign bit.
+
+     These modifiers can be or-ed into non-static schedules by the compiler to
+     pass the additional information. They will be stripped early in the
+     processing in __kmp_dispatch_init when setting up schedules, so most of the
+     code won't ever see schedules with these bits set.  */
+  kmp_sch_modifier_monotonic =
+      (1 << 29), /**< Set if the monotonic schedule modifier was present */
+  kmp_sch_modifier_nonmonotonic =
+      (1 << 30), /**< Set if the nonmonotonic schedule modifier was present */
+
+#define SCHEDULE_WITHOUT_MODIFIERS(s) /* s with both modifier bits cleared */  \
+  ((enum sched_type)(                                                          \
+      (s) & ~(kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic)))
+#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sch_modifier_monotonic) != 0)
+#define SCHEDULE_HAS_NONMONOTONIC(s) (((s)&kmp_sch_modifier_nonmonotonic) != 0)
+#define SCHEDULE_HAS_NO_MODIFIERS(s) /* true if neither modifier bit is set */ \
+  (((s) & (kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic)) == 0)
+#define SCHEDULE_GET_MODIFIERS(s) /* only the modifier bits of s */            \
+  ((enum sched_type)(                                                          \
+      (s) & (kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic)))
+#define SCHEDULE_SET_MODIFIERS(s, m) /* ORs modifier bits m into lvalue s */  \
+  ((s) = (enum sched_type)((kmp_int32)(s) | (kmp_int32)(m)))
+#define SCHEDULE_NONMONOTONIC 0
+#define SCHEDULE_MONOTONIC 1
+
+  kmp_sch_default = kmp_sch_static /**< default scheduling algorithm */
+};
+
+// Propagate the monotonic modifier of an internal kind onto a standard kind
+static inline void
+__kmp_sched_apply_mods_stdkind(kmp_sched_t *kind,
+                               enum sched_type internal_kind) {
+  if (!SCHEDULE_HAS_MONOTONIC(internal_kind))
+    return; // nothing to propagate, *kind is left untouched
+  *kind = (kmp_sched_t)((int)kmp_sched_monotonic | (int)*kind);
+}
+
+// Carry the monotonic modifier of a standard kind over to the internal kind
+static inline void
+__kmp_sched_apply_mods_intkind(kmp_sched_t kind,
+                               enum sched_type *internal_kind) {
+  if (((int)kind & (int)kmp_sched_monotonic) == 0)
+    return; // standard kind carries no monotonic modifier
+  const int merged = (int)*internal_kind | (int)kmp_sch_modifier_monotonic;
+  *internal_kind = (enum sched_type)merged;
+}
+
+// Return the standard schedule kind with the monotonic modifier bit cleared
+static inline kmp_sched_t __kmp_sched_without_mods(kmp_sched_t kind) {
+  return (kmp_sched_t)(~(int)kmp_sched_monotonic & (int)kind);
+}
+
+/* Type to keep runtime schedule set via OMP_SCHEDULE or omp_set_schedule() */
+typedef union kmp_r_sched {
+  struct {
+    enum sched_type r_sched_type; // internal schedule kind (kmp_int32-sized)
+    int chunk; // NOTE(review): presumably the schedule chunk size - confirm
+  };
+  kmp_int64 sched; // overlays r_sched_type and chunk as one 64-bit value
+} kmp_r_sched_t;
+
+extern enum sched_type __kmp_sch_map[]; // map OMP 3.0 schedule types with our
+// internal schedule types
+
+enum library_type {
+  library_none,
+  library_serial,
+  library_turnaround,
+  library_throughput
+};
+
+#if KMP_OS_LINUX
+enum clock_function_type {
+  clock_function_gettimeofday,
+  clock_function_clock_gettime
+};
+#endif /* KMP_OS_LINUX */
+
+#if KMP_MIC_SUPPORTED
+enum mic_type { non_mic, mic1, mic2, mic3, dummy };
+#endif
+
+/* -- fast reduction stuff ------------------------------------------------ */
+
+#undef KMP_FAST_REDUCTION_BARRIER
+#define KMP_FAST_REDUCTION_BARRIER 1
+
+#undef KMP_FAST_REDUCTION_CORE_DUO
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+#define KMP_FAST_REDUCTION_CORE_DUO 1
+#endif
+
+enum _reduction_method { // values sit in byte 1 (bits 8-15); byte 0 stays zero
+  reduction_method_not_defined = 0,
+  critical_reduce_block = (1 << 8),
+  atomic_reduce_block = (2 << 8),
+  tree_reduce_block = (3 << 8),
+  empty_reduce_block = (4 << 8)
+};
+
+// Description of the packed_reduction_method variable:
+// The packed_reduction_method variable consists of two enum types variables
+// that are packed together into 0-th byte and 1-st byte:
+// 0: (packed_reduction_method & 0x000000FF) is a 'enum barrier_type' value of
+// barrier that will be used in fast reduction: bs_plain_barrier or
+// bs_reduction_barrier
+// 1: (packed_reduction_method & 0x0000FF00) is a reduction method that will
+// be used in fast reduction;
+// Reduction method is of 'enum _reduction_method' type and it's defined the way
+// so that the bits of 0-th byte are empty, so no need to execute a shift
+// instruction while packing/unpacking
+
+#if KMP_FAST_REDUCTION_BARRIER
+#define PACK_REDUCTION_METHOD_AND_BARRIER(reduction_method, barrier_type)      \
+  ((reduction_method) | (barrier_type))
+
+#define UNPACK_REDUCTION_METHOD(packed_reduction_method)                       \
+  ((enum _reduction_method)((packed_reduction_method) & (0x0000FF00)))
+
+#define UNPACK_REDUCTION_BARRIER(packed_reduction_method)                      \
+  ((enum barrier_type)((packed_reduction_method) & (0x000000FF)))
+#else
+#define PACK_REDUCTION_METHOD_AND_BARRIER(reduction_method, barrier_type)      \
+  (reduction_method)
+
+#define UNPACK_REDUCTION_METHOD(packed_reduction_method)                       \
+  (packed_reduction_method)
+
+#define UNPACK_REDUCTION_BARRIER(packed_reduction_method) (bs_plain_barrier)
+#endif
+
+#define TEST_REDUCTION_METHOD(packed_reduction_method, which_reduction_block)  \
+  ((UNPACK_REDUCTION_METHOD(packed_reduction_method)) ==                       \
+   (which_reduction_block))
+
+#if KMP_FAST_REDUCTION_BARRIER
+#define TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER                               \
+  (PACK_REDUCTION_METHOD_AND_BARRIER(tree_reduce_block, bs_reduction_barrier))
+
+#define TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER                                   \
+  (PACK_REDUCTION_METHOD_AND_BARRIER(tree_reduce_block, bs_plain_barrier))
+#endif
+
+typedef int PACKED_REDUCTION_METHOD_T;
+
+/* -- end of fast reduction stuff ----------------------------------------- */
+
+#if KMP_OS_WINDOWS
+#define USE_CBLKDATA
+#if KMP_MSVC_COMPAT
+#pragma warning(push)
+#pragma warning(disable : 271 310)
+#endif
+#include <windows.h>
+#if KMP_MSVC_COMPAT
+#pragma warning(pop)
+#endif
+#endif
+
+#if KMP_OS_UNIX
+#include <dlfcn.h>
+#include <pthread.h>
+#endif
+
+/* Only Linux* OS and Windows* OS support thread affinity. */
+#if KMP_AFFINITY_SUPPORTED
+
+// GROUP_AFFINITY is already defined for _MSC_VER>=1600 (VS2010 and later).
+#if KMP_OS_WINDOWS
+#if _MSC_VER < 1600 && KMP_MSVC_COMPAT
+typedef struct GROUP_AFFINITY {
+  KAFFINITY Mask;
+  WORD Group;
+  WORD Reserved[3];
+} GROUP_AFFINITY;
+#endif /* _MSC_VER < 1600 */
+#if KMP_GROUP_AFFINITY
+extern int __kmp_num_proc_groups;
+#else
+static const int __kmp_num_proc_groups = 1;
+#endif /* KMP_GROUP_AFFINITY */
+typedef DWORD (*kmp_GetActiveProcessorCount_t)(WORD);
+extern kmp_GetActiveProcessorCount_t __kmp_GetActiveProcessorCount;
+
+typedef WORD (*kmp_GetActiveProcessorGroupCount_t)(void);
+extern kmp_GetActiveProcessorGroupCount_t __kmp_GetActiveProcessorGroupCount;
+
+typedef BOOL (*kmp_GetThreadGroupAffinity_t)(HANDLE, GROUP_AFFINITY *);
+extern kmp_GetThreadGroupAffinity_t __kmp_GetThreadGroupAffinity;
+
+typedef BOOL (*kmp_SetThreadGroupAffinity_t)(HANDLE, const GROUP_AFFINITY *,
+                                             GROUP_AFFINITY *);
+extern kmp_SetThreadGroupAffinity_t __kmp_SetThreadGroupAffinity;
+#endif /* KMP_OS_WINDOWS */
+
+#if KMP_USE_HWLOC
+extern hwloc_topology_t __kmp_hwloc_topology;
+extern int __kmp_hwloc_error;
+extern int __kmp_numa_detected;
+extern int __kmp_tile_depth;
+#endif
+
+extern size_t __kmp_affin_mask_size;
+#define KMP_AFFINITY_CAPABLE() (__kmp_affin_mask_size > 0)
+#define KMP_AFFINITY_DISABLE() (__kmp_affin_mask_size = 0)
+#define KMP_AFFINITY_ENABLE(mask_size) (__kmp_affin_mask_size = mask_size)
+#define KMP_CPU_SET_ITERATE(i, mask)                                           \
+  for (i = (mask)->begin(); (int)i != (mask)->end(); i = (mask)->next(i))
+#define KMP_CPU_SET(i, mask) (mask)->set(i)
+#define KMP_CPU_ISSET(i, mask) (mask)->is_set(i)
+#define KMP_CPU_CLR(i, mask) (mask)->clear(i)
+#define KMP_CPU_ZERO(mask) (mask)->zero()
+#define KMP_CPU_COPY(dest, src) (dest)->copy(src)
+#define KMP_CPU_AND(dest, src) (dest)->bitwise_and(src)
+#define KMP_CPU_COMPLEMENT(max_bit_number, mask) (mask)->bitwise_not()
+#define KMP_CPU_UNION(dest, src) (dest)->bitwise_or(src)
+#define KMP_CPU_ALLOC(ptr) (ptr = __kmp_affinity_dispatch->allocate_mask())
+#define KMP_CPU_FREE(ptr) __kmp_affinity_dispatch->deallocate_mask(ptr)
+#define KMP_CPU_ALLOC_ON_STACK(ptr) KMP_CPU_ALLOC(ptr)
+#define KMP_CPU_FREE_FROM_STACK(ptr) KMP_CPU_FREE(ptr)
+#define KMP_CPU_INTERNAL_ALLOC(ptr) KMP_CPU_ALLOC(ptr)
+#define KMP_CPU_INTERNAL_FREE(ptr) KMP_CPU_FREE(ptr)
+#define KMP_CPU_INDEX(arr, i) __kmp_affinity_dispatch->index_mask_array(arr, i)
+#define KMP_CPU_ALLOC_ARRAY(arr, n)                                            \
+  (arr = __kmp_affinity_dispatch->allocate_mask_array(n))
+#define KMP_CPU_FREE_ARRAY(arr, n)                                             \
+  __kmp_affinity_dispatch->deallocate_mask_array(arr)
+#define KMP_CPU_INTERNAL_ALLOC_ARRAY(arr, n) KMP_CPU_ALLOC_ARRAY(arr, n)
+#define KMP_CPU_INTERNAL_FREE_ARRAY(arr, n) KMP_CPU_FREE_ARRAY(arr, n)
+#define __kmp_get_system_affinity(mask, abort_bool)                            \
+  (mask)->get_system_affinity(abort_bool)
+#define __kmp_set_system_affinity(mask, abort_bool)                            \
+  (mask)->set_system_affinity(abort_bool)
+#define __kmp_get_proc_group(mask) (mask)->get_proc_group()
+
+class KMPAffinity { // affinity API base class; virtual defaults are stubs
+public:
+  class Mask { // affinity mask interface; default bodies do nothing
+  public:
+    void *operator new(size_t n);
+    void operator delete(void *p);
+    void *operator new[](size_t n);
+    void operator delete[](void *p);
+    virtual ~Mask() {}
+    // Set bit i to 1
+    virtual void set(int i) {}
+    // Return bit i
+    virtual bool is_set(int i) const { return false; }
+    // Set bit i to 0
+    virtual void clear(int i) {}
+    // Zero out entire mask
+    virtual void zero() {}
+    // Copy src into this mask
+    virtual void copy(const Mask *src) {}
+    // this &= rhs
+    virtual void bitwise_and(const Mask *rhs) {}
+    // this |= rhs
+    virtual void bitwise_or(const Mask *rhs) {}
+    // this = ~this
+    virtual void bitwise_not() {}
+    // API for iterating over an affinity mask
+    // for (int i = mask->begin(); i != mask->end(); i = mask->next(i))
+    virtual int begin() const { return 0; }
+    virtual int end() const { return 0; }
+    virtual int next(int previous) const { return 0; }
+    // Set the system's affinity to this affinity mask's value
+    virtual int set_system_affinity(bool abort_on_error) const { return -1; }
+    // Set this affinity mask to the current system affinity
+    virtual int get_system_affinity(bool abort_on_error) { return -1; }
+    // Only 1 DWORD in the mask should have any procs set.
+    // Return the appropriate index, or -1 for an invalid mask.
+    virtual int get_proc_group() const { return -1; }
+  };
+  void *operator new(size_t n);
+  void operator delete(void *p);
+  // Virtual destructor: objects are handled through KMPAffinity pointers
+  virtual ~KMPAffinity() = default;
+  // Determine if affinity is capable
+  virtual void determine_capable(const char *env_var) {}
+  // Bind the current thread to os proc
+  virtual void bind_thread(int proc) {}
+  // Factory functions to allocate/deallocate a mask
+  virtual Mask *allocate_mask() { return nullptr; }
+  virtual void deallocate_mask(Mask *m) {}
+  virtual Mask *allocate_mask_array(int num) { return nullptr; }
+  virtual void deallocate_mask_array(Mask *m) {}
+  virtual Mask *index_mask_array(Mask *m, int index) { return nullptr; }
+  static void pick_api();
+  static void destroy_api();
+  enum api_type {
+    NATIVE_OS
+#if KMP_USE_HWLOC
+    ,
+    HWLOC
+#endif
+  };
+  virtual api_type get_api_type() const {
+    KMP_ASSERT(0); // NOTE(review): presumably must be overridden - confirm
+    return NATIVE_OS;
+  }
+
+private:
+  static bool picked_api; // NOTE(review): presumably set by pick_api - confirm
+};
+
+typedef KMPAffinity::Mask kmp_affin_mask_t;
+extern KMPAffinity *__kmp_affinity_dispatch;
+
+// Declare local char buffers with this size for printing debug and info
+// messages, using __kmp_affinity_print_mask().
+#define KMP_AFFIN_MASK_PRINT_LEN 1024
+
+enum affinity_type {
+  affinity_none = 0,
+  affinity_physical,
+  affinity_logical,
+  affinity_compact,
+  affinity_scatter,
+  affinity_explicit,
+  affinity_balanced,
+  affinity_disabled, // not used outside the env var parser
+  affinity_default
+};
+
+enum affinity_gran {
+  affinity_gran_fine = 0,
+  affinity_gran_thread,
+  affinity_gran_core,
+  affinity_gran_tile,
+  affinity_gran_numa,
+  affinity_gran_package,
+  affinity_gran_node,
+#if KMP_GROUP_AFFINITY
+  // The "group" granularity isn't necessarily coarser than all of the
+  // other levels, but we put it last in the enum.
+  affinity_gran_group,
+#endif /* KMP_GROUP_AFFINITY */
+  affinity_gran_default
+};
+
+enum affinity_top_method {
+  affinity_top_method_all = 0, // try all (supported) methods, in order
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+  affinity_top_method_apicid,
+  affinity_top_method_x2apicid,
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+  affinity_top_method_cpuinfo, // KMP_CPUINFO_FILE is usable on Windows* OS, too
+#if KMP_GROUP_AFFINITY
+  affinity_top_method_group,
+#endif /* KMP_GROUP_AFFINITY */
+  affinity_top_method_flat,
+#if KMP_USE_HWLOC
+  affinity_top_method_hwloc,
+#endif
+  affinity_top_method_default
+};
+
+#define affinity_respect_mask_default (-1)
+
+extern enum affinity_type __kmp_affinity_type; /* Affinity type */
+extern enum affinity_gran __kmp_affinity_gran; /* Affinity granularity */
+extern int __kmp_affinity_gran_levels; /* corresponding int value */
+extern int __kmp_affinity_dups; /* Affinity duplicate masks */
+extern enum affinity_top_method __kmp_affinity_top_method;
+extern int __kmp_affinity_compact; /* Affinity 'compact' value */
+extern int __kmp_affinity_offset; /* Affinity offset value  */
+extern int __kmp_affinity_verbose; /* Was verbose specified for KMP_AFFINITY? */
+extern int __kmp_affinity_warnings; /* KMP_AFFINITY warnings enabled ? */
+extern int __kmp_affinity_respect_mask; // Respect process' init affinity mask?
+extern char *__kmp_affinity_proclist; /* proc ID list */
+extern kmp_affin_mask_t *__kmp_affinity_masks;
+extern unsigned __kmp_affinity_num_masks;
+extern void __kmp_affinity_bind_thread(int which);
+
+extern kmp_affin_mask_t *__kmp_affin_fullMask;
+extern char *__kmp_cpuinfo_file;
+
+#endif /* KMP_AFFINITY_SUPPORTED */
+
+// This needs to be kept in sync with the values in omp.h !!!
+typedef enum kmp_proc_bind_t {
+  proc_bind_false = 0,
+  proc_bind_true,
+  proc_bind_master,
+  proc_bind_close,
+  proc_bind_spread,
+  proc_bind_intel, // use KMP_AFFINITY interface
+  proc_bind_default
+} kmp_proc_bind_t;
+
+typedef struct kmp_nested_proc_bind_t {
+  kmp_proc_bind_t *bind_types;
+  int size;
+  int used;
+} kmp_nested_proc_bind_t;
+
+extern kmp_nested_proc_bind_t __kmp_nested_proc_bind;
+
+extern int __kmp_display_affinity;
+extern char *__kmp_affinity_format;
+static const size_t KMP_AFFINITY_FORMAT_SIZE = 512;
+
+#if KMP_AFFINITY_SUPPORTED
+#define KMP_PLACE_ALL (-1)
+#define KMP_PLACE_UNDEFINED (-2)
+// Is KMP_AFFINITY being used instead of OMP_PROC_BIND/OMP_PLACES?
+#define KMP_AFFINITY_NON_PROC_BIND                                             \
+  ((__kmp_nested_proc_bind.bind_types[0] == proc_bind_false ||                 \
+    __kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) &&                \
+   (__kmp_affinity_num_masks > 0 || __kmp_affinity_type == affinity_balanced))
+#endif /* KMP_AFFINITY_SUPPORTED */
+
+extern int __kmp_affinity_num_places;
+
+typedef enum kmp_cancel_kind_t {
+  cancel_noreq = 0,
+  cancel_parallel = 1,
+  cancel_loop = 2,
+  cancel_sections = 3,
+  cancel_taskgroup = 4
+} kmp_cancel_kind_t;
+
+// KMP_HW_SUBSET support:
+typedef struct kmp_hws_item {
+  int num;
+  int offset;
+} kmp_hws_item_t;
+
+extern kmp_hws_item_t __kmp_hws_socket;
+extern kmp_hws_item_t __kmp_hws_node;
+extern kmp_hws_item_t __kmp_hws_tile;
+extern kmp_hws_item_t __kmp_hws_core;
+extern kmp_hws_item_t __kmp_hws_proc;
+extern int __kmp_hws_requested;
+extern int __kmp_hws_abs_flag; // absolute or per-item number requested
+
+/* ------------------------------------------------------------------------ */
+
+#define KMP_PAD(type, sz) /* sizeof(type) rounded up to a multiple of sz */    \
+  (sizeof(type) + ((sz) - ((sizeof(type) - 1) % (sz)) - 1))
+
+// We need to avoid using -1 as a GTID as +1 is added to the gtid
+// when storing it in a lock, and the value 0 is reserved.
+#define KMP_GTID_DNE (-2) /* Does not exist */
+#define KMP_GTID_SHUTDOWN (-3) /* Library is shutting down */
+#define KMP_GTID_MONITOR (-4) /* Monitor thread ID */
+#define KMP_GTID_UNKNOWN (-5) /* Is not known */
+#define KMP_GTID_MIN (-6) /* Minimal gtid for low bound check in DEBUG */
+
+/* OpenMP 5.0 Memory Management support */
+
+#ifndef __OMP_H
+// Duplicate type definitions from omp.h
+typedef uintptr_t omp_uintptr_t;
+
+typedef enum {
+  OMP_ATK_THREADMODEL = 1,
+  OMP_ATK_ALIGNMENT = 2,
+  OMP_ATK_ACCESS = 3,
+  OMP_ATK_POOL_SIZE = 4,
+  OMP_ATK_FALLBACK = 5,
+  OMP_ATK_FB_DATA = 6,
+  OMP_ATK_PINNED = 7,
+  OMP_ATK_PARTITION = 8
+} omp_alloctrait_key_t;
+
+typedef enum {
+  OMP_ATV_FALSE = 0,
+  OMP_ATV_TRUE = 1,
+  OMP_ATV_DEFAULT = 2,
+  OMP_ATV_CONTENDED = 3,
+  OMP_ATV_UNCONTENDED = 4,
+  OMP_ATV_SEQUENTIAL = 5,
+  OMP_ATV_PRIVATE = 6,
+  OMP_ATV_ALL = 7,
+  OMP_ATV_THREAD = 8,
+  OMP_ATV_PTEAM = 9,
+  OMP_ATV_CGROUP = 10,
+  OMP_ATV_DEFAULT_MEM_FB = 11,
+  OMP_ATV_NULL_FB = 12,
+  OMP_ATV_ABORT_FB = 13,
+  OMP_ATV_ALLOCATOR_FB = 14,
+  OMP_ATV_ENVIRONMENT = 15,
+  OMP_ATV_NEAREST = 16,
+  OMP_ATV_BLOCKED = 17,
+  OMP_ATV_INTERLEAVED = 18
+} omp_alloctrait_value_t;
+
+typedef void *omp_memspace_handle_t;
+extern omp_memspace_handle_t const omp_default_mem_space;
+extern omp_memspace_handle_t const omp_large_cap_mem_space;
+extern omp_memspace_handle_t const omp_const_mem_space;
+extern omp_memspace_handle_t const omp_high_bw_mem_space;
+extern omp_memspace_handle_t const omp_low_lat_mem_space;
+
+typedef struct {
+  omp_alloctrait_key_t key;
+  omp_uintptr_t value;
+} omp_alloctrait_t;
+
+typedef void *omp_allocator_handle_t;
+extern omp_allocator_handle_t const omp_null_allocator;
+extern omp_allocator_handle_t const omp_default_mem_alloc;
+extern omp_allocator_handle_t const omp_large_cap_mem_alloc;
+extern omp_allocator_handle_t const omp_const_mem_alloc;
+extern omp_allocator_handle_t const omp_high_bw_mem_alloc;
+extern omp_allocator_handle_t const omp_low_lat_mem_alloc;
+extern omp_allocator_handle_t const omp_cgroup_mem_alloc;
+extern omp_allocator_handle_t const omp_pteam_mem_alloc;
+extern omp_allocator_handle_t const omp_thread_mem_alloc;
+extern omp_allocator_handle_t const kmp_max_mem_alloc;
+extern omp_allocator_handle_t __kmp_def_allocator;
+
+// end of duplicate type definitions from omp.h
+#endif
+
+extern int __kmp_memkind_available;
+
+typedef omp_memspace_handle_t kmp_memspace_t; // placeholder
+
+typedef struct kmp_allocator_t {
+  omp_memspace_handle_t memspace;
+  void **memkind; // pointer to memkind
+  int alignment;
+  omp_alloctrait_value_t fb;
+  kmp_allocator_t *fb_data;
+  kmp_uint64 pool_size;
+  kmp_uint64 pool_used;
+} kmp_allocator_t;
+
+extern omp_allocator_handle_t __kmpc_init_allocator(int gtid,
+                                                    omp_memspace_handle_t,
+                                                    int ntraits,
+                                                    omp_alloctrait_t traits[]);
+extern void __kmpc_destroy_allocator(int gtid, omp_allocator_handle_t al);
+extern void __kmpc_set_default_allocator(int gtid, omp_allocator_handle_t al);
+extern omp_allocator_handle_t __kmpc_get_default_allocator(int gtid);
+extern void *__kmpc_alloc(int gtid, size_t sz, omp_allocator_handle_t al);
+extern void __kmpc_free(int gtid, void *ptr, omp_allocator_handle_t al);
+
+extern void __kmp_init_memkind();
+extern void __kmp_fini_memkind();
+
+/* ------------------------------------------------------------------------ */
+
+#define KMP_UINT64_MAX /* value is 2^63 - 1: all bits set but the top one */   \
+  (~((kmp_uint64)1 << ((sizeof(kmp_uint64) * (1 << 3)) - 1)))
+
+#define KMP_MIN_NTH 1
+
+#ifndef KMP_MAX_NTH
+#if defined(PTHREAD_THREADS_MAX) && PTHREAD_THREADS_MAX < INT_MAX
+#define KMP_MAX_NTH PTHREAD_THREADS_MAX
+#else
+#define KMP_MAX_NTH INT_MAX
+#endif
+#endif /* KMP_MAX_NTH */
+
+#ifdef PTHREAD_STACK_MIN
+#define KMP_MIN_STKSIZE PTHREAD_STACK_MIN
+#else
+#define KMP_MIN_STKSIZE ((size_t)(32 * 1024))
+#endif
+
+#define KMP_MAX_STKSIZE (~((size_t)1 << ((sizeof(size_t) * (1 << 3)) - 1)))
+
+#if KMP_ARCH_X86
+#define KMP_DEFAULT_STKSIZE ((size_t)(2 * 1024 * 1024))
+#elif KMP_ARCH_X86_64
+#define KMP_DEFAULT_STKSIZE ((size_t)(4 * 1024 * 1024))
+#define KMP_BACKUP_STKSIZE ((size_t)(2 * 1024 * 1024))
+#else
+#define KMP_DEFAULT_STKSIZE ((size_t)(1024 * 1024))
+#endif
+
+#define KMP_DEFAULT_MALLOC_POOL_INCR ((size_t)(1024 * 1024))
+#define KMP_MIN_MALLOC_POOL_INCR ((size_t)(4 * 1024))
+#define KMP_MAX_MALLOC_POOL_INCR                                               \
+  (~((size_t)1 << ((sizeof(size_t) * (1 << 3)) - 1)))
+
+#define KMP_MIN_STKOFFSET (0)
+#define KMP_MAX_STKOFFSET KMP_MAX_STKSIZE
+#if KMP_OS_DARWIN
+#define KMP_DEFAULT_STKOFFSET KMP_MIN_STKOFFSET
+#else
+#define KMP_DEFAULT_STKOFFSET CACHE_LINE
+#endif
+
+#define KMP_MIN_STKPADDING (0)
+#define KMP_MAX_STKPADDING (2 * 1024 * 1024)
+
+#define KMP_BLOCKTIME_MULTIPLIER                                               \
+  (1000) /* number of blocktime units per second */
+#define KMP_MIN_BLOCKTIME (0)
+#define KMP_MAX_BLOCKTIME                                                      \
+  (INT_MAX) /* Must be this for the "infinite" setting to work */
+#define KMP_DEFAULT_BLOCKTIME (200) /*  __kmp_blocktime is in milliseconds  */
+
+#if KMP_USE_MONITOR
+#define KMP_DEFAULT_MONITOR_STKSIZE ((size_t)(64 * 1024))
+#define KMP_MIN_MONITOR_WAKEUPS (1) // min times monitor wakes up per second
+#define KMP_MAX_MONITOR_WAKEUPS (1000) // max times monitor can wake up per sec
+
+/* Calculate new number of monitor wakeups for a specific block time based on
+   previous monitor_wakeups. Only allow increasing number of wakeups */
+#define KMP_WAKEUPS_FROM_BLOCKTIME(blocktime, monitor_wakeups)                 \
+  (((blocktime) == KMP_MAX_BLOCKTIME)                                          \
+       ? (monitor_wakeups)                                                     \
+       : ((blocktime) == KMP_MIN_BLOCKTIME)                                    \
+             ? KMP_MAX_MONITOR_WAKEUPS                                         \
+             : ((monitor_wakeups) > (KMP_BLOCKTIME_MULTIPLIER / (blocktime)))  \
+                   ? (monitor_wakeups)                                         \
+                   : (KMP_BLOCKTIME_MULTIPLIER) / (blocktime))
+
+/* Calculate number of intervals for a specific block time based on
+   monitor_wakeups */
+#define KMP_INTERVALS_FROM_BLOCKTIME(blocktime, monitor_wakeups)               \
+  (((blocktime) + (KMP_BLOCKTIME_MULTIPLIER / (monitor_wakeups)) - 1) /        \
+   (KMP_BLOCKTIME_MULTIPLIER / (monitor_wakeups)))
+#else
+#define KMP_BLOCKTIME(team, tid)                                               \
+  (get__bt_set(team, tid) ? get__blocktime(team, tid) : __kmp_dflt_blocktime)
+#if KMP_OS_UNIX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
+// HW TSC is used to reduce overhead (clock tick instead of nanosecond).
+extern kmp_uint64 __kmp_ticks_per_msec;
+#if KMP_COMPILER_ICC
+#define KMP_NOW() ((kmp_uint64)_rdtsc())
+#else
+#define KMP_NOW() __kmp_hardware_timestamp()
+#endif
+#define KMP_NOW_MSEC() (KMP_NOW() / __kmp_ticks_per_msec)
+#define KMP_BLOCKTIME_INTERVAL(team, tid)                                      \
+  (KMP_BLOCKTIME(team, tid) * __kmp_ticks_per_msec)
+#define KMP_BLOCKING(goal, count) ((goal) > KMP_NOW())
+#else
+// System time is retrieved sporadically while blocking.
+extern kmp_uint64 __kmp_now_nsec();
+#define KMP_NOW() __kmp_now_nsec()
+#define KMP_NOW_MSEC() (KMP_NOW() / KMP_USEC_PER_SEC)
+#define KMP_BLOCKTIME_INTERVAL(team, tid)                                      \
+  (KMP_BLOCKTIME(team, tid) * KMP_USEC_PER_SEC)
+#define KMP_BLOCKING(goal, count) ((count) % 1000 != 0 || (goal) > KMP_NOW())
+#endif
+#endif // KMP_USE_MONITOR
+
+#define KMP_MIN_STATSCOLS 40
+#define KMP_MAX_STATSCOLS 4096
+#define KMP_DEFAULT_STATSCOLS 80
+
+#define KMP_MIN_INTERVAL 0
+#define KMP_MAX_INTERVAL (INT_MAX - 1)
+#define KMP_DEFAULT_INTERVAL 0
+
+#define KMP_MIN_CHUNK 1
+#define KMP_MAX_CHUNK (INT_MAX - 1)
+#define KMP_DEFAULT_CHUNK 1
+
+#define KMP_DFLT_DISP_NUM_BUFF 7
+#define KMP_MAX_ORDERED 8
+
+#define KMP_MAX_FIELDS 32
+
+#define KMP_MAX_BRANCH_BITS 31
+
+#define KMP_MAX_ACTIVE_LEVELS_LIMIT INT_MAX
+
+#define KMP_MAX_DEFAULT_DEVICE_LIMIT INT_MAX
+
+#define KMP_MAX_TASK_PRIORITY_LIMIT INT_MAX
+
+/* Minimum number of threads before switch to TLS gtid (experimentally
+   determined) */
+/* josh TODO: what about OS X* tuning? */
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+#define KMP_TLS_GTID_MIN 5
+#else
+#define KMP_TLS_GTID_MIN INT_MAX
+#endif
+
+#define KMP_MASTER_TID(tid) ((tid) == 0)
+#define KMP_WORKER_TID(tid) ((tid) != 0)
+
+#define KMP_MASTER_GTID(gtid) (__kmp_tid_from_gtid((gtid)) == 0)
+#define KMP_WORKER_GTID(gtid) (__kmp_tid_from_gtid((gtid)) != 0)
+#define KMP_INITIAL_GTID(gtid) ((gtid) == 0)
+
+#ifndef TRUE
+#define FALSE 0
+#define TRUE (!FALSE)
+#endif
+
+/* NOTE: all of the following constants must be even */
+
+#if KMP_OS_WINDOWS
+#define KMP_INIT_WAIT 64U /* initial number of spin-tests    */
+#define KMP_NEXT_WAIT 32U /* subsequent number of spin-tests */
+#elif KMP_OS_CNK
+#define KMP_INIT_WAIT 16U /* initial number of spin-tests    */
+#define KMP_NEXT_WAIT 8U /* subsequent number of spin-tests */
+#elif KMP_OS_LINUX
+#define KMP_INIT_WAIT 1024U /* initial number of spin-tests    */
+#define KMP_NEXT_WAIT 512U /* subsequent number of spin-tests */
+#elif KMP_OS_DARWIN
+/* TODO: tune for KMP_OS_DARWIN */
+#define KMP_INIT_WAIT 1024U /* initial number of spin-tests    */
+#define KMP_NEXT_WAIT 512U /* subsequent number of spin-tests */
+#elif KMP_OS_DRAGONFLY
+/* TODO: tune for KMP_OS_DRAGONFLY */
+#define KMP_INIT_WAIT 1024U /* initial number of spin-tests    */
+#define KMP_NEXT_WAIT 512U /* subsequent number of spin-tests */
+#elif KMP_OS_FREEBSD
+/* TODO: tune for KMP_OS_FREEBSD */
+#define KMP_INIT_WAIT 1024U /* initial number of spin-tests    */
+#define KMP_NEXT_WAIT 512U /* subsequent number of spin-tests */
+#elif KMP_OS_NETBSD
+/* TODO: tune for KMP_OS_NETBSD */
+#define KMP_INIT_WAIT 1024U /* initial number of spin-tests    */
+#define KMP_NEXT_WAIT 512U /* subsequent number of spin-tests */
+#elif KMP_OS_HURD
+/* TODO: tune for KMP_OS_HURD */
+#define KMP_INIT_WAIT 1024U /* initial number of spin-tests    */
+#define KMP_NEXT_WAIT 512U /* subsequent number of spin-tests */
+#elif KMP_OS_OPENBSD
+/* TODO: tune for KMP_OS_OPENBSD */
+#define KMP_INIT_WAIT 1024U /* initial number of spin-tests    */
+#define KMP_NEXT_WAIT 512U /* subsequent number of spin-tests */
+#endif
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+typedef struct kmp_cpuid {
+  kmp_uint32 eax; // EAX register value produced by the CPUID instruction
+  kmp_uint32 ebx; // EBX register value produced by the CPUID instruction
+  kmp_uint32 ecx; // ECX register value produced by the CPUID instruction
+  kmp_uint32 edx; // EDX register value produced by the CPUID instruction
+} kmp_cpuid_t;
+
+typedef struct kmp_cpuinfo {
+  int initialized; // If 0, other fields are not initialized.
+  int signature; // CPUID(1).EAX
+  int family; // CPUID(1).EAX[27:20]+CPUID(1).EAX[11:8] (Extended Family+Family)
+  int model; // (CPUID(1).EAX[19:16] << 4) + CPUID(1).EAX[7:4],
+  // i.e. (Extended Model << 4) + Model
+  int stepping; // CPUID(1).EAX[3:0] ( Stepping )
+  int sse2; // 0 if SSE2 instructions are not supported, 1 otherwise.
+  int rtm; // 0 if RTM instructions are not supported, 1 otherwise.
+  int cpu_stackoffset;
+  int apic_id;
+  int physical_id;
+  int logical_id;
+  kmp_uint64 frequency; // Nominal CPU frequency in Hz.
+  char name[3 * sizeof(kmp_cpuid_t)]; // CPUID(0x80000002,0x80000003,0x80000004)
+} kmp_cpuinfo_t;
+
+extern void __kmp_query_cpuid(kmp_cpuinfo_t *p);
+
+#if KMP_OS_UNIX
+// Run CPUID with EAX=leaf and ECX=subleaf, storing EAX/EBX/ECX/EDX into *p.
+// subleaf is only needed for cache and topology discovery; usually pass 0.
+static inline void __kmp_x86_cpuid(int leaf, int subleaf, struct kmp_cpuid *p) {
+  __asm__ __volatile__("cpuid"
+                       : "=a"(p->eax), "=b"(p->ebx), "=c"(p->ecx), "=d"(p->edx)
+                       : "a"(leaf), "c"(subleaf));
+}
+// Load p into FPU control word
+static inline void __kmp_load_x87_fpu_control_word(const kmp_int16 *p) {
+  __asm__ __volatile__("fldcw %0" : : "m"(*p));
+}
+// Store FPU control word into p
+static inline void __kmp_store_x87_fpu_control_word(kmp_int16 *p) {
+  __asm__ __volatile__("fstcw %0" : "=m"(*p));
+}
+static inline void __kmp_clear_x87_fpu_status_word() {
+#if KMP_MIC
+  // 32-bit protected mode x87 FPU state
+  struct x87_fpu_state {
+    unsigned cw; // control word
+    unsigned sw; // status word (the only field modified below)
+    unsigned tw; // tag word
+    unsigned fip; // FPU instruction pointer offset
+    unsigned fips; // FPU instruction pointer selector
+    unsigned fdp; // FPU data (operand) pointer offset
+    unsigned fds; // FPU data (operand) pointer selector
+  };
+  struct x87_fpu_state fpu_state = {0, 0, 0, 0, 0, 0, 0};
+  __asm__ __volatile__("fstenv %0\n\t" // store FP env
+                       "andw $0x7f00, %1\n\t" // clear 0-7,15 bits of FP SW
+                       "fldenv %0\n\t" // load FP env back
+                       : "+m"(fpu_state), "+m"(fpu_state.sw));
+#else
+  __asm__ __volatile__("fnclex"); // clear the x87 exception flags directly
+#endif // KMP_MIC
+}
+#if __SSE__
+static inline void __kmp_load_mxcsr(const kmp_uint32 *p) { _mm_setcsr(*p); }
+static inline void __kmp_store_mxcsr(kmp_uint32 *p) { *p = _mm_getcsr(); }
+#else
+static inline void __kmp_load_mxcsr(const kmp_uint32 *p) {}
+static inline void __kmp_store_mxcsr(kmp_uint32 *p) { *p = 0; }
+#endif
+#else
+// Windows still has these as external functions in assembly file
+extern void __kmp_x86_cpuid(int mode, int mode2, struct kmp_cpuid *p);
+extern void __kmp_load_x87_fpu_control_word(const kmp_int16 *p);
+extern void __kmp_store_x87_fpu_control_word(kmp_int16 *p);
+extern void __kmp_clear_x87_fpu_status_word();
+static inline void __kmp_load_mxcsr(const kmp_uint32 *p) { _mm_setcsr(*p); }
+static inline void __kmp_store_mxcsr(kmp_uint32 *p) { *p = _mm_getcsr(); }
+#endif // KMP_OS_UNIX
+
+#define KMP_X86_MXCSR_MASK 0xffffffc0 /* ignore status flags (6 lsb) */
+
+#if KMP_ARCH_X86
+extern void __kmp_x86_pause(void);
+#elif KMP_MIC
+// Performance testing on KNC (C0QS-7120 P/A/X/D, 61-core, 16 GB Memory) showed
+// regression after removal of extra PAUSE from spin loops. Changing
+// the delay from 100 to 300 showed even better performance than double PAUSE
+// on Spec OMP2001 and LCPC tasking tests, no regressions on EPCC.
+static inline void __kmp_x86_pause(void) { _mm_delay_32(300); }
+#else
+static inline void __kmp_x86_pause(void) { _mm_pause(); }
+#endif
+#define KMP_CPU_PAUSE() __kmp_x86_pause()
+#elif KMP_ARCH_PPC64
+#define KMP_PPC64_PRI_LOW() __asm__ volatile("or 1, 1, 1")
+#define KMP_PPC64_PRI_MED() __asm__ volatile("or 2, 2, 2")
+#define KMP_PPC64_PRI_LOC_MB() __asm__ volatile("" : : : "memory")
+#define KMP_CPU_PAUSE()                                                        \
+  do {                                                                         \
+    KMP_PPC64_PRI_LOW();                                                       \
+    KMP_PPC64_PRI_MED();                                                       \
+    KMP_PPC64_PRI_LOC_MB();                                                    \
+  } while (0)
+#else
+#define KMP_CPU_PAUSE() /* nothing to do */
+#endif
+
+#define KMP_INIT_YIELD(count)                                                  \
+  { (count) = __kmp_yield_init; }
+
+#define KMP_OVERSUBSCRIBED                                                     \
+  (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc))
+
+#define KMP_TRY_YIELD                                                          \
+  ((__kmp_use_yield == 1) || (__kmp_use_yield == 2 && (KMP_OVERSUBSCRIBED)))
+
+#define KMP_TRY_YIELD_OVERSUB                                                  \
+  ((__kmp_use_yield == 1 || __kmp_use_yield == 2) && (KMP_OVERSUBSCRIBED))
+
+#define KMP_YIELD(cond)                                                        \
+  {                                                                            \
+    KMP_CPU_PAUSE();                                                           \
+    if ((cond) && (KMP_TRY_YIELD))                                             \
+      __kmp_yield();                                                           \
+  }
+
+#define KMP_YIELD_OVERSUB()                                                    \
+  {                                                                            \
+    KMP_CPU_PAUSE();                                                           \
+    if ((KMP_TRY_YIELD_OVERSUB))                                               \
+      __kmp_yield();                                                           \
+  }
+
+// Note the decrement of 2 in the following Macros. With KMP_LIBRARY=turnaround,
+// there should be no yielding since initial value from KMP_INIT_YIELD() is odd.
+#define KMP_YIELD_SPIN(count)                                                  \
+  {                                                                            \
+    KMP_CPU_PAUSE();                                                           \
+    if (KMP_TRY_YIELD) {                                                       \
+      (count) -= 2;                                                            \
+      if (!(count)) {                                                          \
+        __kmp_yield();                                                         \
+        (count) = __kmp_yield_next;                                            \
+      }                                                                        \
+    }                                                                          \
+  }
+
+#define KMP_YIELD_OVERSUB_ELSE_SPIN(count)                                     \
+  {                                                                            \
+    KMP_CPU_PAUSE();                                                           \
+    if ((KMP_TRY_YIELD_OVERSUB))                                               \
+      __kmp_yield();                                                           \
+    else if (__kmp_use_yield == 1) {                                           \
+      (count) -= 2;                                                            \
+      if (!(count)) {                                                          \
+        __kmp_yield();                                                         \
+        (count) = __kmp_yield_next;                                            \
+      }                                                                        \
+    }                                                                          \
+  }
+
+/* ------------------------------------------------------------------------ */
+/* Support datatypes for the orphaned construct nesting checks.             */
+/* ------------------------------------------------------------------------ */
+
+enum cons_type { // construct kinds used by the nesting checks
+  ct_none = 0,
+  ct_parallel = 1,
+  ct_pdo = 2,
+  ct_pdo_ordered = 3,
+  ct_psections = 4,
+  ct_psingle = 5,
+  ct_critical = 6,
+  ct_ordered_in_parallel = 7,
+  ct_ordered_in_pdo = 8,
+  ct_master = 9,
+  ct_reduce = 10,
+  ct_barrier = 11
+};
+
+#define IS_CONS_TYPE_ORDERED(ct) ((ct) == ct_pdo_ordered)
+
+struct cons_data {
+  ident_t const *ident;
+  enum cons_type type;
+  int prev;
+  kmp_user_lock_p
+      name; /* address exclusively for critical section name comparison */
+};
+
+struct cons_header {
+  int p_top, w_top, s_top;
+  int stack_size, stack_top;
+  struct cons_data *stack_data;
+};
+
+struct kmp_region_info {
+  char *text;
+  int offset[KMP_MAX_FIELDS];
+  int length[KMP_MAX_FIELDS];
+};
+
+/* ---------------------------------------------------------------------- */
+/* ---------------------------------------------------------------------- */
+
+#if KMP_OS_WINDOWS
+typedef HANDLE kmp_thread_t;
+typedef DWORD kmp_key_t;
+#endif /* KMP_OS_WINDOWS */
+
+#if KMP_OS_UNIX
+typedef pthread_t kmp_thread_t;
+typedef pthread_key_t kmp_key_t;
+#endif
+
+extern kmp_key_t __kmp_gtid_threadprivate_key;
+
+typedef struct kmp_sys_info {
+  long maxrss; /* the maximum resident set size utilized (in kilobytes)     */
+  long minflt; /* the number of page faults serviced without any I/O        */
+  long majflt; /* the number of page faults serviced that required I/O      */
+  long nswap; /* the number of times a process was "swapped" out of memory */
+  long inblock; /* the number of times the file system had to perform input  */
+  long oublock; /* the number of times the file system had to perform output */
+  long nvcsw; /* the number of times a context switch was voluntarily      */
+  long nivcsw; /* the number of times a context switch was forced           */
+} kmp_sys_info_t;
+
+#if USE_ITT_BUILD
+// We cannot include "kmp_itt.h" due to circular dependency. Declare the only
+// required type here. Later we will check the type meets requirements.
+typedef int kmp_itt_mark_t;
+#define KMP_ITT_DEBUG 0
+#endif /* USE_ITT_BUILD */
+
+typedef kmp_int32 kmp_critical_name[8];
+
+/*!
+@ingroup PARALLEL
+The type for a microtask which gets passed to @ref __kmpc_fork_call().
+The arguments to the outlined function are
+@param global_tid the global thread identity of the thread executing the
+function.
+@param bound_tid  the local identity of the thread executing the function
+@param ... pointers to shared variables accessed by the function.
+*/
+typedef void (*kmpc_micro)(kmp_int32 *global_tid, kmp_int32 *bound_tid, ...);
+typedef void (*kmpc_micro_bound)(kmp_int32 *bound_tid, kmp_int32 *bound_nth,
+                                 ...);
+
+/*!
+@ingroup THREADPRIVATE
+@{
+*/
+/* ---------------------------------------------------------------------------
+ */
+/* Threadprivate initialization/finalization function declarations */
+
+/*  for non-array objects:  __kmpc_threadprivate_register()  */
+
+/*!
+ Pointer to the constructor function.
+ The first argument is the <tt>this</tt> pointer
+*/
+typedef void *(*kmpc_ctor)(void *);
+
+/*!
+ Pointer to the destructor function.
+ The first argument is the <tt>this</tt> pointer
+*/
+typedef void (*kmpc_dtor)(
+    void * /*, size_t */); /* 2nd arg: magic number for KCC unused by Intel
+                              compiler */
+/*!
+ Pointer to an alternate constructor.
+ The first argument is the <tt>this</tt> pointer.
+*/
+typedef void *(*kmpc_cctor)(void *, void *);
+
+/* for array objects: __kmpc_threadprivate_register_vec() */
+/* First arg: "this" pointer */
+/* Last arg: number of array elements */
+/*!
+ Array constructor.
+ First argument is the <tt>this</tt> pointer
+ Second argument the number of array elements.
+*/
+typedef void *(*kmpc_ctor_vec)(void *, size_t);
+/*!
+ Pointer to the array destructor function.
+ The first argument is the <tt>this</tt> pointer
+ Second argument the number of array elements.
+*/
+typedef void (*kmpc_dtor_vec)(void *, size_t);
+/*!
+ Alternate array constructor (the array form of kmpc_cctor).
+ First argument is the <tt>this</tt> pointer
+ Third argument the number of array elements.
+*/
+typedef void *(*kmpc_cctor_vec)(void *, void *,
+                                size_t); /* function unused by compiler */
+
+/*!
+@}
+*/
+
+/* keeps track of threadprivate cache allocations for cleanup later */
+typedef struct kmp_cached_addr {
+  void **addr; /* address of allocated cache */
+  void ***compiler_cache; /* pointer to compiler's cache */
+  void *data; /* pointer to global data */
+  struct kmp_cached_addr *next; /* pointer to next cached address */
+} kmp_cached_addr_t;
+
+struct private_data {
+  struct private_data *next; /* The next descriptor in the list      */
+  void *data; /* The data buffer for this descriptor  */
+  int more; /* The repeat count for this descriptor */
+  size_t size; /* The data size for this descriptor    */
+};
+
+struct private_common {
+  struct private_common *next;
+  struct private_common *link;
+  void *gbl_addr;
+  void *par_addr; /* par_addr == gbl_addr for MASTER thread */
+  size_t cmn_size;
+};
+
+struct shared_common {
+  struct shared_common *next;
+  struct private_data *pod_init;
+  void *obj_init;
+  void *gbl_addr;
+  union {
+    kmpc_ctor ctor;
+    kmpc_ctor_vec ctorv;
+  } ct;
+  union {
+    kmpc_cctor cctor;
+    kmpc_cctor_vec cctorv;
+  } cct;
+  union {
+    kmpc_dtor dtor;
+    kmpc_dtor_vec dtorv;
+  } dt;
+  size_t vec_len;
+  int is_vec;
+  size_t cmn_size;
+};
+
+#define KMP_HASH_TABLE_LOG2 9 /* log2 of the hash table size */
+#define KMP_HASH_TABLE_SIZE                                                    \
+  (1 << KMP_HASH_TABLE_LOG2) /* size of the hash table */
+#define KMP_HASH_SHIFT 3 /* throw away this many low bits from the address */
+#define KMP_HASH(x) /* hash-table bucket index for the address x */            \
+  ((((kmp_uintptr_t)(x)) >> KMP_HASH_SHIFT) & (KMP_HASH_TABLE_SIZE - 1))
+
+struct common_table {
+  struct private_common *data[KMP_HASH_TABLE_SIZE];
+};
+
+struct shared_table {
+  struct shared_common *data[KMP_HASH_TABLE_SIZE];
+};
+
+/* ------------------------------------------------------------------------ */
+
+#if KMP_USE_HIER_SCHED
+// Shared barrier data that exists inside a single unit of the scheduling
+// hierarchy
+typedef struct kmp_hier_private_bdata_t {
+  kmp_int32 num_active;
+  kmp_uint64 index;
+  kmp_uint64 wait_val[2];
+} kmp_hier_private_bdata_t;
+#endif
+
+typedef struct kmp_sched_flags {
+  unsigned ordered : 1;
+  unsigned nomerge : 1;
+  unsigned contains_last : 1;
+#if KMP_USE_HIER_SCHED
+  unsigned use_hier : 1;
+  unsigned unused : 28;
+#else
+  unsigned unused : 29;
+#endif
+} kmp_sched_flags_t;
+
+KMP_BUILD_ASSERT(sizeof(kmp_sched_flags_t) == 4);
+
+#if KMP_STATIC_STEAL_ENABLED
+typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
+  kmp_int32 count; // current chunk number for static & static-steal scheduling
+  kmp_int32 ub; /* upper-bound */
+  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
+  kmp_int32 lb; /* lower-bound */
+  kmp_int32 st; /* stride */
+  kmp_int32 tc; /* trip count (number of iterations) */
+  kmp_int32 static_steal_counter; /* for static_steal only; maybe better to put
+                                     after ub */
+
+  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
+  //    a) parm3 is properly aligned and
+  //    b) all parm1-4 are in the same cache line.
+  // Because of parm1-4 are used together, performance seems to be better
+  // if they are in the same line (not measured though).
+
+  struct KMP_ALIGN(32) { // AC: changed 16 to 32 in order to simplify template
+    kmp_int32 parm1; //     structures in kmp_dispatch.cpp. This should
+    kmp_int32 parm2; //     make no real change at least while padding is off.
+    kmp_int32 parm3;
+    kmp_int32 parm4;
+  };
+
+  kmp_uint32 ordered_lower;
+  kmp_uint32 ordered_upper;
+#if KMP_OS_WINDOWS
+  // This var can be placed in the hole between 'tc' and 'parm1', instead of
+  // 'static_steal_counter'. It would be nice to measure execution times.
+  // Conditional if/endif can be removed at all.
+  kmp_int32 last_upper;
+#endif /* KMP_OS_WINDOWS */
+} dispatch_private_info32_t;
+
+typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
+  kmp_int64 count; // current chunk number for static & static-steal scheduling
+  kmp_int64 ub; /* upper-bound */
+  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
+  kmp_int64 lb; /* lower-bound */
+  kmp_int64 st; /* stride */
+  kmp_int64 tc; /* trip count (number of iterations) */
+  kmp_int64 static_steal_counter; /* for static_steal only; maybe better to put
+                                     after ub */
+
+  /* parm[1-4] are used in different ways by different scheduling algorithms */
+
+  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
+  //    a) parm3 is properly aligned and
+  //    b) all parm1-4 are in the same cache line.
+  // Because of parm1-4 are used together, performance seems to be better
+  // if they are in the same line (not measured though).
+
+  struct KMP_ALIGN(32) {
+    kmp_int64 parm1;
+    kmp_int64 parm2;
+    kmp_int64 parm3;
+    kmp_int64 parm4;
+  };
+
+  kmp_uint64 ordered_lower;
+  kmp_uint64 ordered_upper;
+#if KMP_OS_WINDOWS
+  // This var can be placed in the hole between 'tc' and 'parm1', instead of
+  // 'static_steal_counter'. It would be nice to measure execution times.
+  // Conditional if/endif can be removed at all.
+  kmp_int64 last_upper;
+#endif /* KMP_OS_WINDOWS */
+} dispatch_private_info64_t;
+#else /* KMP_STATIC_STEAL_ENABLED */
+typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
+  kmp_int32 lb;
+  kmp_int32 ub;
+  kmp_int32 st;
+  kmp_int32 tc;
+
+  kmp_int32 parm1;
+  kmp_int32 parm2;
+  kmp_int32 parm3;
+  kmp_int32 parm4;
+
+  kmp_int32 count;
+
+  kmp_uint32 ordered_lower;
+  kmp_uint32 ordered_upper;
+#if KMP_OS_WINDOWS
+  kmp_int32 last_upper;
+#endif /* KMP_OS_WINDOWS */
+} dispatch_private_info32_t;
+
+typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
+  kmp_int64 lb; /* lower-bound */
+  kmp_int64 ub; /* upper-bound */
+  kmp_int64 st; /* stride */
+  kmp_int64 tc; /* trip count (number of iterations) */
+
+  /* parm[1-4] are used in different ways by different scheduling algorithms */
+  kmp_int64 parm1;
+  kmp_int64 parm2;
+  kmp_int64 parm3;
+  kmp_int64 parm4;
+
+  kmp_int64 count; /* current chunk number for static scheduling */
+
+  kmp_uint64 ordered_lower;
+  kmp_uint64 ordered_upper;
+#if KMP_OS_WINDOWS
+  kmp_int64 last_upper;
+#endif /* KMP_OS_WINDOWS */
+} dispatch_private_info64_t;
+#endif /* KMP_STATIC_STEAL_ENABLED */
+
+typedef struct KMP_ALIGN_CACHE dispatch_private_info {
+  union private_info {
+    dispatch_private_info32_t p32;
+    dispatch_private_info64_t p64;
+  } u;
+  enum sched_type schedule; /* scheduling algorithm */
+  kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
+  kmp_int32 ordered_bumped;
+  // To retain the structure size after making ordered_iteration scalar
+  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
+  // Stack of buffers for nest of serial regions
+  struct dispatch_private_info *next;
+  kmp_int32 type_size; /* the size of types in private_info */
+#if KMP_USE_HIER_SCHED
+  kmp_int32 hier_id;
+  void *parent; /* hierarchical scheduling parent pointer */
+#endif
+  enum cons_type pushed_ws;
+} dispatch_private_info_t;
+
+typedef struct dispatch_shared_info32 {
+  /* chunk index under dynamic, number of idle threads under static-steal;
+     iteration index otherwise */
+  volatile kmp_uint32 iteration;
+  volatile kmp_uint32 num_done;
+  volatile kmp_uint32 ordered_iteration;
+  // Dummy to retain the structure size after making ordered_iteration scalar
+  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 1];
+} dispatch_shared_info32_t;
+
+typedef struct dispatch_shared_info64 {
+  /* chunk index under dynamic, number of idle threads under static-steal;
+     iteration index otherwise */
+  volatile kmp_uint64 iteration;
+  volatile kmp_uint64 num_done;
+  volatile kmp_uint64 ordered_iteration;
+  // Dummy to retain the structure size after making ordered_iteration scalar
+  kmp_int64 ordered_dummy[KMP_MAX_ORDERED - 3];
+} dispatch_shared_info64_t;
+
+typedef struct dispatch_shared_info {
+  union shared_info {
+    dispatch_shared_info32_t s32;
+    dispatch_shared_info64_t s64;
+  } u;
+  volatile kmp_uint32 buffer_index;
+  volatile kmp_int32 doacross_buf_idx; // teamwise index
+  volatile kmp_uint32 *doacross_flags; // shared array of iteration flags (0/1)
+  kmp_int32 doacross_num_done; // count finished threads
+#if KMP_USE_HIER_SCHED
+  void *hier;
+#endif
+#if KMP_USE_HWLOC
+  // When linking with libhwloc, the ORDERED EPCC test slows down on big
+  // machines (> 48 cores). Performance analysis showed that a cache thrash
+  // was occurring and this padding helps alleviate the problem.
+  char padding[64];
+#endif
+} dispatch_shared_info_t;
+
+typedef struct kmp_disp {
+  /* Vector for ORDERED SECTION */
+  void (*th_deo_fcn)(int *gtid, int *cid, ident_t *);
+  /* Vector for END ORDERED SECTION */
+  void (*th_dxo_fcn)(int *gtid, int *cid, ident_t *);
+
+  dispatch_shared_info_t *th_dispatch_sh_current;
+  dispatch_private_info_t *th_dispatch_pr_current;
+
+  dispatch_private_info_t *th_disp_buffer;
+  kmp_int32 th_disp_index;
+  kmp_int32 th_doacross_buf_idx; // thread's doacross buffer index
+  volatile kmp_uint32 *th_doacross_flags; // pointer to shared array of flags
+  union { // we can use union here because doacross cannot be used in
+    // nonmonotonic loops
+    kmp_int64 *th_doacross_info; // info on loop bounds
+    kmp_lock_t *th_steal_lock; // lock used for chunk stealing (8-byte variable)
+  };
+#if KMP_USE_INTERNODE_ALIGNMENT
+  char more_padding[INTERNODE_CACHE_LINE];
+#endif
+} kmp_disp_t;
+
+/* ------------------------------------------------------------------------ */
+/* Barrier stuff */
+
+/* constants for barrier state update */
+#define KMP_INIT_BARRIER_STATE 0 /* should probably start from zero */
+#define KMP_BARRIER_SLEEP_BIT 0 /* bit used for suspend/sleep part of state */
+#define KMP_BARRIER_UNUSED_BIT 1 // bit that must never be set for valid state
+#define KMP_BARRIER_BUMP_BIT 2 /* lsb used for bump of go/arrived state */
+
+#define KMP_BARRIER_SLEEP_STATE (1 << KMP_BARRIER_SLEEP_BIT)
+#define KMP_BARRIER_UNUSED_STATE (1 << KMP_BARRIER_UNUSED_BIT)
+#define KMP_BARRIER_STATE_BUMP (1 << KMP_BARRIER_BUMP_BIT)
+
+#if (KMP_BARRIER_SLEEP_BIT >= KMP_BARRIER_BUMP_BIT)
+#error "Barrier sleep bit must be smaller than barrier bump bit"
+#endif
+#if (KMP_BARRIER_UNUSED_BIT >= KMP_BARRIER_BUMP_BIT)
+#error "Barrier unused bit must be smaller than barrier bump bit"
+#endif
+
+// Constants for release barrier wait state: currently, hierarchical only
+#define KMP_BARRIER_NOT_WAITING 0 // Normal state; worker not in wait_sleep
+#define KMP_BARRIER_OWN_FLAG                                                   \
+  1 // Normal state; worker waiting on own b_go flag in release
+#define KMP_BARRIER_PARENT_FLAG                                                \
+  2 // Special state; worker waiting on parent's b_go flag in release
+#define KMP_BARRIER_SWITCH_TO_OWN_FLAG                                         \
+  3 // Special state; tells worker to shift from parent to own b_go
+#define KMP_BARRIER_SWITCHING                                                  \
+  4 // Special state; worker resets appropriate flag on wake-up
+
+#define KMP_NOT_SAFE_TO_REAP                                                   \
+  0 // Thread th_reap_state: not safe to reap (tasking)
+#define KMP_SAFE_TO_REAP 1 // Thread th_reap_state: safe to reap (not tasking)
+
+enum barrier_type {
+  bs_plain_barrier = 0, /* 0, All non-fork/join barriers (except reduction
+                           barriers if enabled) */
+  bs_forkjoin_barrier, /* 1, All fork/join (parallel region) barriers */
+#if KMP_FAST_REDUCTION_BARRIER
+  bs_reduction_barrier, /* 2, All barriers that are used in reduction */
+#endif // KMP_FAST_REDUCTION_BARRIER
+  bs_last_barrier /* Just a placeholder to mark the end */
+};
+
+// Without fast reduction barriers, reductions alias the plain barrier type
+#if !KMP_FAST_REDUCTION_BARRIER
+#define bs_reduction_barrier bs_plain_barrier
+#endif // !KMP_FAST_REDUCTION_BARRIER
+
+typedef enum kmp_bar_pat {
+  /* Barrier communication patterns, numbered consecutively from zero. */
+  /* Single level (degenerate) tree */
+  bp_linear_bar = 0,
+  /* Balanced tree with branching factor 2^n */
+  bp_tree_bar = 1,
+  /* Hypercube-embedded tree with min branching factor 2^n */
+  bp_hyper_bar = 2,
+  bp_hierarchical_bar = 3, /* Machine hierarchy tree */
+  bp_last_bar /* Placeholder to mark the end */
+} kmp_bar_pat_e;
+
+#define KMP_BARRIER_ICV_PUSH 1
+
+/* Record for holding the values of the internal controls stack records */
+typedef struct kmp_internal_control {
+  int serial_nesting_level; /* corresponds to the value of the
+                               th_team_serialized field */
+  kmp_int8 dynamic; /* internal control for dynamic adjustment of threads (per
+                       thread) */
+  kmp_int8
+      bt_set; /* internal control for whether blocktime is explicitly set */
+  int blocktime; /* internal control for blocktime */
+#if KMP_USE_MONITOR
+  int bt_intervals; /* internal control for blocktime intervals */
+#endif
+  int nproc; /* internal control for #threads for next parallel region (per
+                thread) */
+  int thread_limit; /* internal control for thread-limit-var */
+  int max_active_levels; /* internal control for max_active_levels */
+  kmp_r_sched_t
+      sched; /* internal control for runtime schedule {sched,chunk} pair */
+  kmp_proc_bind_t proc_bind; /* internal control for affinity  */
+  kmp_int32 default_device; /* internal control for default device */
+  struct kmp_internal_control *next;
+} kmp_internal_control_t;
+
+static inline void copy_icvs(kmp_internal_control_t *dst, // overwritten
+                             kmp_internal_control_t *src) { // only read
+  *dst = *src; // whole-struct assignment: every field, including 'next'
+}
+
+/* Thread barrier needs volatile barrier fields */
+typedef struct KMP_ALIGN_CACHE kmp_bstate {
+  // th_fixed_icvs is aligned by virtue of kmp_bstate being aligned (and all
+  // uses of it). It is not explicitly aligned below, because we *don't* want
+  // it to be padded -- instead, we fit b_go into the same cache line with
+  // th_fixed_icvs, enabling NGO cache lines stores in the hierarchical barrier.
+  kmp_internal_control_t th_fixed_icvs; // Initial ICVs for the thread
+  // Tuck b_go into end of th_fixed_icvs cache line, so it can be stored with
+  // same NGO store
+  volatile kmp_uint64 b_go; // STATE => task should proceed (hierarchical)
+  KMP_ALIGN_CACHE volatile kmp_uint64
+      b_arrived; // STATE => task reached synch point.
+  kmp_uint32 *skip_per_level;
+  kmp_uint32 my_level;
+  kmp_int32 parent_tid;
+  kmp_int32 old_tid;
+  kmp_uint32 depth;
+  struct kmp_bstate *parent_bar;
+  kmp_team_t *team;
+  kmp_uint64 leaf_state;
+  kmp_uint32 nproc;
+  kmp_uint8 base_leaf_kids;
+  kmp_uint8 leaf_kids;
+  kmp_uint8 offset;
+  kmp_uint8 wait_flag;
+  kmp_uint8 use_oncore_barrier;
+#if USE_DEBUGGER
+  // The following field is intended for the debugger solely. Only the worker
+  // thread itself accesses this field: the worker increases it by 1 when it
+  // arrives to a barrier.
+  KMP_ALIGN_CACHE kmp_uint b_worker_arrived;
+#endif /* USE_DEBUGGER */
+} kmp_bstate_t;
+
+union KMP_ALIGN_CACHE kmp_barrier_union {
+  double b_align; /* use worst case alignment */
+  char b_pad[KMP_PAD(kmp_bstate_t, CACHE_LINE)];
+  kmp_bstate_t bb;
+};
+
+typedef union kmp_barrier_union kmp_balign_t;
+
+/* Team barrier needs only non-volatile arrived counter */
+union KMP_ALIGN_CACHE kmp_barrier_team_union {
+  double b_align; /* use worst case alignment */
+  char b_pad[CACHE_LINE];
+  struct {
+    kmp_uint64 b_arrived; /* STATE => task reached synch point. */
+#if USE_DEBUGGER
+    // The following two fields are intended solely for the debugger. Only the
+    // master of the team accesses them: the first is incremented by 1 when the
+    // master arrives at a barrier, the second is incremented by 1 when all
+    // the threads have arrived.
+    kmp_uint b_master_arrived;
+    kmp_uint b_team_arrived;
+#endif
+  };
+};
+
+typedef union kmp_barrier_team_union kmp_balign_team_t;
+
+/* Padding for Linux* OS pthreads condition variables and mutexes used to signal
+   threads when a condition changes.  This is to workaround an NPTL bug where
+   padding was added to pthread_cond_t which caused the initialization routine
+   to write outside of the structure if compiled on pre-NPTL threads.  */
+#if KMP_OS_WINDOWS
+typedef struct kmp_win32_mutex {
+  /* The Lock */
+  CRITICAL_SECTION cs;
+} kmp_win32_mutex_t;
+
+typedef struct kmp_win32_cond {
+  /* Count of the number of waiters. */
+  int waiters_count_;
+
+  /* Serialize access to <waiters_count_> */
+  kmp_win32_mutex_t waiters_count_lock_;
+
+  /* Number of threads to release via a <cond_broadcast> or a <cond_signal> */
+  int release_count_;
+
+  /* Keeps track of the current "generation" so that we don't allow */
+  /* one thread to steal all the "releases" from the broadcast. */
+  int wait_generation_count_;
+
+  /* A manual-reset event that's used to block and release waiting threads. */
+  HANDLE event_;
+} kmp_win32_cond_t;
+#endif
+
+#if KMP_OS_UNIX
+
+union KMP_ALIGN_CACHE kmp_cond_union {
+  double c_align;
+  char c_pad[CACHE_LINE];
+  pthread_cond_t c_cond;
+};
+
+typedef union kmp_cond_union kmp_cond_align_t;
+
+union KMP_ALIGN_CACHE kmp_mutex_union {
+  double m_align;
+  char m_pad[CACHE_LINE];
+  pthread_mutex_t m_mutex;
+};
+
+typedef union kmp_mutex_union kmp_mutex_align_t;
+
+#endif /* KMP_OS_UNIX */
+
+typedef struct kmp_desc_base {
+  void *ds_stackbase;
+  size_t ds_stacksize;
+  int ds_stackgrow;
+  kmp_thread_t ds_thread;
+  volatile int ds_tid;
+  int ds_gtid;
+#if KMP_OS_WINDOWS
+  volatile int ds_alive;
+  DWORD ds_thread_id;
+/* ds_thread keeps the thread handle on Windows* OS, which is enough for RTL
+   purposes. However, debugger support (libomp_db) cannot work with handles,
+   because they are not comparable. For example, the debugger requests info
+   about the thread with handle h. h is valid within the debugger process and
+   meaningless within the debuggee process. Even if h is duplicated by a call
+   to DuplicateHandle(), so that the result h' is valid within the debuggee
+   process, it is a *new* handle which does *not* equal any other handle in
+   the debuggee... The only way to compare handles is to convert them to
+   system-wide ids. GetThreadId() is available only in Longhorn and Server
+   2003. :-( In contrast, GetCurrentThreadId() is available on all Windows* OS
+   flavours (including Windows* 95). Thus, we get the thread id by calling
+   GetCurrentThreadId() from within the thread and save it for libomp_db.  */
+#endif /* KMP_OS_WINDOWS */
+} kmp_desc_base_t;
+
+typedef union KMP_ALIGN_CACHE kmp_desc {
+  double ds_align; /* use worst case alignment */
+  char ds_pad[KMP_PAD(kmp_desc_base_t, CACHE_LINE)];
+  kmp_desc_base_t ds;
+} kmp_desc_t;
+
+typedef struct kmp_local {
+  volatile int this_construct; /* count of single constructs seen by thread */
+  void *reduce_data;
+#if KMP_USE_BGET
+  void *bget_data;
+  void *bget_list;
+#if !USE_CMP_XCHG_FOR_BGET
+#ifdef USE_QUEUING_LOCK_FOR_BGET
+  kmp_lock_t bget_lock; /* Lock for accessing bget free list */
+#else
+  kmp_bootstrap_lock_t bget_lock; // Lock for accessing bget free list. Must be
+// a bootstrap lock so that it can be used at
+// library shutdown.
+#endif /* USE_QUEUING_LOCK_FOR_BGET */
+#endif /* ! USE_CMP_XCHG_FOR_BGET */
+#endif /* KMP_USE_BGET */
+
+  PACKED_REDUCTION_METHOD_T
+  packed_reduction_method; /* stored by __kmpc_reduce*(), used by
+                              __kmpc_end_reduce*() */
+
+} kmp_local_t;
+
+#define KMP_CHECK_UPDATE(a, b)                                                 \
+  if ((a) != (b))                                                              \
+  (a) = (b)
+#define KMP_CHECK_UPDATE_SYNC(a, b)                                            \
+  if ((a) != (b))                                                              \
+  TCW_SYNC_PTR((a), (b))
+
+#define get__blocktime(xteam, xtid)                                            \
+  ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.blocktime)
+#define get__bt_set(xteam, xtid)                                               \
+  ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_set)
+#if KMP_USE_MONITOR
+#define get__bt_intervals(xteam, xtid)                                         \
+  ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_intervals)
+#endif
+
+#define get__dynamic_2(xteam, xtid)                                            \
+  ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.dynamic)
+#define get__nproc_2(xteam, xtid)                                              \
+  ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.nproc)
+#define get__sched_2(xteam, xtid)                                              \
+  ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.sched)
+
+#define set__blocktime_team(xteam, xtid, xval)                                 \
+  (((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.blocktime) =     \
+       (xval))
+
+#if KMP_USE_MONITOR
+#define set__bt_intervals_team(xteam, xtid, xval)                              \
+  (((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_intervals) =  \
+       (xval))
+#endif
+
+#define set__bt_set_team(xteam, xtid, xval)                                    \
+  (((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_set) = (xval))
+
+#define set__dynamic(xthread, xval)                                            \
+  (((xthread)->th.th_current_task->td_icvs.dynamic) = (xval))
+#define get__dynamic(xthread)                                                  \
+  (((xthread)->th.th_current_task->td_icvs.dynamic) ? (FTN_TRUE) : (FTN_FALSE))
+
+#define set__nproc(xthread, xval)                                              \
+  (((xthread)->th.th_current_task->td_icvs.nproc) = (xval))
+
+#define set__thread_limit(xthread, xval)                                       \
+  (((xthread)->th.th_current_task->td_icvs.thread_limit) = (xval))
+
+#define set__max_active_levels(xthread, xval)                                  \
+  (((xthread)->th.th_current_task->td_icvs.max_active_levels) = (xval))
+
+#define get__max_active_levels(xthread)                                        \
+  ((xthread)->th.th_current_task->td_icvs.max_active_levels)
+
+#define set__sched(xthread, xval)                                              \
+  (((xthread)->th.th_current_task->td_icvs.sched) = (xval))
+
+#define set__proc_bind(xthread, xval)                                          \
+  (((xthread)->th.th_current_task->td_icvs.proc_bind) = (xval))
+#define get__proc_bind(xthread)                                                \
+  ((xthread)->th.th_current_task->td_icvs.proc_bind)
+
+// OpenMP tasking data structures
+
+typedef enum kmp_tasking_mode {
+  tskm_immediate_exec = 0,
+  tskm_extra_barrier = 1,
+  tskm_task_teams = 2,
+  tskm_max = 2 // largest enumerator value; same value as tskm_task_teams
+} kmp_tasking_mode_t;
+
+extern kmp_tasking_mode_t
+    __kmp_tasking_mode; /* determines how/when to execute tasks */
+extern int __kmp_task_stealing_constraint;
+extern int __kmp_enable_task_throttling;
+extern kmp_int32 __kmp_default_device; // Set via OMP_DEFAULT_DEVICE if
+// specified, defaults to 0 otherwise
+// Set via OMP_MAX_TASK_PRIORITY if specified, defaults to 0 otherwise
+extern kmp_int32 __kmp_max_task_priority;
+// Set via KMP_TASKLOOP_MIN_TASKS if specified, defaults to 0 otherwise
+extern kmp_uint64 __kmp_taskloop_min_tasks;
+
+/* NOTE: kmp_taskdata_t and kmp_task_t are allocated in a single block with the
+   taskdata first; arguments are parenthesized so any expression may be passed */
+#define KMP_TASK_TO_TASKDATA(task) (((kmp_taskdata_t *)(task)) - 1)
+#define KMP_TASKDATA_TO_TASK(taskdata) ((kmp_task_t *)((taskdata) + 1))
+
+// The tt_found_tasks flag is a signal to all threads in the team that tasks
+// were spawned and queued since the previous barrier release.
+#define KMP_TASKING_ENABLED(task_team)                                         \
+  (TCR_SYNC_4((task_team)->tt.tt_found_tasks) == TRUE)
+/*!
+@ingroup BASIC_TYPES
+@{
+*/
+
+/*!
+ */
+typedef kmp_int32 (*kmp_routine_entry_t)(kmp_int32, void *);
+
+typedef union kmp_cmplrdata {
+  kmp_int32 priority; /**< priority specified by user for the task */
+  kmp_routine_entry_t
+      destructors; /* pointer to function to invoke destructors of
+                      firstprivate C++ objects */
+  /* future data */
+} kmp_cmplrdata_t;
+
+/*  sizeof_kmp_task_t passed as arg to kmpc_omp_task call  */
+/*!
+ */
+typedef struct kmp_task { /* GEH: Shouldn't this be aligned somehow? */
+  void *shareds; /**< pointer to block of pointers to shared vars   */
+  kmp_routine_entry_t
+      routine; /**< pointer to routine to call for executing task */
+  kmp_int32 part_id; /**< part id for the task                          */
+  kmp_cmplrdata_t
+      data1; /* Two known optional additions: destructors and priority */
+  kmp_cmplrdata_t data2; /* Process destructors first, priority second */
+  /* future data */
+  /*  private vars  */
+} kmp_task_t;
+
+/*!
+@}
+*/
+
+typedef struct kmp_taskgroup {
+  std::atomic<kmp_int32> count; // number of allocated and incomplete tasks
+  std::atomic<kmp_int32>
+      cancel_request; // request for cancellation of this taskgroup
+  struct kmp_taskgroup *parent; // parent taskgroup
+  // Block of data to perform task reduction
+  void *reduce_data; // reduction related info
+  kmp_int32 reduce_num_data; // number of data items to reduce
+} kmp_taskgroup_t;
+
+// forward declarations
+typedef union kmp_depnode kmp_depnode_t;
+typedef struct kmp_depnode_list kmp_depnode_list_t;
+typedef struct kmp_dephash_entry kmp_dephash_entry_t;
+
+// Compiler sends us this info:
+typedef struct kmp_depend_info {
+  kmp_intptr_t base_addr;
+  size_t len;
+  struct {
+    bool in : 1;
+    bool out : 1;
+    bool mtx : 1;
+  } flags;
+} kmp_depend_info_t;
+
+// Internal structures to work with task dependencies:
+struct kmp_depnode_list {
+  kmp_depnode_t *node;
+  kmp_depnode_list_t *next;
+};
+
+// Max number of mutexinoutset dependencies per node
+#define MAX_MTX_DEPS 4
+
+typedef struct kmp_base_depnode {
+  kmp_depnode_list_t *successors; /* used under lock */
+  kmp_task_t *task; /* non-NULL if depnode is active, used under lock */
+  kmp_lock_t *mtx_locks[MAX_MTX_DEPS]; /* lock mutexinoutset dependent tasks */
+  kmp_int32 mtx_num_locks; /* number of locks in mtx_locks array */
+  kmp_lock_t lock; /* guards shared fields: task, successors */
+#if KMP_SUPPORT_GRAPH_OUTPUT
+  kmp_uint32 id;
+#endif
+  std::atomic<kmp_int32> npredecessors;
+  std::atomic<kmp_int32> nrefs;
+} kmp_base_depnode_t;
+
+union KMP_ALIGN_CACHE kmp_depnode {
+  double dn_align; /* use worst case alignment */
+  char dn_pad[KMP_PAD(kmp_base_depnode_t, CACHE_LINE)];
+  kmp_base_depnode_t dn;
+};
+
+struct kmp_dephash_entry {
+  kmp_intptr_t addr;
+  kmp_depnode_t *last_out;
+  kmp_depnode_list_t *last_ins;
+  kmp_depnode_list_t *last_mtxs;
+  kmp_int32 last_flag;
+  kmp_lock_t *mtx_lock; /* is referenced by depnodes w/mutexinoutset dep */
+  kmp_dephash_entry_t *next_in_bucket;
+};
+
+typedef struct kmp_dephash {
+  kmp_dephash_entry_t **buckets;
+  size_t size;
+#ifdef KMP_DEBUG
+  kmp_uint32 nelements;
+  kmp_uint32 nconflicts;
+#endif
+} kmp_dephash_t;
+
+typedef struct kmp_task_affinity_info {
+  kmp_intptr_t base_addr;
+  size_t len;
+  struct {
+    bool flag1 : 1;
+    bool flag2 : 1;
+    kmp_int32 reserved : 30; // NOTE(review): type differs from the bool flags
+                             // above; confirm all 32 bits pack together on MSVC
+  } flags;
+} kmp_task_affinity_info_t;
+
+typedef enum kmp_event_type_t {
+  KMP_EVENT_UNINITIALIZED = 0,
+  KMP_EVENT_ALLOW_COMPLETION = 1
+} kmp_event_type_t;
+
+typedef struct {
+  kmp_event_type_t type;
+  kmp_tas_lock_t lock;
+  union {
+    kmp_task_t *task;
+  } ed;
+} kmp_event_t;
+
+#ifdef BUILD_TIED_TASK_STACK
+
+/* Tied Task stack definitions */
+typedef struct kmp_stack_block {
+  kmp_taskdata_t *sb_block[TASK_STACK_BLOCK_SIZE];
+  struct kmp_stack_block *sb_next;
+  struct kmp_stack_block *sb_prev;
+} kmp_stack_block_t;
+
+typedef struct kmp_task_stack {
+  kmp_stack_block_t ts_first_block; // first block of stack entries
+  kmp_taskdata_t **ts_top; // pointer to the top of stack
+  kmp_int32 ts_entries; // number of entries on the stack
+} kmp_task_stack_t;
+
+#endif // BUILD_TIED_TASK_STACK
+
+typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
+  /* Compiler flags */ /* Total compiler flags must be 16 bits */
+  unsigned tiedness : 1; /* task is either tied (1) or untied (0) */
+  unsigned final : 1; /* task is final(1) so execute immediately */
+  unsigned merged_if0 : 1; /* no __kmpc_task_{begin/complete}_if0 calls in if0
+                              code path */
+  unsigned destructors_thunk : 1; /* set if the compiler creates a thunk to
+                                     invoke destructors from the runtime */
+  unsigned proxy : 1; /* task is a proxy task (it will be executed outside the
+                         context of the RTL) */
+  unsigned priority_specified : 1; /* set if the compiler provides priority
+                                      setting for the task */
+  unsigned detachable : 1; /* 1 == can detach */
+  unsigned reserved : 9; /* reserved for compiler use (7 + 9 = 16 bits) */
+
+  /* Library flags */ /* Total library flags must be 16 bits */
+  unsigned tasktype : 1; /* task is either explicit(1) or implicit (0) */
+  unsigned task_serial : 1; // task is executed immediately (1) or deferred (0)
+  unsigned tasking_ser : 1; // all tasks in team are either executed immediately
+  // (1) or may be deferred (0)
+  unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel
+  // (0) [>= 2 threads]
+  /* If either team_serial or tasking_ser is set, task team may be NULL */
+  /* Task State Flags: */
+  unsigned started : 1; /* 1==started, 0==not started     */
+  unsigned executing : 1; /* 1==executing, 0==not executing */
+  unsigned complete : 1; /* 1==complete, 0==not complete   */
+  unsigned freed : 1; /* 1==freed, 0==allocated         */
+  unsigned native : 1; /* 1==gcc-compiled task, 0==intel */
+  unsigned reserved31 : 7; /* reserved for library use (9 + 7 = 16 bits) */
+
+} kmp_tasking_flags_t;
+
+struct kmp_taskdata { /* aligned during dynamic allocation       */
+  kmp_int32 td_task_id; /* id, assigned by debugger                */
+  kmp_tasking_flags_t td_flags; /* task flags                              */
+  kmp_team_t *td_team; /* team for this task                      */
+  kmp_info_p *td_alloc_thread; /* thread that allocated data structures   */
+  /* Currently not used except for perhaps IDB */
+  kmp_taskdata_t *td_parent; /* parent task                             */
+  kmp_int32 td_level; /* task nesting level                      */
+  std::atomic<kmp_int32> td_untied_count; // untied task active parts counter
+  ident_t *td_ident; /* task identifier                         */
+  // Taskwait data.
+  ident_t *td_taskwait_ident;
+  kmp_uint32 td_taskwait_counter;
+  kmp_int32 td_taskwait_thread; /* gtid + 1 of thread encountered taskwait */
+  KMP_ALIGN_CACHE kmp_internal_control_t
+      td_icvs; /* Internal control variables for the task */
+  KMP_ALIGN_CACHE std::atomic<kmp_int32>
+      td_allocated_child_tasks; /* Child tasks (+ current task) not yet
+                                   deallocated */
+  std::atomic<kmp_int32>
+      td_incomplete_child_tasks; /* Child tasks not yet complete */
+  kmp_taskgroup_t
+      *td_taskgroup; // Each task keeps pointer to its current taskgroup
+  kmp_dephash_t
+      *td_dephash; // Dependencies for children tasks are tracked from here
+  kmp_depnode_t
+      *td_depnode; // Pointer to graph node if this task has dependencies
+  kmp_task_team_t *td_task_team;
+  kmp_int32 td_size_alloc; // The size of task structure, including shareds etc.
+#if defined(KMP_GOMP_COMPAT)
+  // 4 or 8 byte integers for the loop bounds in GOMP_taskloop
+  kmp_int32 td_size_loop_bounds;
+#endif
+  kmp_taskdata_t *td_last_tied; // keep tied task for task scheduling constraint
+#if defined(KMP_GOMP_COMPAT)
+  // GOMP sends in a copy function for copy constructors
+  void (*td_copy_func)(void *, void *);
+#endif
+  kmp_event_t td_allow_completion_event;
+#if OMPT_SUPPORT
+  ompt_task_info_t ompt_task_info;
+#endif
+}; // struct kmp_taskdata
+
+// Make sure padding above worked
+KMP_BUILD_ASSERT(sizeof(kmp_taskdata_t) % sizeof(void *) == 0);
+
+// Data for task team but per thread
+typedef struct kmp_base_thread_data {
+  kmp_info_p *td_thr; // Pointer back to thread info
+  // Used only in __kmp_execute_tasks_template, maybe not avail until task is
+  // queued?
+  kmp_bootstrap_lock_t td_deque_lock; // Lock for accessing deque
+  kmp_taskdata_t *
+      *td_deque; // Deque of tasks encountered by td_thr, dynamically allocated
+  kmp_int32 td_deque_size; // Size of deque
+  kmp_uint32 td_deque_head; // Head of deque (will wrap)
+  kmp_uint32 td_deque_tail; // Tail of deque (will wrap)
+  kmp_int32 td_deque_ntasks; // Number of tasks in deque
+  // GEH: shouldn't this be volatile since used in while-spin?
+  kmp_int32 td_deque_last_stolen; // Thread number of last successful steal
+#ifdef BUILD_TIED_TASK_STACK
+  kmp_task_stack_t td_susp_tied_tasks; // Stack of suspended tied tasks for task
+// scheduling constraint
+#endif // BUILD_TIED_TASK_STACK
+} kmp_base_thread_data_t;
+
+#define TASK_DEQUE_BITS 8 // Used solely to define INITIAL_TASK_DEQUE_SIZE
+#define INITIAL_TASK_DEQUE_SIZE (1 << TASK_DEQUE_BITS)
+
+#define TASK_DEQUE_SIZE(td) ((td).td_deque_size)
+#define TASK_DEQUE_MASK(td) ((td).td_deque_size - 1)
+
+typedef union KMP_ALIGN_CACHE kmp_thread_data {
+  kmp_base_thread_data_t td;
+  double td_align; /* use worst case alignment */
+  char td_pad[KMP_PAD(kmp_base_thread_data_t, CACHE_LINE)];
+} kmp_thread_data_t;
+
+// Data for task teams which are used when tasking is enabled for the team
+typedef struct kmp_base_task_team {
+  kmp_bootstrap_lock_t
+      tt_threads_lock; /* Lock used to allocate per-thread part of task team */
+  /* must be bootstrap lock since used at library shutdown*/
+  kmp_task_team_t *tt_next; /* For linking the task team free list */
+  kmp_thread_data_t
+      *tt_threads_data; /* Array of per-thread structures for task team */
+  /* Data survives task team deallocation */
+  kmp_int32 tt_found_tasks; /* Have we found tasks and queued them while
+                               executing this team? */
+  /* TRUE means tt_threads_data is set up and initialized */
+  kmp_int32 tt_nproc; /* #threads in team           */
+  kmp_int32 tt_max_threads; // # entries allocated for threads_data array
+  kmp_int32 tt_found_proxy_tasks; // found proxy tasks since last barrier
+  kmp_int32 tt_untied_task_encountered;
+
+  KMP_ALIGN_CACHE
+  std::atomic<kmp_int32> tt_unfinished_threads; /* #threads still active */
+
+  KMP_ALIGN_CACHE
+  volatile kmp_uint32
+      tt_active; /* is the team still actively executing tasks */
+} kmp_base_task_team_t;
+
+union KMP_ALIGN_CACHE kmp_task_team {
+  kmp_base_task_team_t tt;
+  double tt_align; /* use worst case alignment */
+  char tt_pad[KMP_PAD(kmp_base_task_team_t, CACHE_LINE)];
+};
+
+#if (USE_FAST_MEMORY == 3) || (USE_FAST_MEMORY == 5)
+// Free lists keep same-size free memory slots for fast memory allocation
+// routines
+typedef struct kmp_free_list {
+  void *th_free_list_self; // Self-allocated tasks free list
+  void *th_free_list_sync; // Self-allocated tasks stolen/returned by other
+  // threads
+  void *th_free_list_other; // Non-self free list (to be returned to owner's
+  // sync list)
+} kmp_free_list_t;
+#endif
+#if KMP_NESTED_HOT_TEAMS
+// Hot teams array keeps hot teams and their sizes for given thread. Hot teams
+// are not put in teams pool, and they don't put threads in threads pool.
+typedef struct kmp_hot_team_ptr {
+  kmp_team_p *hot_team; // pointer to hot_team of given nesting level
+  kmp_int32 hot_team_nth; // number of threads allocated for the hot_team
+} kmp_hot_team_ptr_t;
+#endif
+typedef struct kmp_teams_size {
+  kmp_int32 nteams; // number of teams in a league
+  kmp_int32 nth; // number of threads in each team of the league
+} kmp_teams_size_t;
+
+// This struct stores a thread that acts as a "root" for a contention
+// group. Contention groups are rooted at kmp_root threads, but also at
+// each master thread of each team created in the teams construct.
+// This struct therefore also stores a thread_limit associated with
+// that contention group, and a counter to track the number of threads
+// active in that contention group. Each thread has a list of these: CG
+// root threads have an entry in their list in which cg_root refers to
+// the thread itself, whereas other workers in the CG will have a
+// single entry where cg_root is same as the entry containing their CG
+// root. When a thread encounters a teams construct, it will add a new
+// entry to the front of its list, because it now roots a new CG.
+typedef struct kmp_cg_root {
+  kmp_info_p *cg_root; // "root" thread for a contention group
+  // The CG root's limit comes from OMP_THREAD_LIMIT for root threads, or
+  // thread_limit clause for teams masters
+  kmp_int32 cg_thread_limit;
+  kmp_int32 cg_nthreads; // Count of active threads in CG rooted at cg_root
+  struct kmp_cg_root *up; // pointer to higher level CG root in list
+} kmp_cg_root_t;
+
+// OpenMP thread data structures
+
+typedef struct KMP_ALIGN_CACHE kmp_base_info {
+  /* Start with the readonly data which is cache aligned and padded. This is
+     written before the thread starts working by the master. Uber masters may
+     update themselves later. Usage does not consider serialized regions.  */
+  kmp_desc_t th_info;
+  kmp_team_p *th_team; /* team we belong to */
+  kmp_root_p *th_root; /* pointer to root of task hierarchy */
+  kmp_info_p *th_next_pool; /* next available thread in the pool */
+  kmp_disp_t *th_dispatch; /* thread's dispatch data */
+  int th_in_pool; /* in thread pool (32 bits for TCR/TCW) */
+
+  /* The following are cached from the team info structure */
+  /* TODO use these in more places as determined to be needed via profiling */
+  int th_team_nproc; /* number of threads in a team */
+  kmp_info_p *th_team_master; /* the team's master thread */
+  int th_team_serialized; /* team is serialized */
+  microtask_t th_teams_microtask; /* save entry address for teams construct */
+  int th_teams_level; /* save initial level of teams construct */
+/* it is 0 on device but may be any on host */
+
+/* The blocktime info is copied from the team struct to the thread struct */
+/* at the start of a barrier, and the values stored in the team are used */
+/* at points in the code where the team struct is no longer guaranteed   */
+/* to exist (from the POV of worker threads).                            */
+#if KMP_USE_MONITOR
+  int th_team_bt_intervals;
+  int th_team_bt_set;
+#else
+  kmp_uint64 th_team_bt_intervals;
+#endif
+
+#if KMP_AFFINITY_SUPPORTED
+  kmp_affin_mask_t *th_affin_mask; /* thread's current affinity mask */
+#endif
+  omp_allocator_handle_t th_def_allocator; /* default allocator */
+  /* The data set by the master at reinit, then R/W by the worker */
+  KMP_ALIGN_CACHE int
+      th_set_nproc; /* if > 0, then only use this request for the next fork */
+#if KMP_NESTED_HOT_TEAMS
+  kmp_hot_team_ptr_t *th_hot_teams; /* array of hot teams */
+#endif
+  kmp_proc_bind_t
+      th_set_proc_bind; /* if != proc_bind_default, use request for next fork */
+  kmp_teams_size_t
+      th_teams_size; /* number of teams/threads in teams construct */
+#if KMP_AFFINITY_SUPPORTED
+  int th_current_place; /* place currently bound to */
+  int th_new_place; /* place to bind to in par reg */
+  int th_first_place; /* first place in partition */
+  int th_last_place; /* last place in partition */
+#endif
+  int th_prev_level; /* previous level for affinity format */
+  int th_prev_num_threads; /* previous num_threads for affinity format */
+#if USE_ITT_BUILD
+  kmp_uint64 th_bar_arrive_time; /* arrival to barrier timestamp */
+  kmp_uint64 th_bar_min_time; /* minimum arrival time at the barrier */
+  kmp_uint64 th_frame_time; /* frame timestamp */
+#endif /* USE_ITT_BUILD */
+  kmp_local_t th_local;
+  struct private_common *th_pri_head;
+
+  /* Now the data only used by the worker (after initial allocation) */
+  /* TODO the first serial team should actually be stored in the info_t
+     structure.  this will help reduce initial allocation overhead */
+  KMP_ALIGN_CACHE kmp_team_p
+      *th_serial_team; /*serialized team held in reserve*/
+
+#if OMPT_SUPPORT
+  ompt_thread_info_t ompt_thread_info;
+#endif
+
+  /* The following are also read by the master during reinit */
+  struct common_table *th_pri_common;
+
+  volatile kmp_uint32 th_spin_here; /* thread-local location for spinning */
+  /* while awaiting queuing lock acquire */
+
+  volatile void *th_sleep_loc; // this points at a kmp_flag<T>
+
+  ident_t *th_ident;
+  unsigned th_x; // Random number generator data
+  unsigned th_a; // Random number generator data
+
+  /* Tasking-related data for the thread */
+  kmp_task_team_t *th_task_team; // Task team struct
+  kmp_taskdata_t *th_current_task; // Innermost Task being executed
+  kmp_uint8 th_task_state; // alternating 0/1 for task team identification
+  kmp_uint8 *th_task_state_memo_stack; // Stack holding memos of th_task_state
+  // at nested levels
+  kmp_uint32 th_task_state_top; // Top element of th_task_state_memo_stack
+  kmp_uint32 th_task_state_stack_sz; // Size of th_task_state_memo_stack
+  kmp_uint32 th_reap_state; // Non-zero indicates thread is not
+  // tasking, thus safe to reap
+
+  /* More stuff for keeping track of active/sleeping threads (this part is
+     written by the worker thread) */
+  kmp_uint8 th_active_in_pool; // included in count of #active threads in pool
+  int th_active; // ! sleeping; 32 bits for TCR/TCW
+  struct cons_header *th_cons; // used for consistency check
+#if KMP_USE_HIER_SCHED
+  // used for hierarchical scheduling
+  kmp_hier_private_bdata_t *th_hier_bar_data;
+#endif
+
+  /* Add the synchronizing data which is cache aligned and padded. */
+  KMP_ALIGN_CACHE kmp_balign_t th_bar[bs_last_barrier];
+
+  KMP_ALIGN_CACHE volatile kmp_int32
+      th_next_waiting; /* gtid+1 of next thread on lock wait queue, 0 if none */
+
+#if (USE_FAST_MEMORY == 3) || (USE_FAST_MEMORY == 5)
+#define NUM_LISTS 4
+  kmp_free_list_t th_free_lists[NUM_LISTS]; // Free lists for fast memory
+// allocation routines
+#endif
+
+#if KMP_OS_WINDOWS
+  kmp_win32_cond_t th_suspend_cv;
+  kmp_win32_mutex_t th_suspend_mx;
+  std::atomic<int> th_suspend_init;
+#endif
+#if KMP_OS_UNIX
+  kmp_cond_align_t th_suspend_cv;
+  kmp_mutex_align_t th_suspend_mx;
+  std::atomic<int> th_suspend_init_count;
+#endif
+
+#if USE_ITT_BUILD
+  kmp_itt_mark_t th_itt_mark_single;
+// alignment ???
+#endif /* USE_ITT_BUILD */
+#if KMP_STATS_ENABLED
+  kmp_stats_list *th_stats;
+#endif
+#if KMP_OS_UNIX
+  std::atomic<bool> th_blocking;
+#endif
+  kmp_cg_root_t *th_cg_roots; // list of cg_roots associated with this thread
+} kmp_base_info_t;
+
+typedef union KMP_ALIGN_CACHE kmp_info {
+  double th_align; /* use worst case alignment */
+  char th_pad[KMP_PAD(kmp_base_info_t, CACHE_LINE)];
+  kmp_base_info_t th;
+} kmp_info_t;
+
+// OpenMP thread team data structures
+
+typedef struct kmp_base_data { volatile kmp_uint32 t_value; } kmp_base_data_t;
+
+typedef union KMP_ALIGN_CACHE kmp_sleep_team {
+  double dt_align; /* use worst case alignment */
+  char dt_pad[KMP_PAD(kmp_base_data_t, CACHE_LINE)];
+  kmp_base_data_t dt;
+} kmp_sleep_team_t;
+
+typedef union KMP_ALIGN_CACHE kmp_ordered_team {
+  double dt_align; /* use worst case alignment */
+  char dt_pad[KMP_PAD(kmp_base_data_t, CACHE_LINE)];
+  kmp_base_data_t dt;
+} kmp_ordered_team_t;
+
+typedef int (*launch_t)(int gtid);
+
+/* Minimum number of ARGV entries to malloc if necessary */
+#define KMP_MIN_MALLOC_ARGV_ENTRIES 100
+
+// Set up how many argv pointers will fit in cache lines containing
+// t_inline_argv. Historically, we have supported at least 96 bytes. Using a
+// larger value for more space between the master write/worker read section and
+// read/write by all section seems to buy more performance on EPCC PARALLEL.
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+#define KMP_INLINE_ARGV_BYTES                                                  \
+  (4 * CACHE_LINE -                                                            \
+   ((3 * KMP_PTR_SKIP + 2 * sizeof(int) + 2 * sizeof(kmp_int8) +               \
+     sizeof(kmp_int16) + sizeof(kmp_uint32)) %                                 \
+    CACHE_LINE))
+#else
+#define KMP_INLINE_ARGV_BYTES                                                  \
+  (2 * CACHE_LINE - ((3 * KMP_PTR_SKIP + 2 * sizeof(int)) % CACHE_LINE))
+#endif
+#define KMP_INLINE_ARGV_ENTRIES (int)(KMP_INLINE_ARGV_BYTES / KMP_PTR_SKIP)
+
+typedef struct KMP_ALIGN_CACHE kmp_base_team {
+  // Synchronization Data
+  // ---------------------------------------------------------------------------
+  KMP_ALIGN_CACHE kmp_ordered_team_t t_ordered;
+  kmp_balign_team_t t_bar[bs_last_barrier];
+  std::atomic<int> t_construct; // count of single directive encountered by team
+  char pad[sizeof(kmp_lock_t)]; // padding to maintain performance on big iron
+
+  // [0] - parallel / [1] - worksharing task reduction data shared by taskgroups
+  std::atomic<void *> t_tg_reduce_data[2]; // to support task modifier
+  std::atomic<int> t_tg_fini_counter[2]; // sync end of task reductions
+
+  // Master only
+  // ---------------------------------------------------------------------------
+  KMP_ALIGN_CACHE int t_master_tid; // tid of master in parent team
+  int t_master_this_cons; // "this_construct" single counter of master in parent
+  // team
+  ident_t *t_ident; // if volatile, have to change too much other crud to
+  // volatile too
+  kmp_team_p *t_parent; // parent team
+  kmp_team_p *t_next_pool; // next free team in the team pool
+  kmp_disp_t *t_dispatch; // thread's dispatch data
+  kmp_task_team_t *t_task_team[2]; // Task team struct; switch between 2
+  kmp_proc_bind_t t_proc_bind; // bind type for par region
+#if USE_ITT_BUILD
+  kmp_uint64 t_region_time; // region begin timestamp
+#endif /* USE_ITT_BUILD */
+
+  // Master write, workers read
+  // --------------------------------------------------------------------------
+  KMP_ALIGN_CACHE void **t_argv;
+  int t_argc;
+  int t_nproc; // number of threads in team
+  microtask_t t_pkfn;
+  launch_t t_invoke; // procedure to launch the microtask
+
+#if OMPT_SUPPORT
+  ompt_team_info_t ompt_team_info;
+  ompt_lw_taskteam_t *ompt_serialized_team_info;
+#endif
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+  kmp_int8 t_fp_control_saved;
+  kmp_int8 t_pad2b;
+  kmp_int16 t_x87_fpu_control_word; // FP control regs
+  kmp_uint32 t_mxcsr;
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+  void *t_inline_argv[KMP_INLINE_ARGV_ENTRIES];
+
+  KMP_ALIGN_CACHE kmp_info_t **t_threads;
+  kmp_taskdata_t
+      *t_implicit_task_taskdata; // Taskdata for the thread's implicit task
+  int t_level; // nested parallel level
+
+  KMP_ALIGN_CACHE int t_max_argc;
+  int t_max_nproc; // max threads this team can handle (dynamically expandable)
+  int t_serialized; // levels deep of serialized teams
+  dispatch_shared_info_t *t_disp_buffer; // buffers for dispatch system
+  int t_id; // team's id, assigned by debugger.
+  int t_active_level; // nested active parallel level
+  kmp_r_sched_t t_sched; // run-time schedule for the team
+#if KMP_AFFINITY_SUPPORTED
+  int t_first_place; // first & last place in parent thread's partition.
+  int t_last_place; // Restore these values to master after par region.
+#endif // KMP_AFFINITY_SUPPORTED
+  int t_display_affinity;
+  int t_size_changed; // team size was changed?: 0: no, 1: yes, -1: changed via
+  // omp_set_num_threads() call
+  omp_allocator_handle_t t_def_allocator; /* default allocator */
+
+// Read/write by workers as well
+#if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
+  // Using CACHE_LINE=64 reduces memory footprint, but causes a big perf
+  // regression of epcc 'parallel' and 'barrier' on fxe256lin01. This extra
+  // padding serves to fix the performance of epcc 'parallel' and 'barrier' when
+  // CACHE_LINE=64. TODO: investigate more and get rid of this padding.
+  char dummy_padding[1024];
+#endif
+  // Internal control stack for additional nested teams.
+  KMP_ALIGN_CACHE kmp_internal_control_t *t_control_stack_top;
+  // for SERIALIZED teams nested 2 or more levels deep
+  // typed flag to store request state of cancellation
+  std::atomic<kmp_int32> t_cancel_request;
+  int t_master_active; // save on fork, restore on join
+  void *t_copypriv_data; // team specific pointer to copyprivate data array
+#if KMP_OS_WINDOWS
+  std::atomic<kmp_uint32> t_copyin_counter;
+#endif
+#if USE_ITT_BUILD
+  void *t_stack_id; // team specific stack stitching id (for ittnotify)
+#endif /* USE_ITT_BUILD */
+} kmp_base_team_t;
+
+union KMP_ALIGN_CACHE kmp_team {
+  kmp_base_team_t t;
+  double t_align; /* use worst case alignment */
+  char t_pad[KMP_PAD(kmp_base_team_t, CACHE_LINE)];
+};
+
+typedef union KMP_ALIGN_CACHE kmp_time_global {
+  double dt_align; /* use worst case alignment */
+  char dt_pad[KMP_PAD(kmp_base_data_t, CACHE_LINE)];
+  kmp_base_data_t dt;
+} kmp_time_global_t;
+
+typedef struct kmp_base_global {
+  /* cache-aligned */
+  kmp_time_global_t g_time;
+
+  /* non cache-aligned */
+  volatile int g_abort;
+  volatile int g_done;
+
+  int g_dynamic;
+  enum dynamic_mode g_dynamic_mode;
+} kmp_base_global_t;
+
+typedef union KMP_ALIGN_CACHE kmp_global {
+  kmp_base_global_t g;
+  double g_align; /* use worst case alignment */
+  char g_pad[KMP_PAD(kmp_base_global_t, CACHE_LINE)];
+} kmp_global_t;
+
+// Per-root bookkeeping: activity state, the root and hot teams, the uber
+// thread, the begin lock/flag and the blocktime.
+typedef struct kmp_base_root {
+  // TODO: GEH - combine r_active with r_in_parallel then r_active ==
+  // (r_in_parallel>= 0)
+  // TODO: GEH - then replace r_active with t_active_levels if we can to reduce
+  // the synch overhead or keeping r_active
+  volatile int r_active; /* TRUE if some region in a nest has > 1 thread */
+  // NOTE(review): the "GEH: This is misnamed" remark two lines down sits
+  // directly above r_root_team but reads as if it describes r_in_parallel --
+  // confirm and relocate.
+  // keeps a count of active parallel regions per root
+  std::atomic<int> r_in_parallel;
+  // GEH: This is misnamed, should be r_active_levels
+  kmp_team_t *r_root_team;
+  kmp_team_t *r_hot_team;
+  // KMP_UBER_GTID() compares __kmp_threads[gtid] against this pointer.
+  kmp_info_t *r_uber_thread;
+  kmp_lock_t r_begin_lock;
+  volatile int r_begin;
+  int r_blocktime; /* blocktime for this root and descendants */
+} kmp_base_root_t;
+
+// Padded union wrapping kmp_base_root_t; entries of the __kmp_root table point
+// at objects of this type.
+typedef union KMP_ALIGN_CACHE kmp_root {
+  kmp_base_root_t r; // the payload (first member of the union)
+  double r_align; /* use worst case alignment */
+  char r_pad[KMP_PAD(kmp_base_root_t, CACHE_LINE)]; // padding member
+} kmp_root_t;
+
+// Single 32-bit payload wrapper.
+// NOTE(review): no user of this struct is visible in this part of the header;
+// the Fortran-related purpose is only suggested by the name -- confirm.
+struct fortran_inx_info {
+  kmp_int32 data;
+};
+
+/* ------------------------------------------------------------------------ */
+
+// Global configuration flags. These are declarations only; the variables are
+// defined (and given their defaults) in another translation unit.
+extern int __kmp_settings;
+extern int __kmp_duplicate_library_ok;
+#if USE_ITT_BUILD
+// Only declared when the runtime is built with USE_ITT_BUILD.
+extern int __kmp_forkjoin_frames;
+extern int __kmp_forkjoin_frames_mode;
+#endif
+extern PACKED_REDUCTION_METHOD_T __kmp_force_reduction_method;
+extern int __kmp_determ_red;
+
+#ifdef KMP_DEBUG
+// Debug-build-only variables, one per letter a..f.
+// NOTE(review): presumably per-category trace verbosity levels -- confirm.
+extern int kmp_a_debug;
+extern int kmp_b_debug;
+extern int kmp_c_debug;
+extern int kmp_d_debug;
+extern int kmp_e_debug;
+extern int kmp_f_debug;
+#endif /* KMP_DEBUG */
+
+/* For debug information logging using rotating buffer */
+// Initial and minimum number of lines in the buffer.
+#define KMP_DEBUG_BUF_LINES_INIT 512
+#define KMP_DEBUG_BUF_LINES_MIN 1
+
+// Initial and minimum number of characters per buffer line.
+#define KMP_DEBUG_BUF_CHARS_INIT 128
+#define KMP_DEBUG_BUF_CHARS_MIN 2
+
+extern int
+    __kmp_debug_buf; /* TRUE means use buffer, FALSE means print to stderr */
+extern int __kmp_debug_buf_lines; /* How many lines of debug stored in buffer */
+extern int
+    __kmp_debug_buf_chars; /* How many characters allowed per line in buffer */
+extern int __kmp_debug_buf_atomic; /* TRUE means use atomic update of buffer
+                                      entry pointer */
+
+extern char *__kmp_debug_buffer; /* Debug buffer itself */
+extern std::atomic<int> __kmp_debug_count; /* Counter for number of lines
+                                              printed in buffer so far */
+extern int __kmp_debug_buf_warn_chars; /* Keep track of char increase
+                                          recommended in warnings */
+/* end rotating debug buffer */
+
+#ifdef KMP_DEBUG
+// Debug-build-only "parallel range" filter: a mode flag, a routine name, a
+// file name, and lower/upper bounds.
+// NOTE(review): the bounds are presumably source line numbers -- confirm.
+extern int __kmp_par_range; /* +1 => only go par for constructs in range */
+
+// Fixed capacities (in chars) of the routine and file name buffers.
+#define KMP_PAR_RANGE_ROUTINE_LEN 1024
+extern char __kmp_par_range_routine[KMP_PAR_RANGE_ROUTINE_LEN];
+#define KMP_PAR_RANGE_FILENAME_LEN 1024
+extern char __kmp_par_range_filename[KMP_PAR_RANGE_FILENAME_LEN];
+extern int __kmp_par_range_lb;
+extern int __kmp_par_range_ub;
+#endif
+
+/* For printing out dynamic storage map for threads and teams */
+extern int
+    __kmp_storage_map; /* True means print storage map for threads and teams */
+extern int __kmp_storage_map_verbose; /* True means storage map includes
+                                         placement info */
+// NOTE(review): presumably records whether the verbose flag was set
+// explicitly rather than defaulted -- confirm.
+extern int __kmp_storage_map_verbose_specified;
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+// CPU information record; only declared for x86 / x86_64 builds.
+extern kmp_cpuinfo_t __kmp_cpuinfo;
+#endif
+
+// Initialization-state flags, one per initialization phase/subsystem. They are
+// volatile ints.
+// NOTE(review): presumably each is non-zero once the corresponding
+// __kmp_*_initialize() step has completed -- confirm in the implementation.
+extern volatile int __kmp_init_serial;
+extern volatile int __kmp_init_gtid;
+extern volatile int __kmp_init_common;
+extern volatile int __kmp_init_middle;
+extern volatile int __kmp_init_parallel;
+#if KMP_USE_MONITOR
+extern volatile int __kmp_init_monitor;
+#endif
+extern volatile int __kmp_init_user_locks;
+// Plain (non-volatile) counters and the version flag.
+extern int __kmp_init_counter;
+extern int __kmp_root_counter;
+extern int __kmp_version;
+
+/* list of address of allocated caches for commons */
+extern kmp_cached_addr_t *__kmp_threadpriv_cache_list;
+
+/* Barrier algorithm types and options */
+// Defaults for the gather/release branch bits and patterns.
+extern kmp_uint32 __kmp_barrier_gather_bb_dflt;
+extern kmp_uint32 __kmp_barrier_release_bb_dflt;
+extern kmp_bar_pat_e __kmp_barrier_gather_pat_dflt;
+extern kmp_bar_pat_e __kmp_barrier_release_pat_dflt;
+// Tables with bs_last_barrier entries (one per barrier type).
+extern kmp_uint32 __kmp_barrier_gather_branch_bits[bs_last_barrier];
+extern kmp_uint32 __kmp_barrier_release_branch_bits[bs_last_barrier];
+extern kmp_bar_pat_e __kmp_barrier_gather_pattern[bs_last_barrier];
+extern kmp_bar_pat_e __kmp_barrier_release_pattern[bs_last_barrier];
+extern char const *__kmp_barrier_branch_bit_env_name[bs_last_barrier];
+extern char const *__kmp_barrier_pattern_env_name[bs_last_barrier];
+extern char const *__kmp_barrier_type_name[bs_last_barrier];
+// Table with bp_last_bar entries (one per barrier pattern).
+extern char const *__kmp_barrier_pattern_name[bp_last_bar];
+
+/* Global Locks */
+// Bootstrap locks (kmp_bootstrap_lock_t).
+extern kmp_bootstrap_lock_t __kmp_initz_lock; /* control initialization */
+extern kmp_bootstrap_lock_t __kmp_forkjoin_lock; /* control fork/join access */
+extern kmp_bootstrap_lock_t __kmp_task_team_lock;
+extern kmp_bootstrap_lock_t
+    __kmp_exit_lock; /* exit() is not always thread-safe */
+#if KMP_USE_MONITOR
+extern kmp_bootstrap_lock_t
+    __kmp_monitor_lock; /* control monitor thread creation */
+#endif
+extern kmp_bootstrap_lock_t
+    __kmp_tp_cached_lock; /* used for the hack to allow threadprivate cache and
+                             __kmp_threads expansion to co-exist */
+
+// Regular and queuing locks.
+extern kmp_lock_t __kmp_global_lock; /* control OS/global access  */
+extern kmp_queuing_lock_t __kmp_dispatch_lock; /* control dispatch access  */
+extern kmp_lock_t __kmp_debug_lock; /* control I/O access for KMP_DEBUG */
+
+// Library mode (enum library_type is declared elsewhere).
+extern enum library_type __kmp_library;
+
+// Default scheduling kinds and chunk size.
+extern enum sched_type __kmp_sched; /* default runtime scheduling */
+extern enum sched_type __kmp_static; /* default static scheduling method */
+extern enum sched_type __kmp_guided; /* default guided scheduling method */
+extern enum sched_type __kmp_auto; /* default auto scheduling method */
+extern int __kmp_chunk; /* default runtime chunk size */
+
+// Thread stack configuration.
+extern size_t __kmp_stksize; /* stack size per thread         */
+#if KMP_USE_MONITOR
+extern size_t __kmp_monitor_stksize; /* stack size for monitor thread */
+#endif
+extern size_t __kmp_stkoffset; /* stack offset per thread       */
+extern int __kmp_stkpadding; /* Should we pad root thread(s) stack */
+
+extern size_t
+    __kmp_malloc_pool_incr; /* incremental size of pool for kmp_malloc() */
+// "Was this environment variable specified?" flags and warning state.
+extern int __kmp_env_stksize; /* was KMP_STACKSIZE specified? */
+extern int __kmp_env_blocktime; /* was KMP_BLOCKTIME specified? */
+extern int __kmp_env_checks; /* was KMP_CHECKS specified?    */
+extern int __kmp_env_consistency_check; // was KMP_CONSISTENCY_CHECK specified?
+extern int __kmp_generate_warnings; /* should we issue warnings? */
+extern int __kmp_reserve_warn; /* have we issued reserve_threads warning? */
+
+#ifdef DEBUG_SUSPEND
+extern int __kmp_suspend_count; /* count inside __kmp_suspend_template() */
+#endif
+
+// Yield-related settings.
+// NOTE(review): semantics are not visible from this header; presumably they
+// control if and how often spinning threads yield -- confirm.
+extern kmp_int32 __kmp_use_yield;
+extern kmp_int32 __kmp_use_yield_exp_set;
+extern kmp_uint32 __kmp_yield_init;
+extern kmp_uint32 __kmp_yield_next;
+
+/* ------------------------------------------------------------------------- */
+// Processor counts, thread limits, blocktime and gtid-lookup configuration.
+extern int __kmp_allThreadsSpecified;
+
+extern size_t __kmp_align_alloc;
+/* following data protected by initialization routines */
+extern int __kmp_xproc; /* number of processors in the system */
+extern int __kmp_avail_proc; /* number of processors available to the process */
+extern size_t __kmp_sys_min_stksize; /* system-defined minimum stack size */
+extern int __kmp_sys_max_nth; /* system-imposed maximum number of threads */
+// maximum total number of concurrently-existing threads on device
+extern int __kmp_max_nth;
+// maximum total number of concurrently-existing threads in a contention group
+extern int __kmp_cg_max_nth;
+extern int __kmp_teams_max_nth; // max threads used in a teams construct
+extern int __kmp_threads_capacity; /* capacity of the arrays __kmp_threads and
+                                      __kmp_root */
+extern int __kmp_dflt_team_nth; /* default number of threads in a parallel
+                                   region a la OMP_NUM_THREADS */
+extern int __kmp_dflt_team_nth_ub; /* upper bound on "" determined at serial
+                                      initialization */
+extern int __kmp_tp_capacity; /* capacity of __kmp_threads if threadprivate is
+                                 used (fixed) */
+extern int __kmp_tp_cached; /* whether threadprivate cache has been created
+                               (__kmpc_threadprivate_cached()) */
+extern int __kmp_dflt_blocktime; /* number of milliseconds to wait before
+                                    blocking (env setting) */
+#if KMP_USE_MONITOR
+extern int
+    __kmp_monitor_wakeups; /* number of times monitor wakes up per second */
+extern int __kmp_bt_intervals; /* number of monitor timestamp intervals before
+                                  blocking */
+#endif
+#ifdef KMP_ADJUST_BLOCKTIME
+extern int __kmp_zero_bt; /* whether blocktime has been forced to zero */
+#endif /* KMP_ADJUST_BLOCKTIME */
+#ifdef KMP_DFLT_NTH_CORES
+extern int __kmp_ncores; /* Total number of cores for threads placement */
+#endif
+/* Number of millisecs to delay on abort for Intel(R) VTune(TM) tools */
+extern int __kmp_abort_delay;
+
+// NOTE(review): presumably records whether the atfork setting below was given
+// explicitly -- confirm.
+extern int __kmp_need_register_atfork_specified;
+extern int
+    __kmp_need_register_atfork; /* At initialization, call pthread_atfork to
+                                   install fork handler */
+extern int __kmp_gtid_mode; /* Method of getting gtid, values:
+                               0 - not set, will be set at runtime
+                               1 - using stack search
+                               2 - dynamic TLS (pthread_getspecific(Linux* OS/OS
+                                   X*) or TlsGetValue(Windows* OS))
+                               3 - static TLS (__declspec(thread) __kmp_gtid),
+                                   Linux* OS .so only.  */
+extern int
+    __kmp_adjust_gtid_mode; /* If true, adjust method based on #threads */
+#ifdef KMP_TDATA_GTID
+// Thread-local gtid; only declared when KMP_TDATA_GTID is defined.
+extern KMP_THREAD_LOCAL int __kmp_gtid;
+#endif
+extern int __kmp_tls_gtid_min; /* #threads below which use sp search for gtid */
+extern int __kmp_foreign_tp; // If true, separate TP var for each foreign thread
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+extern int __kmp_inherit_fp_control; // copy fp creg(s) parent->workers at fork
+extern kmp_int16 __kmp_init_x87_fpu_control_word; // init thread's FP ctrl reg
+extern kmp_uint32 __kmp_init_mxcsr; /* init thread's MXCSR */
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+// max_active_levels for nested parallelism enabled by default via
+// OMP_MAX_ACTIVE_LEVELS, OMP_NESTED, OMP_NUM_THREADS, and OMP_PROC_BIND
+extern int __kmp_dflt_max_active_levels;
+// Indicates whether value of __kmp_dflt_max_active_levels was already
+// explicitly set by OMP_MAX_ACTIVE_LEVELS or OMP_NESTED=false
+extern bool __kmp_dflt_max_active_levels_set;
+extern int __kmp_dispatch_num_buffers; /* max possible dynamic loops in
+                                          concurrent execution per team */
+#if KMP_NESTED_HOT_TEAMS
+// Nested hot teams settings; only declared when KMP_NESTED_HOT_TEAMS is set.
+extern int __kmp_hot_teams_mode;
+extern int __kmp_hot_teams_max_level;
+#endif
+
+#if KMP_OS_LINUX
+// Linux-only clock selection and its parameter.
+extern enum clock_function_type __kmp_clock_function;
+extern int __kmp_clock_function_param;
+#endif /* KMP_OS_LINUX */
+
+#if KMP_MIC_SUPPORTED
+// Only declared when KMP_MIC_SUPPORTED is set.
+extern enum mic_type __kmp_mic_type;
+#endif
+
+#ifdef USE_LOAD_BALANCE
+extern double __kmp_load_balance_interval; // load balance algorithm interval
+#endif /* USE_LOAD_BALANCE */
+
+// OpenMP 3.1 - Nested num threads array
+// NOTE(review): field meanings below are inferred from their names (nth =
+// per-level thread counts, size = allocated entries, used = entries in use)
+// -- confirm against the code that fills __kmp_nested_nth.
+typedef struct kmp_nested_nthreads_t {
+  int *nth;
+  int size;
+  int used;
+} kmp_nested_nthreads_t;
+
+// The single global instance of the structure above.
+extern kmp_nested_nthreads_t __kmp_nested_nth;
+
+// Everything in this section exists only when KMP_USE_ADAPTIVE_LOCKS is set.
+#if KMP_USE_ADAPTIVE_LOCKS
+
+// Parameters for the speculative lock backoff system.
+struct kmp_adaptive_backoff_params_t {
+  // Number of soft retries before it counts as a hard retry.
+  kmp_uint32 max_soft_retries;
+  // Badness is a bit mask : 0,1,3,7,15,... on each hard failure we move one to
+  // the right
+  kmp_uint32 max_badness;
+};
+
+// The global instance of the backoff parameters.
+extern kmp_adaptive_backoff_params_t __kmp_adaptive_backoff_params;
+
+#if KMP_DEBUG_ADAPTIVE_LOCKS
+// NOTE(review): presumably the path of the file that receives speculative
+// lock statistics -- confirm.
+extern const char *__kmp_speculative_statsfile;
+#endif
+
+#endif // KMP_USE_ADAPTIVE_LOCKS
+
+// Boolean-valued (TRUE/FALSE) int flags for environment display and
+// cancellation support.
+extern int __kmp_display_env; /* TRUE or FALSE */
+extern int __kmp_display_env_verbose; /* TRUE if OMP_DISPLAY_ENV=VERBOSE */
+extern int __kmp_omp_cancellation; /* TRUE or FALSE */
+
+/* ------------------------------------------------------------------------- */
+
+/* the following are protected by the fork/join lock */
+/* write: lock  read: anytime */
+// Indexed by gtid throughout this header (see the __kmp_*_from_gtid helpers).
+extern kmp_info_t **__kmp_threads; /* Descriptors for the threads */
+/* read/write: lock */
+extern volatile kmp_team_t *__kmp_team_pool;
+extern volatile kmp_info_t *__kmp_thread_pool;
+extern kmp_info_t *__kmp_thread_pool_insert_pt;
+
+// total num threads reachable from some root thread including all root threads
+extern volatile int __kmp_nth;
+/* total number of threads reachable from some root thread including all root
+   threads, and those in the thread pool */
+extern volatile int __kmp_all_nth;
+extern std::atomic<int> __kmp_thread_pool_active_nth;
+
+// Indexed by gtid in KMP_UBER_GTID().
+extern kmp_root_t **__kmp_root; /* root of thread hierarchy */
+/* end data protected by fork/join lock */
+/* ------------------------------------------------------------------------- */
+
+// Convenience accessors for the calling thread.
+// __kmp_get_gtid() and __kmp_entry_gtid() forward to
+// __kmp_get_global_thread_id() and __kmp_get_global_thread_id_reg(); the other
+// three are built on top of __kmp_get_gtid().
+#define __kmp_get_gtid() __kmp_get_global_thread_id()
+#define __kmp_entry_gtid() __kmp_get_global_thread_id_reg()
+#define __kmp_get_tid() (__kmp_tid_from_gtid(__kmp_get_gtid()))
+#define __kmp_get_team() (__kmp_threads[(__kmp_get_gtid())]->th.th_team)
+#define __kmp_get_thread() (__kmp_thread_from_gtid(__kmp_get_gtid()))
+
+// AT: Which way is correct?
+// AT: 1. nproc = __kmp_threads[ ( gtid ) ] -> th.th_team -> t.t_nproc;
+// AT: 2. nproc = __kmp_threads[ ( gtid ) ] -> th.th_team_nproc;
+// The macro below implements variant 1: it reads t_nproc from the team that
+// thread `gtid` currently points at.
+#define __kmp_get_team_num_threads(gtid)                                       \
+  (__kmp_threads[(gtid)]->th.th_team->t.t_nproc)
+
+// Reports whether `gtid` names the uber thread of a root: true only when gtid
+// is non-negative, both the root slot and the thread slot for gtid are
+// populated, and the thread is that root's r_uber_thread.
+// In debug builds gtid is asserted to lie in
+// [KMP_GTID_MIN, __kmp_threads_capacity).
+static inline bool KMP_UBER_GTID(int gtid) {
+  KMP_DEBUG_ASSERT(gtid >= KMP_GTID_MIN);
+  KMP_DEBUG_ASSERT(gtid < __kmp_threads_capacity);
+  // Negative gtids pass the first assert but can never be uber threads.
+  if (gtid < 0)
+    return false;
+  kmp_root_t *root = __kmp_root[gtid];
+  if (!root)
+    return false;
+  kmp_info_t *thr = __kmp_threads[gtid];
+  return thr && thr == root->r.r_uber_thread;
+}
+
+// Maps a global thread id to the ds_tid stored in that thread's descriptor.
+// gtid must be non-negative (asserted in debug builds) and its slot in
+// __kmp_threads must be populated; no NULL check is performed.
+static inline int __kmp_tid_from_gtid(int gtid) {
+  KMP_DEBUG_ASSERT(gtid >= 0);
+  kmp_info_t *thr = __kmp_threads[gtid];
+  return thr->th.th_info.ds.ds_tid;
+}
+
+// Maps a team-local thread id to the ds_gtid of the thread occupying slot
+// `tid` in team->t.t_threads. tid must be non-negative and team non-NULL
+// (both asserted in debug builds); tid is not range-checked.
+static inline int __kmp_gtid_from_tid(int tid, const kmp_team_t *team) {
+  KMP_DEBUG_ASSERT(tid >= 0 && team);
+  const kmp_info_t *member = team->t.t_threads[tid];
+  return member->th.th_info.ds.ds_gtid;
+}
+
+// Returns the ds_gtid recorded in the given thread descriptor. `thr` must be
+// non-NULL (asserted in debug builds).
+static inline int __kmp_gtid_from_thread(const kmp_info_t *thr) {
+  KMP_DEBUG_ASSERT(thr);
+  const int gtid = thr->th.th_info.ds.ds_gtid;
+  return gtid;
+}
+
+// Looks up the thread descriptor stored at __kmp_threads[gtid]. gtid must be
+// non-negative (asserted in debug builds); the returned pointer is whatever
+// the table holds for that slot.
+static inline kmp_info_t *__kmp_thread_from_gtid(int gtid) {
+  KMP_DEBUG_ASSERT(gtid >= 0);
+  kmp_info_t *thr = __kmp_threads[gtid];
+  return thr;
+}
+
+// Returns the th_team pointer of the thread stored at __kmp_threads[gtid].
+// gtid must be non-negative (asserted in debug builds) and its slot must be
+// populated; no NULL check is performed.
+static inline kmp_team_t *__kmp_team_from_gtid(int gtid) {
+  KMP_DEBUG_ASSERT(gtid >= 0);
+  kmp_info_t *thr = __kmp_threads[gtid];
+  return thr->th.th_team;
+}
+
+/* ------------------------------------------------------------------------- */
+
+extern kmp_global_t __kmp_global; /* global status */
+
+// Thread descriptor object (not a pointer) for the monitor.
+extern kmp_info_t __kmp_monitor;
+// For Debugging Support Library
+// (used by KMP_GEN_TEAM_ID())
+extern std::atomic<kmp_int32> __kmp_team_counter;
+// For Debugging Support Library
+// (used by KMP_GEN_TASK_ID())
+extern std::atomic<kmp_int32> __kmp_task_counter;
+
+// _KMP_GEN_ID(counter): produce an id from the given atomic counter.
+// With USE_DEBUGGER, the id is KMP_ATOMIC_INC(&(counter)) + 1 when
+// __kmp_debugging is non-zero and ~0 otherwise; without USE_DEBUGGER it is
+// always ~0 and `counter` is not touched.
+// NOTE(review): KMP_ATOMIC_INC presumably returns the value before the
+// increment, hence the "+ 1" -- confirm.
+// The macro argument is parenthesized so that any lvalue expression, not just
+// a plain identifier, can be passed safely.
+#if USE_DEBUGGER
+#define _KMP_GEN_ID(counter)                                                   \
+  (__kmp_debugging ? KMP_ATOMIC_INC(&(counter)) + 1 : ~0)
+#else
+#define _KMP_GEN_ID(counter) (~0)
+#endif /* USE_DEBUGGER */
+
+// Id generators bound to the task and team counters.
+#define KMP_GEN_TASK_ID() _KMP_GEN_ID(__kmp_task_counter)
+#define KMP_GEN_TEAM_ID() _KMP_GEN_ID(__kmp_team_counter)
+
+/* ------------------------------------------------------------------------ */
+
+// printf-style (format, ...) storage map printer for the range [p1, p2) /
+// size given by the caller.
+// NOTE(review): the exact meaning of p1/p2/size is not visible here -- confirm.
+extern void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2,
+                                         size_t size, char const *format, ...);
+
+// Staged runtime initialization entry points.
+extern void __kmp_serial_initialize(void);
+extern void __kmp_middle_initialize(void);
+extern void __kmp_parallel_initialize(void);
+
+// Runtime begin / shutdown entry points.
+extern void __kmp_internal_begin(void);
+extern void __kmp_internal_end_library(int gtid);
+extern void __kmp_internal_end_thread(int gtid);
+extern void __kmp_internal_end_atexit(void);
+extern void __kmp_internal_end_fini(void);
+extern void __kmp_internal_end_dtor(void);
+extern void __kmp_internal_end_dest(void *);
+
+// Root registration.
+// NOTE(review): __kmp_register_root presumably returns the new root's gtid
+// -- confirm.
+extern int __kmp_register_root(int initial_thread);
+extern void __kmp_unregister_root(int gtid);
+
+extern int __kmp_ignore_mppbeg(void);
+extern int __kmp_ignore_mppend(void);
+
+extern int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws);
+extern void __kmp_exit_single(int gtid);
+
+extern void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
+extern void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
+
+#ifdef USE_LOAD_BALANCE
+extern int __kmp_get_load_balance(int);
+#endif
+
+// Targets of the __kmp_get_gtid() / __kmp_entry_gtid() macros.
+extern int __kmp_get_global_thread_id(void);
+extern int __kmp_get_global_thread_id_reg(void);
+extern void __kmp_exit_thread(int exit_status);
+// printf-style diagnostics; __kmp_abort_process is declared KMP_NORETURN.
+extern void __kmp_abort(char const *format, ...);
+extern void __kmp_abort_thread(void);
+KMP_NORETURN extern void __kmp_abort_process(void);
+extern void __kmp_warn(char const *format, ...);
+
+extern void __kmp_set_num_threads(int new_nth, int gtid);
+
+// Fetches the descriptor (kmp_info_t *) of the calling thread by resolving
+// its gtid through __kmp_entry_gtid() and indexing __kmp_threads with it.
+// Precondition: the calling thread *must* already be registered.
+static inline kmp_info_t *__kmp_entry_thread() {
+  return __kmp_threads[__kmp_entry_gtid()];
+}
+
+// Per-thread (gtid-addressed) getters and setters for nesting levels, team
+// geometry and the schedule.
+extern void __kmp_set_max_active_levels(int gtid, int new_max_active_levels);
+extern int __kmp_get_max_active_levels(int gtid);
+extern int __kmp_get_ancestor_thread_num(int gtid, int level);
+extern int __kmp_get_team_size(int gtid, int level);
+extern void __kmp_set_schedule(int gtid, kmp_sched_t new_sched, int chunk);
+// Results are returned through the sched and chunk out-parameters.
+extern void __kmp_get_schedule(int gtid, kmp_sched_t *sched, int *chunk);
+
+// Per-thread random number helpers.
+extern unsigned short __kmp_get_random(kmp_info_t *thread);
+extern void __kmp_init_random(kmp_info_t *thread);
+
+extern kmp_r_sched_t __kmp_get_schedule_global(void);
+extern void __kmp_adjust_num_threads(int new_nproc);
+// Takes the value by pointer.
+// NOTE(review): presumably adjusts *val in place to a valid stack size --
+// confirm.
+extern void __kmp_check_stksize(size_t *val);
+
+// Internal allocators. The triple-underscore functions take an extra
+// KMP_SRC_LOC_DECL parameter pack; callers use the double-underscore wrapper
+// macros, which append KMP_SRC_LOC_CURR.
+// NOTE(review): KMP_SRC_LOC_DECL / KMP_SRC_LOC_CURR are defined elsewhere;
+// presumably they carry the caller's file/line in debug builds and expand to
+// nothing otherwise -- confirm.
+extern void *___kmp_allocate(size_t size KMP_SRC_LOC_DECL);
+extern void *___kmp_page_allocate(size_t size KMP_SRC_LOC_DECL);
+extern void ___kmp_free(void *ptr KMP_SRC_LOC_DECL);
+#define __kmp_allocate(size) ___kmp_allocate((size)KMP_SRC_LOC_CURR)
+#define __kmp_page_allocate(size) ___kmp_page_allocate((size)KMP_SRC_LOC_CURR)
+#define __kmp_free(ptr) ___kmp_free((ptr)KMP_SRC_LOC_CURR)
+
+#if USE_FAST_MEMORY
+// Per-thread "fast" allocator; only available when USE_FAST_MEMORY is set.
+extern void *___kmp_fast_allocate(kmp_info_t *this_thr,
+                                  size_t size KMP_SRC_LOC_DECL);
+extern void ___kmp_fast_free(kmp_info_t *this_thr, void *ptr KMP_SRC_LOC_DECL);
+extern void __kmp_free_fast_memory(kmp_info_t *this_thr);
+extern void __kmp_initialize_fast_memory(kmp_info_t *this_thr);
+#define __kmp_fast_allocate(this_thr, size)                                    \
+  ___kmp_fast_allocate((this_thr), (size)KMP_SRC_LOC_CURR)
+#define __kmp_fast_free(this_thr, ptr)                                         \
+  ___kmp_fast_free((this_thr), (ptr)KMP_SRC_LOC_CURR)
+#endif
+
+// Per-thread malloc/calloc/realloc/free family, same wrapper convention.
+extern void *___kmp_thread_malloc(kmp_info_t *th, size_t size KMP_SRC_LOC_DECL);
+extern void *___kmp_thread_calloc(kmp_info_t *th, size_t nelem,
+                                  size_t elsize KMP_SRC_LOC_DECL);
+extern void *___kmp_thread_realloc(kmp_info_t *th, void *ptr,
+                                   size_t size KMP_SRC_LOC_DECL);
+extern void ___kmp_thread_free(kmp_info_t *th, void *ptr KMP_SRC_LOC_DECL);
+#define __kmp_thread_malloc(th, size)                                          \
+  ___kmp_thread_malloc((th), (size)KMP_SRC_LOC_CURR)
+#define __kmp_thread_calloc(th, nelem, elsize)                                 \
+  ___kmp_thread_calloc((th), (nelem), (elsize)KMP_SRC_LOC_CURR)
+#define __kmp_thread_realloc(th, ptr, size)                                    \
+  ___kmp_thread_realloc((th), (ptr), (size)KMP_SRC_LOC_CURR)
+#define __kmp_thread_free(th, ptr)                                             \
+  ___kmp_thread_free((th), (ptr)KMP_SRC_LOC_CURR)
+
+// Direct pass-throughs to the C library allocator.
+#define KMP_INTERNAL_MALLOC(sz) malloc(sz)
+#define KMP_INTERNAL_FREE(p) free(p)
+#define KMP_INTERNAL_REALLOC(p, sz) realloc((p), (sz))
+#define KMP_INTERNAL_CALLOC(n, sz) calloc((n), (sz))
+
+// "Push" entry points taking a source location, a gtid and the requested
+// value(s).
+// NOTE(review): presumably they stash the request for the next parallel/teams
+// construct started by thread gtid -- confirm.
+extern void __kmp_push_num_threads(ident_t *loc, int gtid, int num_threads);
+
+extern void __kmp_push_proc_bind(ident_t *loc, int gtid,
+                                 kmp_proc_bind_t proc_bind);
+extern void __kmp_push_num_teams(ident_t *loc, int gtid, int num_teams,
+                                 int num_threads);
+
+extern void __kmp_yield();
+
+// Loop dispatch entry points. The suffix selects the bound type: _4 / _4u use
+// kmp_int32 / kmp_uint32 bounds, _8 / _8u use kmp_int64 / kmp_uint64 bounds.
+// Stride and chunk are always signed (kmp_int32 or kmp_int64 respectively).
+
+// init: takes the schedule kind, lower bound, upper bound, stride and chunk.
+extern void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
+                                   enum sched_type schedule, kmp_int32 lb,
+                                   kmp_int32 ub, kmp_int32 st, kmp_int32 chunk);
+extern void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
+                                    enum sched_type schedule, kmp_uint32 lb,
+                                    kmp_uint32 ub, kmp_int32 st,
+                                    kmp_int32 chunk);
+extern void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
+                                   enum sched_type schedule, kmp_int64 lb,
+                                   kmp_int64 ub, kmp_int64 st, kmp_int64 chunk);
+extern void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
+                                    enum sched_type schedule, kmp_uint64 lb,
+                                    kmp_uint64 ub, kmp_int64 st,
+                                    kmp_int64 chunk);
+
+// next: reports results through the p_last, p_lb, p_ub and p_st pointers.
+// NOTE(review): the int return presumably signals whether another chunk was
+// handed out -- confirm.
+extern int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid,
+                                  kmp_int32 *p_last, kmp_int32 *p_lb,
+                                  kmp_int32 *p_ub, kmp_int32 *p_st);
+extern int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid,
+                                   kmp_int32 *p_last, kmp_uint32 *p_lb,
+                                   kmp_uint32 *p_ub, kmp_int32 *p_st);
+extern int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid,
+                                  kmp_int32 *p_last, kmp_int64 *p_lb,
+                                  kmp_int64 *p_ub, kmp_int64 *p_st);
+extern int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid,
+                                   kmp_int32 *p_last, kmp_uint64 *p_lb,
+                                   kmp_uint64 *p_ub, kmp_int64 *p_st);
+
+// fini: one per bound type, taking only the location and gtid.
+extern void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid);
+extern void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid);
+extern void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid);
+extern void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid);
+
+// Dispatch helpers that exist only when KMP_GOMP_COMPAT is defined. The init
+// variants mirror __kmpc_dispatch_init_* with one extra parameter, push_ws.
+#ifdef KMP_GOMP_COMPAT
+
+extern void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
+                                      enum sched_type schedule, kmp_int32 lb,
+                                      kmp_int32 ub, kmp_int32 st,
+                                      kmp_int32 chunk, int push_ws);
+extern void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
+                                       enum sched_type schedule, kmp_uint32 lb,
+                                       kmp_uint32 ub, kmp_int32 st,
+                                       kmp_int32 chunk, int push_ws);
+extern void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
+                                      enum sched_type schedule, kmp_int64 lb,
+                                      kmp_int64 ub, kmp_int64 st,
+                                      kmp_int64 chunk, int push_ws);
+extern void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
+                                       enum sched_type schedule, kmp_uint64 lb,
+                                       kmp_uint64 ub, kmp_int64 st,
+                                       kmp_int64 chunk, int push_ws);
+// Per-chunk finish hooks, one per bound type.
+extern void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid);
+extern void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid);
+extern void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid);
+extern void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid);
+
+#endif /* KMP_GOMP_COMPAT */
+
+// Binary predicates over (value, checker). Their signature matches the `pred`
+// parameter of __kmp_wait_4 below.
+// NOTE(review): the names suggest ==, !=, <, >= and <= respectively --
+// confirm.
+extern kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker);
+extern kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker);
+extern kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker);
+extern kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker);
+extern kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker);
+// Wait routines parameterized by a spin location, a checker value and a
+// predicate; the _ptr variant's predicate receives the spinner pointer itself.
+extern kmp_uint32 __kmp_wait_4(kmp_uint32 volatile *spinner, kmp_uint32 checker,
+                               kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
+                               void *obj);
+extern void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
+                             kmp_uint32 (*pred)(void *, kmp_uint32), void *obj);
+
+// Forward declarations of the flag classes (defined elsewhere).
+class kmp_flag_32;
+class kmp_flag_64;
+class kmp_flag_oncore;
+// The itt_sync_obj parameter exists only when built with USE_ITT_BUILD.
+extern void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64 *flag,
+                          int final_spin
+#if USE_ITT_BUILD
+                          ,
+                          void *itt_sync_obj
+#endif
+                          );
+extern void __kmp_release_64(kmp_flag_64 *flag);
+
+extern void __kmp_infinite_loop(void);
+
+extern void __kmp_cleanup(void);
+
+#if KMP_HANDLE_SIGNALS
+// Signal handling support; only declared when KMP_HANDLE_SIGNALS is set.
+extern int __kmp_handle_signals;
+extern void __kmp_install_signals(int parallel_init);
+extern void __kmp_remove_signals(void);
+#endif
+
+// System time helpers; __kmp_read_system_time writes its result to *delta.
+extern void __kmp_clear_system_time(void);
+extern void __kmp_read_system_time(double *delta);
+
+extern void __kmp_check_stack_overlap(kmp_info_t *thr);
+
+// Name expansion into caller-provided buffers of the given size.
+extern void __kmp_expand_host_name(char *buffer, size_t size);
+extern void __kmp_expand_file_name(char *result, size_t rlen, char *pattern);
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+extern void
+__kmp_initialize_system_tick(void); /* Initialize timer tick value */
+#endif
+
+extern void
+__kmp_runtime_initialize(void); /* machine specific initialization */
+extern void __kmp_runtime_destroy(void);
+
+// Affinity interface; everything inside this guard exists only when
+// KMP_AFFINITY_SUPPORTED is set.
+#if KMP_AFFINITY_SUPPORTED
+// Mask formatting, into a raw char buffer or a kmp_str_buf_t.
+extern char *__kmp_affinity_print_mask(char *buf, int buf_len,
+                                       kmp_affin_mask_t *mask);
+extern kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
+                                                  kmp_affin_mask_t *mask);
+extern void __kmp_affinity_initialize(void);
+extern void __kmp_affinity_uninitialize(void);
+extern void __kmp_affinity_set_init_mask(
+    int gtid, int isa_root); /* set affinity according to KMP_AFFINITY */
+extern void __kmp_affinity_set_place(int gtid);
+extern void __kmp_affinity_determine_capable(const char *env_var);
+// Mask manipulation helpers operating on an opaque void ** mask handle.
+extern int __kmp_aux_set_affinity(void **mask);
+extern int __kmp_aux_get_affinity(void **mask);
+extern int __kmp_aux_get_affinity_max_proc();
+extern int __kmp_aux_set_affinity_mask_proc(int proc, void **mask);
+extern int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask);
+extern int __kmp_aux_get_affinity_mask_proc(int proc, void **mask);
+extern void __kmp_balanced_affinity(kmp_info_t *th, int team_size);
+#if KMP_OS_LINUX
+extern int kmp_set_thread_affinity_mask_initial(void);
+#endif
+#endif /* KMP_AFFINITY_SUPPORTED */
+// No need for KMP_AFFINITY_SUPPORTED guard as only one field in the
+// format string is for affinity, so platforms that do not support
+// affinity can still use the other fields, e.g., %n for num_threads
+extern size_t __kmp_aux_capture_affinity(int gtid, const char *format,
+                                         kmp_str_buf_t *buffer);
+extern void __kmp_aux_display_affinity(int gtid, const char *format);
+
+// Barrier hierarchy helpers.
+extern void __kmp_cleanup_hierarchy();
+extern void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar);
+
+#if KMP_USE_FUTEX
+
+// Only declared when KMP_USE_FUTEX is set.
+extern int __kmp_futex_determine_capable(void);
+
+#endif // KMP_USE_FUTEX
+
+// Store / fetch the calling thread's gtid in thread-specific storage.
+extern void __kmp_gtid_set_specific(int gtid);
+extern int __kmp_gtid_get_specific(void);
+
+extern double __kmp_read_cpu_time(void);
+
+// Fills *info.
+// NOTE(review): the int return is presumably a status code -- confirm.
+extern int __kmp_read_system_info(struct kmp_sys_info *info);
+
+#if KMP_USE_MONITOR
+extern void __kmp_create_monitor(kmp_info_t *th);
+#endif
+
+// Thread creation and the routine run by a launched thread.
+extern void *__kmp_launch_thread(kmp_info_t *thr);
+
+extern void __kmp_create_worker(int gtid, kmp_info_t *th, size_t stack_size);
+
+#if KMP_OS_WINDOWS
+// Windows-only thread liveness / handle helpers.
+extern int __kmp_still_running(kmp_info_t *th);
+extern int __kmp_is_thread_alive(kmp_info_t *th, DWORD *exit_val);
+extern void __kmp_free_handle(kmp_thread_t tHandle);
+#endif
+
+// Thread teardown.
+#if KMP_USE_MONITOR
+extern void __kmp_reap_monitor(kmp_info_t *th);
+#endif
+extern void __kmp_reap_worker(kmp_info_t *th);
+extern void __kmp_terminate_thread(int gtid);
+
+// Per-thread suspend mutex operations (try-lock, lock, unlock).
+extern int __kmp_try_suspend_mx(kmp_info_t *th);
+extern void __kmp_lock_suspend_mx(kmp_info_t *th);
+extern void __kmp_unlock_suspend_mx(kmp_info_t *th);
+
+// Suspend / resume on a flag, one overload set per flag class (32-bit, 64-bit,
+// oncore). Suspend takes the caller's gtid, resume the target thread's gtid.
+extern void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag);
+extern void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag);
+extern void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag);
+extern void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag);
+extern void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag);
+extern void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag);
+
+// Timing helpers writing their result through the double * argument.
+extern void __kmp_elapsed(double *);
+extern void __kmp_elapsed_tick(double *);
+
+// __kmp_disable stores the previous state in *old_state; __kmp_enable takes
+// a previously saved state.
+// NOTE(review): what is being enabled/disabled is not visible here -- confirm.
+extern void __kmp_enable(int old_state);
+extern void __kmp_disable(int *old_state);
+
+extern void __kmp_thread_sleep(int millis);
+
+extern void __kmp_common_initialize(void);
+extern void __kmp_common_destroy(void);
+extern void __kmp_common_destroy_gtid(int gtid);
+
+#if KMP_OS_UNIX
+extern void __kmp_register_atfork(void);
+#endif
+// Suspend machinery setup / teardown, global and per thread.
+extern void __kmp_suspend_initialize(void);
+extern void __kmp_suspend_initialize_thread(kmp_info_t *th);
+extern void __kmp_suspend_uninitialize_thread(kmp_info_t *th);
+
+// Thread and team allocation / release.
+extern kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
+                                         int tid);
+// The ompt_parallel_data parameter exists only when built with OMPT_SUPPORT;
+// USE_NESTED_HOT_ARG(...) conditionally appends a trailing kmp_info_t *
+// parameter (the macro is defined elsewhere).
+extern kmp_team_t *
+__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
+#if OMPT_SUPPORT
+                    ompt_data_t ompt_parallel_data,
+#endif
+                    kmp_proc_bind_t proc_bind, kmp_internal_control_t *new_icvs,
+                    int argc USE_NESTED_HOT_ARG(kmp_info_t *thr));
+extern void __kmp_free_thread(kmp_info_t *);
+extern void __kmp_free_team(kmp_root_t *,
+                            kmp_team_t *USE_NESTED_HOT_ARG(kmp_info_t *));
+// NOTE(review): the returned kmp_team_t * is presumably the next team in the
+// pool -- confirm.
+extern kmp_team_t *__kmp_reap_team(kmp_team_t *);
+
+/* ------------------------------------------------------------------------ */
+
+// Per-thread bget allocator setup / teardown.
+extern void __kmp_initialize_bget(kmp_info_t *th);
+extern void __kmp_finalize_bget(kmp_info_t *th);
+
+// Exported (KMP_EXPORT) allocation entry points with malloc-family
+// signatures, plus an aligned variant.
+KMP_EXPORT void *kmpc_malloc(size_t size);
+KMP_EXPORT void *kmpc_aligned_malloc(size_t size, size_t alignment);
+KMP_EXPORT void *kmpc_calloc(size_t nelem, size_t elsize);
+KMP_EXPORT void *kmpc_realloc(void *ptr, size_t size);
+KMP_EXPORT void kmpc_free(void *ptr);
+
+/* declarations for internal use */
+
+// Barrier entry point. Takes the barrier type, the caller's gtid, a split
+// flag, and an optional reduction described by (reduce_size, reduce_data,
+// reduce callback taking two void * operands).
+// NOTE(review): the meaning of the int return value is not visible here --
+// confirm.
+extern int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
+                         size_t reduce_size, void *reduce_data,
+                         void (*reduce)(void *, void *));
+extern void __kmp_end_split_barrier(enum barrier_type bt, int gtid);
+extern int __kmp_barrier_gomp_cancel(int gtid);
+
+/*!
+ * Tell the fork call which compiler generated the fork call, and therefore how
+ * to deal with the call.
+ */
+enum fork_context_e {
+  fork_context_gnu, /**< Called from GNU generated code, so must not invoke the
+                       microtask internally. */
+  fork_context_intel, /**< Called from Intel generated code.  */
+  fork_context_last /**< Number of real contexts; not a context itself. */
+};
+// Fork routine declared for internal use. Note that the type of the final
+// parameter depends on the build: it is a pointer to va_list on Linux builds
+// for ARM, x86_64 and AArch64, and a va_list passed by value everywhere else
+// (see the TODO inside the parameter list).
+extern int __kmp_fork_call(ident_t *loc, int gtid,
+                           enum fork_context_e fork_context, kmp_int32 argc,
+                           microtask_t microtask, launch_t invoker,
+/* TODO: revert workaround for Intel(R) 64 tracker #96 */
+#if (KMP_ARCH_ARM || KMP_ARCH_X86_64 || KMP_ARCH_AARCH64) && KMP_OS_LINUX
+                           va_list *ap
+#else
+                           va_list ap
+#endif
+                           );
+
+// Join routine declared for internal use. The fork_context parameter is part
+// of the signature only when OMPT_SUPPORT is enabled; exit_teams has a default
+// of 0, so callers may omit it.
+extern void __kmp_join_call(ident_t *loc, int gtid
+#if OMPT_SUPPORT
+                            ,
+                            enum fork_context_e fork_context
+#endif
+                            ,
+                            int exit_teams = 0);
+
+extern void __kmp_serialized_parallel(ident_t *id, kmp_int32 gtid);
+extern void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team);
+extern void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team);
+extern int __kmp_invoke_task_func(int gtid);
+extern void __kmp_run_before_invoked_task(int gtid, int tid,
+                                          kmp_info_t *this_thr,
+                                          kmp_team_t *team);
+extern void __kmp_run_after_invoked_task(int gtid, int tid,
+                                         kmp_info_t *this_thr,
+                                         kmp_team_t *team);
+
+// should never have been exported
+KMP_EXPORT int __kmpc_invoke_task_func(int gtid);
+extern int __kmp_invoke_teams_master(int gtid);
+extern void __kmp_teams_master(int gtid);
+extern int __kmp_aux_get_team_num();
+extern int __kmp_aux_get_num_teams();
+extern void __kmp_save_internal_controls(kmp_info_t *thread);
+extern void __kmp_user_set_library(enum library_type arg);
+extern void __kmp_aux_set_library(enum library_type arg);
+extern void __kmp_aux_set_stacksize(size_t arg);
+extern void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid);
+extern void __kmp_aux_set_defaults(char const *str, int len);
+
+/* Functions called from __kmp_aux_env_initialize() in kmp_settings.cpp */
+void kmpc_set_blocktime(int arg);
+void ompc_set_nested(int flag);
+void ompc_set_dynamic(int flag);
+void ompc_set_num_threads(int arg);
+
+extern void __kmp_push_current_task_to_thread(kmp_info_t *this_thr,
+                                              kmp_team_t *team, int tid);
+extern void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr);
+extern kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
+                                    kmp_tasking_flags_t *flags,
+                                    size_t sizeof_kmp_task_t,
+                                    size_t sizeof_shareds,
+                                    kmp_routine_entry_t task_entry);
+extern void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
+                                     kmp_team_t *team, int tid,
+                                     int set_curr_task);
+extern void __kmp_finish_implicit_task(kmp_info_t *this_thr);
+extern void __kmp_free_implicit_task(kmp_info_t *this_thr);
+
+extern kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref,
+                                                       int gtid,
+                                                       kmp_task_t *task);
+extern void __kmp_fulfill_event(kmp_event_t *event);
+
+int __kmp_execute_tasks_32(kmp_info_t *thread, kmp_int32 gtid,
+                           kmp_flag_32 *flag, int final_spin,
+                           int *thread_finished,
+#if USE_ITT_BUILD
+                           void *itt_sync_obj,
+#endif /* USE_ITT_BUILD */
+                           kmp_int32 is_constrained);
+int __kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid,
+                           kmp_flag_64 *flag, int final_spin,
+                           int *thread_finished,
+#if USE_ITT_BUILD
+                           void *itt_sync_obj,
+#endif /* USE_ITT_BUILD */
+                           kmp_int32 is_constrained);
+int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid,
+                               kmp_flag_oncore *flag, int final_spin,
+                               int *thread_finished,
+#if USE_ITT_BUILD
+                               void *itt_sync_obj,
+#endif /* USE_ITT_BUILD */
+                               kmp_int32 is_constrained);
+
+extern void __kmp_free_task_team(kmp_info_t *thread,
+                                 kmp_task_team_t *task_team);
+extern void __kmp_reap_task_teams(void);
+extern void __kmp_wait_to_unref_task_teams(void);
+extern void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team,
+                                  int always);
+extern void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team);
+// The itt_sync_obj parameter is part of the signature only when USE_ITT_BUILD
+// is enabled; the trailing `wait` parameter defaults to 1, so callers may omit
+// it.
+extern void __kmp_task_team_wait(kmp_info_t *this_thr, kmp_team_t *team
+#if USE_ITT_BUILD
+                                 ,
+                                 void *itt_sync_obj
+#endif /* USE_ITT_BUILD */
+                                 ,
+                                 int wait = 1);
+extern void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread,
+                                  int gtid);
+
+extern int __kmp_is_address_mapped(void *addr);
+extern kmp_uint64 __kmp_hardware_timestamp(void);
+
+#if KMP_OS_UNIX
+extern int __kmp_read_from_file(char const *path, char const *format, ...);
+#endif
+
+/* ------------------------------------------------------------------------ */
+//
+// Assembly routines that have no compiler intrinsic replacement
+//
+
+// Declaration of the microtask invocation routine. The exit_frame_ptr
+// parameter is part of the signature only when OMPT_SUPPORT is enabled.
+extern int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int npr, int argc,
+                                  void *argv[]
+#if OMPT_SUPPORT
+                                  ,
+                                  void **exit_frame_ptr
+#endif
+                                  );
+
+/* ------------------------------------------------------------------------ */
+
+KMP_EXPORT void __kmpc_begin(ident_t *, kmp_int32 flags);
+KMP_EXPORT void __kmpc_end(ident_t *);
+
+KMP_EXPORT void __kmpc_threadprivate_register_vec(ident_t *, void *data,
+                                                  kmpc_ctor_vec ctor,
+                                                  kmpc_cctor_vec cctor,
+                                                  kmpc_dtor_vec dtor,
+                                                  size_t vector_length);
+KMP_EXPORT void __kmpc_threadprivate_register(ident_t *, void *data,
+                                              kmpc_ctor ctor, kmpc_cctor cctor,
+                                              kmpc_dtor dtor);
+KMP_EXPORT void *__kmpc_threadprivate(ident_t *, kmp_int32 global_tid,
+                                      void *data, size_t size);
+
+KMP_EXPORT kmp_int32 __kmpc_global_thread_num(ident_t *);
+KMP_EXPORT kmp_int32 __kmpc_global_num_threads(ident_t *);
+KMP_EXPORT kmp_int32 __kmpc_bound_thread_num(ident_t *);
+KMP_EXPORT kmp_int32 __kmpc_bound_num_threads(ident_t *);
+
+KMP_EXPORT kmp_int32 __kmpc_ok_to_fork(ident_t *);
+KMP_EXPORT void __kmpc_fork_call(ident_t *, kmp_int32 nargs,
+                                 kmpc_micro microtask, ...);
+
+KMP_EXPORT void __kmpc_serialized_parallel(ident_t *, kmp_int32 global_tid);
+KMP_EXPORT void __kmpc_end_serialized_parallel(ident_t *, kmp_int32 global_tid);
+
+KMP_EXPORT void __kmpc_flush(ident_t *);
+KMP_EXPORT void __kmpc_barrier(ident_t *, kmp_int32 global_tid);
+KMP_EXPORT kmp_int32 __kmpc_master(ident_t *, kmp_int32 global_tid);
+KMP_EXPORT void __kmpc_end_master(ident_t *, kmp_int32 global_tid);
+KMP_EXPORT void __kmpc_ordered(ident_t *, kmp_int32 global_tid);
+KMP_EXPORT void __kmpc_end_ordered(ident_t *, kmp_int32 global_tid);
+KMP_EXPORT void __kmpc_critical(ident_t *, kmp_int32 global_tid,
+                                kmp_critical_name *);
+KMP_EXPORT void __kmpc_end_critical(ident_t *, kmp_int32 global_tid,
+                                    kmp_critical_name *);
+KMP_EXPORT void __kmpc_critical_with_hint(ident_t *, kmp_int32 global_tid,
+                                          kmp_critical_name *, uint32_t hint);
+
+KMP_EXPORT kmp_int32 __kmpc_barrier_master(ident_t *, kmp_int32 global_tid);
+KMP_EXPORT void __kmpc_end_barrier_master(ident_t *, kmp_int32 global_tid);
+
+KMP_EXPORT kmp_int32 __kmpc_barrier_master_nowait(ident_t *,
+                                                  kmp_int32 global_tid);
+
+KMP_EXPORT kmp_int32 __kmpc_single(ident_t *, kmp_int32 global_tid);
+KMP_EXPORT void __kmpc_end_single(ident_t *, kmp_int32 global_tid);
+
+KMP_EXPORT void KMPC_FOR_STATIC_INIT(ident_t *loc, kmp_int32 global_tid,
+                                     kmp_int32 schedtype, kmp_int32 *plastiter,
+                                     kmp_int *plower, kmp_int *pupper,
+                                     kmp_int *pstride, kmp_int incr,
+                                     kmp_int chunk);
+
+KMP_EXPORT void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid);
+
+KMP_EXPORT void __kmpc_copyprivate(ident_t *loc, kmp_int32 global_tid,
+                                   size_t cpy_size, void *cpy_data,
+                                   void (*cpy_func)(void *, void *),
+                                   kmp_int32 didit);
+
+extern void KMPC_SET_NUM_THREADS(int arg);
+extern void KMPC_SET_DYNAMIC(int flag);
+extern void KMPC_SET_NESTED(int flag);
+
+/* OMP 3.0 tasking interface routines */
+KMP_EXPORT kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
+                                     kmp_task_t *new_task);
+KMP_EXPORT kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
+                                             kmp_int32 flags,
+                                             size_t sizeof_kmp_task_t,
+                                             size_t sizeof_shareds,
+                                             kmp_routine_entry_t task_entry);
+KMP_EXPORT kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
+                                                    kmp_int32 flags,
+                                                    size_t sizeof_kmp_task_t,
+                                                    size_t sizeof_shareds,
+                                                    kmp_routine_entry_t task_entry,
+                                                    kmp_int64 device_id);
+KMP_EXPORT void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
+                                          kmp_task_t *task);
+KMP_EXPORT void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
+                                             kmp_task_t *task);
+KMP_EXPORT kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
+                                           kmp_task_t *new_task);
+KMP_EXPORT kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid);
+
+KMP_EXPORT kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid,
+                                          int end_part);
+
+#if TASK_UNUSED
+void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task);
+void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
+                              kmp_task_t *task);
+#endif // TASK_UNUSED
+
+/* ------------------------------------------------------------------------ */
+
+KMP_EXPORT void __kmpc_taskgroup(ident_t *loc, int gtid);
+KMP_EXPORT void __kmpc_end_taskgroup(ident_t *loc, int gtid);
+
+KMP_EXPORT kmp_int32 __kmpc_omp_task_with_deps(
+    ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 ndeps,
+    kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias,
+    kmp_depend_info_t *noalias_dep_list);
+KMP_EXPORT void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid,
+                                     kmp_int32 ndeps,
+                                     kmp_depend_info_t *dep_list,
+                                     kmp_int32 ndeps_noalias,
+                                     kmp_depend_info_t *noalias_dep_list);
+extern kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
+                                bool serialize_immediate);
+
+KMP_EXPORT kmp_int32 __kmpc_cancel(ident_t *loc_ref, kmp_int32 gtid,
+                                   kmp_int32 cncl_kind);
+KMP_EXPORT kmp_int32 __kmpc_cancellationpoint(ident_t *loc_ref, kmp_int32 gtid,
+                                              kmp_int32 cncl_kind);
+KMP_EXPORT kmp_int32 __kmpc_cancel_barrier(ident_t *loc_ref, kmp_int32 gtid);
+KMP_EXPORT int __kmp_get_cancellation_status(int cancel_kind);
+
+KMP_EXPORT void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask);
+KMP_EXPORT void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask);
+KMP_EXPORT void __kmpc_taskloop(ident_t *loc, kmp_int32 gtid, kmp_task_t *task,
+                                kmp_int32 if_val, kmp_uint64 *lb,
+                                kmp_uint64 *ub, kmp_int64 st, kmp_int32 nogroup,
+                                kmp_int32 sched, kmp_uint64 grainsize,
+                                void *task_dup);
+KMP_EXPORT void *__kmpc_task_reduction_init(int gtid, int num_data, void *data);
+KMP_EXPORT void *__kmpc_taskred_init(int gtid, int num_data, void *data);
+KMP_EXPORT void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void *d);
+KMP_EXPORT void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid,
+                                                     int is_ws, int num,
+                                                     void *data);
+KMP_EXPORT void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws,
+                                              int num, void *data);
+KMP_EXPORT void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid,
+                                                    int is_ws);
+KMP_EXPORT kmp_int32 __kmpc_omp_reg_task_with_affinity(
+    ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 naffins,
+    kmp_task_affinity_info_t *affin_list);
+
+/* Lock interface routines (fast versions with gtid passed in) */
+KMP_EXPORT void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid,
+                                 void **user_lock);
+KMP_EXPORT void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid,
+                                      void **user_lock);
+KMP_EXPORT void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid,
+                                    void **user_lock);
+KMP_EXPORT void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid,
+                                         void **user_lock);
+KMP_EXPORT void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock);
+KMP_EXPORT void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid,
+                                     void **user_lock);
+KMP_EXPORT void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid,
+                                  void **user_lock);
+KMP_EXPORT void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid,
+                                       void **user_lock);
+KMP_EXPORT int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock);
+KMP_EXPORT int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid,
+                                     void **user_lock);
+
+KMP_EXPORT void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid,
+                                           void **user_lock, uintptr_t hint);
+KMP_EXPORT void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
+                                                void **user_lock,
+                                                uintptr_t hint);
+
+/* Interface to fast scalable reduce methods routines */
+
+KMP_EXPORT kmp_int32 __kmpc_reduce_nowait(
+    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
+    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
+    kmp_critical_name *lck);
+KMP_EXPORT void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
+                                         kmp_critical_name *lck);
+KMP_EXPORT kmp_int32 __kmpc_reduce(
+    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
+    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
+    kmp_critical_name *lck);
+KMP_EXPORT void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
+                                  kmp_critical_name *lck);
+
+/* Internal fast reduction routines */
+
+extern PACKED_REDUCTION_METHOD_T __kmp_determine_reduction_method(
+    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
+    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
+    kmp_critical_name *lck);
+
+// this function is for testing set/get/determine reduce method
+KMP_EXPORT kmp_int32 __kmp_get_reduce_method(void);
+
+KMP_EXPORT kmp_uint64 __kmpc_get_taskid();
+KMP_EXPORT kmp_uint64 __kmpc_get_parent_taskid();
+
+// C++ port
+// missing 'extern "C"' declarations
+
+KMP_EXPORT kmp_int32 __kmpc_in_parallel(ident_t *loc);
+KMP_EXPORT void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid);
+KMP_EXPORT void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
+                                        kmp_int32 num_threads);
+
+KMP_EXPORT void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
+                                      int proc_bind);
+KMP_EXPORT void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
+                                      kmp_int32 num_teams,
+                                      kmp_int32 num_threads);
+KMP_EXPORT void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc,
+                                  kmpc_micro microtask, ...);
+// Loop bounds description; a pointer to these is handed to
+// __kmpc_doacross_init() as `dims`, together with a num_dims count.
+struct kmp_dim { // loop bounds info cast to kmp_int64
+  kmp_int64 lo; // lower bound
+  kmp_int64 up; // upper bound
+  kmp_int64 st; // stride
+};
+KMP_EXPORT void __kmpc_doacross_init(ident_t *loc, kmp_int32 gtid,
+                                     kmp_int32 num_dims,
+                                     const struct kmp_dim *dims);
+KMP_EXPORT void __kmpc_doacross_wait(ident_t *loc, kmp_int32 gtid,
+                                     const kmp_int64 *vec);
+KMP_EXPORT void __kmpc_doacross_post(ident_t *loc, kmp_int32 gtid,
+                                     const kmp_int64 *vec);
+KMP_EXPORT void __kmpc_doacross_fini(ident_t *loc, kmp_int32 gtid);
+
+KMP_EXPORT void *__kmpc_threadprivate_cached(ident_t *loc, kmp_int32 global_tid,
+                                             void *data, size_t size,
+                                             void ***cache);
+
+// Symbols for MS mutual detection.
+extern int _You_must_link_with_exactly_one_OpenMP_library;
+extern int _You_must_link_with_Intel_OpenMP_library;
+#if KMP_OS_WINDOWS && (KMP_VERSION_MAJOR > 4)
+extern int _You_must_link_with_Microsoft_OpenMP_library;
+#endif
+
+// The routines below are not exported.
+// Consider making them 'static' in corresponding source files.
+void kmp_threadprivate_insert_private_data(int gtid, void *pc_addr,
+                                           void *data_addr, size_t pc_size);
+struct private_common *kmp_threadprivate_insert(int gtid, void *pc_addr,
+                                                void *data_addr,
+                                                size_t pc_size);
+void __kmp_threadprivate_resize_cache(int newCapacity);
+void __kmp_cleanup_threadprivate_caches();
+
+// ompc_, kmpc_ entries moved from omp.h.
+#if KMP_OS_WINDOWS
+#define KMPC_CONVENTION __cdecl
+#else
+#define KMPC_CONVENTION
+#endif
+
+// Definitions supplied here only when __OMP_H is not defined (presumably the
+// include guard of omp.h -- confirm), so that the ompc_/kmpc_ entry points
+// declared after this block can name omp_sched_t and kmp_affinity_mask_t.
+#ifndef __OMP_H
+typedef enum omp_sched_t {
+  omp_sched_static = 1,
+  omp_sched_dynamic = 2,
+  omp_sched_guided = 3,
+  omp_sched_auto = 4
+} omp_sched_t;
+// Opaque handle type taken (by pointer) by the kmpc_*_affinity_mask_proc()
+// entry points.
+typedef void *kmp_affinity_mask_t;
+#endif
+
+KMP_EXPORT void KMPC_CONVENTION ompc_set_max_active_levels(int);
+KMP_EXPORT void KMPC_CONVENTION ompc_set_schedule(omp_sched_t, int);
+KMP_EXPORT int KMPC_CONVENTION ompc_get_ancestor_thread_num(int);
+KMP_EXPORT int KMPC_CONVENTION ompc_get_team_size(int);
+KMP_EXPORT int KMPC_CONVENTION
+kmpc_set_affinity_mask_proc(int, kmp_affinity_mask_t *);
+KMP_EXPORT int KMPC_CONVENTION
+kmpc_unset_affinity_mask_proc(int, kmp_affinity_mask_t *);
+KMP_EXPORT int KMPC_CONVENTION
+kmpc_get_affinity_mask_proc(int, kmp_affinity_mask_t *);
+
+KMP_EXPORT void KMPC_CONVENTION kmpc_set_stacksize(int);
+KMP_EXPORT void KMPC_CONVENTION kmpc_set_stacksize_s(size_t);
+KMP_EXPORT void KMPC_CONVENTION kmpc_set_library(int);
+KMP_EXPORT void KMPC_CONVENTION kmpc_set_defaults(char const *);
+KMP_EXPORT void KMPC_CONVENTION kmpc_set_disp_num_buffers(int);
+
+// Kind of target-offload behaviour; this is the type of the global
+// __kmp_target_offload. NOTE(review): the enumerators presumably mirror the
+// OMP_TARGET_OFFLOAD values DISABLED / DEFAULT / MANDATORY -- confirm.
+enum kmp_target_offload_kind {
+  tgt_disabled = 0,
+  tgt_default = 1,
+  tgt_mandatory = 2
+};
+typedef enum kmp_target_offload_kind kmp_target_offload_kind_t;
+// Set via OMP_TARGET_OFFLOAD if specified, defaults to tgt_default otherwise
+extern kmp_target_offload_kind_t __kmp_target_offload;
+extern int __kmpc_get_target_offload();
+
+// Constants used in libomptarget
+#define KMP_DEVICE_DEFAULT -1 // This is libomptarget's default device.
+#define KMP_HOST_DEVICE -10 // This is what it is in libomptarget, go figure.
+#define KMP_DEVICE_ALL -11 // This is libomptarget's "all devices".
+
+// OMP Pause Resource
+
+// The following enum serves two purposes: it is the value stored in
+// __kmp_pause_status, and it is the internal equivalent of the
+// externally-visible omp_pause_resource_t, i.e. the `level` argument accepted
+// by __kmpc_pause_resource() and __kmp_pause_resource(). Each enumerator
+// therefore reads either as a current status or as a request.
+typedef enum kmp_pause_status_t {
+  kmp_not_paused = 0, // status is not paused, or, requesting resume
+  kmp_soft_paused = 1, // status is soft-paused, or, requesting soft pause
+  kmp_hard_paused = 2 // status is hard-paused, or, requesting hard pause
+} kmp_pause_status_t;
+
+// This stores the pause state of the runtime
+extern kmp_pause_status_t __kmp_pause_status;
+extern int __kmpc_pause_resource(kmp_pause_status_t level);
+extern int __kmp_pause_resource(kmp_pause_status_t level);
+// Soft resume sets __kmp_pause_status, and wakes up all threads.
+extern void __kmp_resume_if_soft_paused();
+// Hard resume: the only action taken is to flip __kmp_pause_status back to
+// kmp_not_paused. The library will appear to be uninitialized after a hard
+// pause, and the required initializations are left to be triggered by
+// subsequent OMP constructs.
+static inline void __kmp_resume_if_hard_paused() {
+  if (__kmp_pause_status != kmp_hard_paused)
+    return; // not hard-paused: nothing to undo
+  __kmp_pause_status = kmp_not_paused;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* KMP_H */
diff --git a/final/runtime/src/kmp_affinity.cpp b/final/runtime/src/kmp_affinity.cpp
new file mode 100644
index 0000000..372c300
--- /dev/null
+++ b/final/runtime/src/kmp_affinity.cpp
@@ -0,0 +1,5342 @@
+/*
+ * kmp_affinity.cpp -- affinity management
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+#include "kmp_affinity.h"
+#include "kmp_i18n.h"
+#include "kmp_io.h"
+#include "kmp_str.h"
+#include "kmp_wrapper_getpid.h"
+#if KMP_USE_HIER_SCHED
+#include "kmp_dispatch_hier.h"
+#endif
+
+// Store the real or imagined machine hierarchy here
+static hierarchy_info machine_hierarchy;
+
+// Tear down the file-local machine hierarchy by invoking its fini() method.
+void __kmp_cleanup_hierarchy() {
+  machine_hierarchy.fini();
+}
+
+// Copy the hierarchy description (depth, base_leaf_kids, skip_per_level) from
+// machine_hierarchy into the barrier state `thr_bar`, initializing or growing
+// the hierarchy first when `nproc` requires it.
+void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
+  // Lazy initialization on first use of the hierarchical barrier: the flag is
+  // still set when affinity is available but set to "none".
+  if (TCR_1(machine_hierarchy.uninitialized)) {
+    machine_hierarchy.init(NULL, nproc);
+  }
+
+  // Grow the hierarchy when the thread count exceeds what it was sized for.
+  if (machine_hierarchy.base_num_threads < nproc) {
+    machine_hierarchy.resize(nproc);
+  }
+
+  const kmp_uint32 levels = machine_hierarchy.depth;
+  KMP_DEBUG_ASSERT(levels > 0);
+
+  thr_bar->depth = levels;
+  thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0] - 1;
+  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
+}
+
+#if KMP_AFFINITY_SUPPORTED
+
+bool KMPAffinity::picked_api = false;
+
+// Class-specific allocation operators: KMPAffinity::Mask objects (single and
+// array forms) and KMPAffinity objects are obtained through __kmp_allocate()
+// and released through __kmp_free().
+void *KMPAffinity::Mask::operator new(size_t bytes) {
+  return __kmp_allocate(bytes);
+}
+
+void *KMPAffinity::Mask::operator new[](size_t bytes) {
+  return __kmp_allocate(bytes);
+}
+
+void KMPAffinity::Mask::operator delete(void *block) {
+  __kmp_free(block);
+}
+
+void KMPAffinity::Mask::operator delete[](void *block) {
+  __kmp_free(block);
+}
+
+void *KMPAffinity::operator new(size_t bytes) {
+  return __kmp_allocate(bytes);
+}
+
+void KMPAffinity::operator delete(void *block) {
+  __kmp_free(block);
+}
+
+// Select the affinity API implementation and publish it through
+// __kmp_affinity_dispatch. Once picked_api is set, further calls return
+// immediately.
+void KMPAffinity::pick_api() {
+  if (picked_api)
+    return;
+
+  KMPAffinity *chosen;
+#if KMP_USE_HWLOC
+  // Hwloc is used only when the user requested the Hwloc topology method and
+  // affinity is not explicitly disabled.
+  const bool want_hwloc =
+      __kmp_affinity_top_method == affinity_top_method_hwloc &&
+      __kmp_affinity_type != affinity_disabled;
+  if (want_hwloc)
+    chosen = new KMPHwlocAffinity();
+  else
+#endif
+    chosen = new KMPNativeAffinity();
+
+  __kmp_affinity_dispatch = chosen;
+  picked_api = true;
+}
+
+// Delete the object held in __kmp_affinity_dispatch (if any), reset the
+// pointer to NULL and clear picked_api. Does nothing when the pointer is
+// already NULL.
+void KMPAffinity::destroy_api() {
+  if (__kmp_affinity_dispatch == NULL)
+    return;
+
+  delete __kmp_affinity_dispatch;
+  __kmp_affinity_dispatch = NULL;
+  picked_api = false;
+}
+
+// Move `scan` forward to the terminating NUL of the string it points into.
+#define KMP_ADVANCE_SCAN(scan)                                                 \
+  while (*scan != '\0') {                                                      \
+    scan++;                                                                    \
+  }
+
+// Render an affinity mask into the caller's character array.
+// Output is a comma separated list of non-negative integers or integer
+// ranges, e.g. 1,2,3-5,7,9-15; a run of exactly two bits is written as two
+// separate numbers. A mask with no bits set yields the string "{<empty>}".
+// buf     - destination array (must be non-NULL)
+// buf_len - size of buf; asserted to be at least 40
+// mask    - mask to print (must be non-NULL)
+// Returns buf. Output stops early once fewer than two characters remain.
+char *__kmp_affinity_print_mask(char *buf, int buf_len,
+                                kmp_affin_mask_t *mask) {
+  KMP_ASSERT(buf);
+  KMP_ASSERT(buf_len >= 40);
+  KMP_ASSERT(mask);
+  char *scan = buf;
+  char *end = buf + buf_len - 1;
+
+  // An empty mask is rendered as a fixed placeholder.
+  if (mask->begin() == mask->end()) {
+    KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
+    KMP_ADVANCE_SCAN(scan);
+    KMP_ASSERT(scan <= end);
+    return buf;
+  }
+
+  bool need_comma = false;
+  int range_lo = mask->begin();
+  for (;;) {
+    // Grow the inclusive range [range_lo, range_hi] over consecutive set
+    // bits; next_bit ends up as whatever mask->next() reported after the run.
+    int range_hi = range_lo;
+    int next_bit = mask->next(range_lo);
+    while (next_bit == range_hi + 1 && next_bit != mask->end()) {
+      range_hi = next_bit;
+      next_bit = mask->next(next_bit);
+    }
+
+    // Every range except the first is preceded by a comma.
+    if (need_comma) {
+      KMP_SNPRINTF(scan, end - scan + 1, "%s", ",");
+      KMP_ADVANCE_SCAN(scan);
+    }
+    need_comma = true;
+
+    const int extra_bits = range_hi - range_lo;
+    if (extra_bits > 1) {
+      // Three or more contiguous bits: print as "lo-hi".
+      KMP_SNPRINTF(scan, end - scan + 1, "%d-%d", range_lo, range_hi);
+    } else {
+      // One or two contiguous bits: print them individually.
+      KMP_SNPRINTF(scan, end - scan + 1, "%d", range_lo);
+      KMP_ADVANCE_SCAN(scan);
+      if (extra_bits > 0) {
+        KMP_SNPRINTF(scan, end - scan + 1, ",%d", range_hi);
+      }
+    }
+    KMP_ADVANCE_SCAN(scan);
+
+    // Continue from the bit that ended the run; stop when the mask is
+    // exhausted or the buffer is nearly full.
+    range_lo = next_bit;
+    if (range_lo == mask->end() || end - scan < 2)
+      break;
+  }
+
+  // Check for overflow
+  KMP_ASSERT(scan <= end);
+  return buf;
+}
+#undef KMP_ADVANCE_SCAN
+
+// Render an affinity mask into a string buffer object, replacing whatever the
+// buffer held before.
+// Output is a comma separated list of non-negative integers or integer
+// ranges, e.g. 1,2,3-5,7,9-15; a run of exactly two bits is written as two
+// separate numbers. A mask with no bits set yields the string "{<empty>}".
+// Returns buf.
+kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
+                                           kmp_affin_mask_t *mask) {
+  KMP_ASSERT(buf);
+  KMP_ASSERT(mask);
+
+  __kmp_str_buf_clear(buf);
+
+  // An empty mask is rendered as a fixed placeholder.
+  if (mask->begin() == mask->end()) {
+    __kmp_str_buf_print(buf, "%s", "{<empty>}");
+    return buf;
+  }
+
+  bool need_comma = false;
+  int range_lo = mask->begin();
+  for (;;) {
+    // Grow the inclusive range [range_lo, range_hi] over consecutive set
+    // bits; next_bit ends up as whatever mask->next() reported after the run.
+    int range_hi = range_lo;
+    int next_bit = mask->next(range_lo);
+    while (next_bit == range_hi + 1 && next_bit != mask->end()) {
+      range_hi = next_bit;
+      next_bit = mask->next(next_bit);
+    }
+
+    // Every range except the first is preceded by a comma.
+    if (need_comma) {
+      __kmp_str_buf_print(buf, "%s", ",");
+    }
+    need_comma = true;
+
+    const int extra_bits = range_hi - range_lo;
+    if (extra_bits > 1) {
+      // Three or more contiguous bits: print as "lo-hi".
+      __kmp_str_buf_print(buf, "%d-%d", range_lo, range_hi);
+    } else {
+      // One or two contiguous bits: print them individually.
+      __kmp_str_buf_print(buf, "%d", range_lo);
+      if (extra_bits > 0) {
+        __kmp_str_buf_print(buf, ",%d", range_hi);
+      }
+    }
+
+    // Continue from the bit that ended the run, until the mask is exhausted.
+    range_lo = next_bit;
+    if (range_lo == mask->end())
+      break;
+  }
+  return buf;
+}
+
+// Clear `mask`, then set one bit per processor. With KMP_GROUP_AFFINITY and
+// more than one processor group, bit (i + group * bits-in-a-DWORD_PTR) is set
+// for each of the processors that __kmp_GetActiveProcessorCount() reports for
+// the group; otherwise bits 0 .. __kmp_xproc-1 are set.
+void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
+  KMP_CPU_ZERO(mask);
+
+#if KMP_GROUP_AFFINITY
+  if (__kmp_num_proc_groups > 1) {
+    KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
+    for (int group = 0; group < __kmp_num_proc_groups; group++) {
+      const int active = __kmp_GetActiveProcessorCount(group);
+      for (int i = 0; i < active; i++) {
+        KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
+      }
+    }
+    return;
+  }
+#endif /* KMP_GROUP_AFFINITY */
+
+  for (int proc = 0; proc < __kmp_xproc; proc++) {
+    KMP_CPU_SET(proc, mask);
+  }
+}
+
+// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
+// called to renumber the labels from [0..n] and place them into the child_num
+// vector of the address object.  This is done in case the labels used for
+// the children at one node of the hierarchy differ from those used for
+// another node at the same level.  Example:  suppose the machine has 2 nodes
+// with 2 packages each.  The first node contains packages 601 and 602, and
+// second node contains packages 603 and 604.  If we try to sort the table
+// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
+// because we are paying attention to the labels themselves, not the ordinal
+// child numbers.  By using the child numbers in the sort, the result is
+// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
+//
+// address2os - table of numAddrs entries; the depth of entry 0 is used for
+//              the whole table, and each entry's first.childNums is rewritten
+// numAddrs   - number of entries (asserted > 0 in debug builds)
+//
+// A child number changes only when a label differs from the label of the
+// previous entry at the same level, so the numbering follows the order in
+// which the entries appear in the table. For every entry, childNums slots
+// from depth up to Address::maxDepth are set to 0.
+static void __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
+                                             int numAddrs) {
+  KMP_DEBUG_ASSERT(numAddrs > 0);
+  int depth = address2os->first.depth;
+  // counts[l]    - child number currently being handed out at level l
+  // lastLabel[l] - label seen at level l in the previous entry
+  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
+  unsigned *lastLabel = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
+  int labCt;
+  // Entry 0 is child 0 at every level and seeds lastLabel.
+  for (labCt = 0; labCt < depth; labCt++) {
+    address2os[0].first.childNums[labCt] = counts[labCt] = 0;
+    lastLabel[labCt] = address2os[0].first.labels[labCt];
+  }
+  // Zero-pad the unused levels of entry 0, exactly as is done for every other
+  // entry below, so that all entries leave this routine in the same state.
+  for (; labCt < (int)Address::maxDepth; labCt++) {
+    address2os[0].first.childNums[labCt] = 0;
+  }
+  int i;
+  for (i = 1; i < numAddrs; i++) {
+    // Find the outermost level whose label changed relative to the previous
+    // entry: bump its counter and restart the counters of all deeper levels.
+    for (labCt = 0; labCt < depth; labCt++) {
+      if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
+        int labCt2;
+        for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
+          counts[labCt2] = 0;
+          lastLabel[labCt2] = address2os[i].first.labels[labCt2];
+        }
+        counts[labCt]++;
+        lastLabel[labCt] = address2os[i].first.labels[labCt];
+        break;
+      }
+    }
+    // Record the current counters as this entry's child numbers ...
+    for (labCt = 0; labCt < depth; labCt++) {
+      address2os[i].first.childNums[labCt] = counts[labCt];
+    }
+    // ... and zero the levels beyond depth.
+    for (; labCt < (int)Address::maxDepth; labCt++) {
+      address2os[i].first.childNums[labCt] = 0;
+    }
+  }
+  __kmp_free(lastLabel);
+  __kmp_free(counts);
+}
+
+// All of the __kmp_affinity_create_*_map() routines should set
+// __kmp_affinity_masks to a vector of affinity mask objects of length
+// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and return
+// the number of levels in the machine topology tree (zero if
+// __kmp_affinity_type == affinity_none).
+//
+// All of the __kmp_affinity_create_*_map() routines should set
+// *__kmp_affin_fullMask to the affinity mask for the initialization thread.
+// They need to save and restore the mask, and it could be needed later, so
+// saving it is just an optimization to avoid calling kmp_get_system_affinity()
+// again.
+kmp_affin_mask_t *__kmp_affin_fullMask = NULL;
+
+// Summary of the detected topology.  The __kmp_affinity_create_*_map()
+// routines in this file assign these before returning, including when
+// affinity is off or not supported.
+static int nCoresPerPkg, nPackages;
+static int __kmp_nThreadsPerCore;
+// NOTE(review): when KMP_DFLT_NTH_CORES is defined, __kmp_ncores is presumably
+// declared elsewhere with external linkage -- confirm.
+#ifndef KMP_DFLT_NTH_CORES
+static int __kmp_ncores;
+#endif
+// OS indices of the processing units, in the order the map routines found
+// them.  Allocated with __kmp_avail_proc entries by the map routines, which
+// assert (in debug builds) that it is still NULL beforehand.
+static int *__kmp_pu_os_idx = NULL;
+
+// Reports whether the machine looks uniform, i.e. whether the number of
+// available procs equals threads/core * cores/package * packages.
+//
+// This does not work when called from places which support arbitrarily many
+// levels in the machine topology map, i.e. the non-default cases in
+// __kmp_affinity_create_cpuinfo_map() and
+// __kmp_affinity_create_x2apicid_map().
+inline static bool __kmp_affinity_uniform_topology() {
+  const int modeledProcs = __kmp_nThreadsPerCore * nCoresPerPkg * nPackages;
+  return modeledProcs == __kmp_avail_proc;
+}
+
+// Prints the detailed machine topology map, i.e. the physical location of
+// each OS proc, as one informational message per table entry.
+//
+// address2os: table of len entries (topology address, OS proc id).
+// len:        number of entries to print.
+// depth:      number of label levels printed for each entry.
+// pkgLevel, coreLevel, threadLevel: index of the package / core / thread
+//             level within the labels.  The caller visible in this file
+//             passes -1 for a level that is not modeled.
+static void __kmp_affinity_print_topology(AddrUnsPair *address2os, int len,
+                                          int depth, int pkgLevel,
+                                          int coreLevel, int threadLevel) {
+  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
+  for (int idx = 0; idx < len; ++idx) {
+    AddrUnsPair *entry = &address2os[idx];
+    kmp_str_buf_t line;
+    __kmp_str_buf_init(&line);
+
+    for (int lvl = 0; lvl < depth; ++lvl) {
+      // Name the level first.  The order of these tests decides the name if
+      // two of the level indices happen to coincide.
+      if (lvl == threadLevel) {
+        __kmp_str_buf_print(&line, "%s ", KMP_I18N_STR(Thread));
+      } else if (lvl == coreLevel) {
+        __kmp_str_buf_print(&line, "%s ", KMP_I18N_STR(Core));
+      } else if (lvl == pkgLevel) {
+        __kmp_str_buf_print(&line, "%s ", KMP_I18N_STR(Package));
+      } else if (lvl > pkgLevel) {
+        // Any other level past the package index is printed as a numbered
+        // node.
+        __kmp_str_buf_print(&line, "%s_%d ", KMP_I18N_STR(Node),
+                            lvl - pkgLevel - 1);
+      } else {
+        __kmp_str_buf_print(&line, "L%d ", lvl);
+      }
+      // Then the label of this entry at that level.
+      __kmp_str_buf_print(&line, "%d ", entry->first.labels[lvl]);
+    }
+
+    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", entry->second, line.str);
+    __kmp_str_buf_free(&line);
+  }
+}
+
+#if KMP_USE_HWLOC
+
+// Prints the OS proc -> physical location map for a table built from hwloc,
+// one informational message per entry.
+//
+// addrP:  table of len entries (topology address, OS proc id).
+// depth:  number of labels stored in each address.
+// levels: one slot per potential level after the package, in the order
+//         [node,] [tile,] core, thread; a slot that is not > 0 means the
+//         level is absent from the labels and is skipped.
+static void __kmp_affinity_print_hwloc_tp(AddrUnsPair *addrP, int len,
+                                          int depth, int *levels) {
+  kmp_str_buf_t line;
+  __kmp_str_buf_init(&line);
+  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
+
+  for (int idx = 0; idx < len; ++idx) {
+    Address &where = addrP[idx].first;
+    __kmp_str_buf_print(&line, "%s %d ", KMP_I18N_STR(Package),
+                        where.labels[0]);
+
+    if (depth > 1) {
+      int lvl = 1; // cursor into levels[]; advances for every candidate level
+      int lab = 1; // cursor into labels[]; advances only for printed levels
+
+      // node level follows package
+      if (__kmp_numa_detected && levels[lvl++] > 0) {
+        __kmp_str_buf_print(&line, "%s %d ", KMP_I18N_STR(Node),
+                            where.labels[lab++]);
+      }
+      // tile level follows node if any, or package
+      if (__kmp_tile_depth > 0 && levels[lvl++] > 0) {
+        __kmp_str_buf_print(&line, "%s %d ", KMP_I18N_STR(Tile),
+                            where.labels[lab++]);
+      }
+      // core level follows
+      if (levels[lvl++] > 0) {
+        __kmp_str_buf_print(&line, "%s %d ", KMP_I18N_STR(Core),
+                            where.labels[lab++]);
+      }
+      // thread level is the last one
+      if (levels[lvl++] > 0) {
+        __kmp_str_buf_print(&line, "%s %d ", KMP_I18N_STR(Thread),
+                            where.labels[lab++]);
+      }
+      KMP_DEBUG_ASSERT(lab == depth);
+    }
+
+    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", addrP[idx].second, line.str);
+    __kmp_str_buf_clear(&line); // reuse the buffer for the next entry
+  }
+
+  __kmp_str_buf_free(&line);
+}
+
+// Per-parent maxima of active nodes / tiles / cores, recomputed by
+// __kmp_affinity_create_hwloc_map() while it walks the hwloc tree.
+static int nNodePerPkg, nTilePerPkg, nTilePerNode, nCorePerNode, nCorePerTile;
+
+// Removes the topology levels that are radix 1, i.e. levels at which every
+// entry carries the same label and which therefore offer no further
+// information about the topology.  The most common example is one thread
+// context per core: the extra thread context level offers no unique labels.
+// Level 0 (the package level) is never removed.
+//
+// addrP:  table of nTh addresses; labels are compacted in place and each
+//         entry's depth is reduced by one per removed level.
+// nTh:    number of entries in addrP.
+// depth:  number of levels currently modeled in addrP.
+// levels: per-level bookkeeping; a removed level is marked with a negative
+//         value and the slots of the levels below it are decremented by 1.
+// return value: the new depth of address2os
+static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *addrP, int nTh,
+                                                  int depth, int *levels) {
+  int new_depth = depth;
+  // Walk from the deepest level upwards; level 0 always stays.
+  for (int level = depth - 1; level > 0; --level) {
+    // Detect if this level is radix 1 (all labels equal to entry 0's label)
+    bool radix1_detected = true;
+    for (int i = 1; i < nTh; ++i) {
+      if (addrP[0].first.labels[level] != addrP[i].first.labels[level]) {
+        // There are differing label values for this level so it stays
+        radix1_detected = false;
+        break;
+      }
+    }
+    if (!radix1_detected)
+      continue;
+
+    // Radix 1 was detected
+    --new_depth;
+    levels[level] = -1; // mark level as not present in address2os array
+
+    // Close the gap by moving the labels of the deeper levels up one slot.
+    // Nothing moves when the removed level was the deepest one
+    // (level == new_depth).
+    for (int j = level; j < new_depth; ++j) {
+      for (int i = 0; i < nTh; ++i)
+        addrP[i].first.labels[j] = addrP[i].first.labels[j + 1];
+      levels[j + 1] -= 1;
+    }
+
+    // Exactly one level was removed, so every address loses exactly one
+    // level.  This must stay outside the shift loop above: decrementing once
+    // per shifted level would leave the stored depth smaller than new_depth
+    // whenever the removed level has more than one level below it.
+    for (int i = 0; i < nTh; ++i)
+      addrP[i].first.depth--;
+  }
+  return new_depth;
+}
+
+// Counts the objects of type 'type' that lie below 'obj' in the topology
+// tree.  E.g., if obj is a HWLOC_OBJ_PACKAGE object and type is HWLOC_OBJ_PU,
+// the result is the number of PUs under that package.
+//
+// obj must not be NULL: its type and logical index are read unconditionally.
+static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
+                                           hwloc_obj_type_t type) {
+  int count = 0;
+  // Start at the first object of the requested type below obj ...
+  hwloc_obj_t cur = hwloc_get_obj_below_by_type(
+      __kmp_hwloc_topology, obj->type, obj->logical_index, type, 0);
+  // ... and keep walking objects of that type for as long as obj is still
+  // their ancestor of obj's type.
+  while (cur != NULL &&
+         hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type,
+                                        cur) == obj) {
+    ++count;
+    cur = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, cur->type, cur);
+  }
+  return count;
+}
+
+// Recursively counts the objects at tree depth 'depth' in the subtree rooted
+// at o (o itself counts when it is at that depth).
+//
+// t: topology handle, only passed along to the recursive calls.
+// f: in/out; when *f is NULL on entry it receives the first matching object
+//    found.  It is never overwritten once non-NULL.
+// Returns the number of matching objects, 0 if there are none.
+static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t,
+                                               hwloc_obj_t o,
+                                               kmp_hwloc_depth_t depth,
+                                               hwloc_obj_t *f) {
+  if (o->depth != depth) {
+    // Not at the requested depth yet: total up the children's subtrees.
+    // An object without children (arity 0) contributes 0.
+    int total = 0;
+    for (unsigned child = 0; child < o->arity; ++child) {
+      total +=
+          __kmp_hwloc_count_children_by_depth(t, o->children[child], depth, f);
+    }
+    return total;
+  }
+  if (*f == NULL)
+    *f = o; // output first descendant found
+  return 1;
+}
+
+// Recursively counts the objects of type 'type' in the subtree rooted at o
+// (o itself counts when it is of that type; the search does not descend
+// below a match).
+//
+// t: topology handle, only passed along to the recursive calls.
+// f: in/out; when *f is NULL on entry it receives the first matching object
+//    found.  It is never overwritten once non-NULL.
+// Returns the number of matching objects, 0 if there are none.
+static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o,
+                                              hwloc_obj_type_t type,
+                                              hwloc_obj_t *f) {
+  if (hwloc_compare_types(o->type, type) != 0) {
+    // Different type: total up the children's subtrees.  An object without
+    // children (arity 0) contributes 0.
+    int total = 0;
+    for (unsigned child = 0; child < o->arity; ++child) {
+      total +=
+          __kmp_hwloc_count_children_by_type(t, o->children[child], type, f);
+    }
+    return total;
+  }
+  if (*f == NULL)
+    *f = o; // output first descendant found
+  return 1;
+}
+
+// Appends one table entry for every accessible PU found under obj and updates
+// the core/thread statistics.
+//
+// addrPair:         output table; entries are written starting at index
+//                   nActiveThreads.
+// nActiveThreads:   in/out running count of entries written so far.
+// num_active_cores: in/out, incremented for every core under obj that has at
+//                   least one accessible PU.
+// obj:              hwloc object whose cores are enumerated.
+// depth:            number of leading labels copied from 'labels'; the core
+//                   and PU ordinals are stored right after them.
+// labels:           labels of the levels above the core (package, etc.).
+//
+// Also records each PU's OS index in __kmp_pu_os_idx, counts active cores in
+// __kmp_ncores and raises __kmp_nThreadsPerCore to the largest number of
+// active PUs seen on one core.  Always returns 0.
+static int __kmp_hwloc_process_obj_core_pu(AddrUnsPair *addrPair,
+                                           int &nActiveThreads,
+                                           int &num_active_cores,
+                                           hwloc_obj_t obj, int depth,
+                                           int *labels) {
+  hwloc_topology_t &topo = __kmp_hwloc_topology;
+  hwloc_obj_t coreObj = NULL;
+  const int nCores =
+      __kmp_hwloc_count_children_by_type(topo, obj, HWLOC_OBJ_CORE, &coreObj);
+
+  for (int coreIdx = 0; coreIdx < nCores; ++coreIdx) {
+    KMP_DEBUG_ASSERT(coreObj != NULL);
+    hwloc_obj_t puObj = NULL;
+    const int nPUs =
+        __kmp_hwloc_count_children_by_type(topo, coreObj, HWLOC_OBJ_PU, &puObj);
+    int activeOnCore = 0;
+
+    for (int puIdx = 0; puIdx < nPUs; ++puIdx) {
+      KMP_DEBUG_ASSERT(puObj != NULL);
+      // Inactive (inaccessible) units are skipped.
+      if (KMP_CPU_ISSET(puObj->os_index, __kmp_affin_fullMask)) {
+        KA_TRACE(20,
+                 ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
+                  obj->os_index, obj->logical_index, coreObj->os_index,
+                  coreObj->logical_index, puObj->os_index,
+                  puObj->logical_index));
+        Address addr(depth + 2);
+        for (int lvl = 0; lvl < depth; ++lvl)
+          addr.labels[lvl] = labels[lvl]; // package, etc.
+        addr.labels[depth] = coreIdx; // core
+        addr.labels[depth + 1] = puIdx; // pu
+        addrPair[nActiveThreads] = AddrUnsPair(addr, puObj->os_index);
+        __kmp_pu_os_idx[nActiveThreads] = puObj->os_index;
+        ++nActiveThreads;
+        ++activeOnCore; // count active threads per core
+      }
+      puObj = puObj->next_cousin;
+    }
+
+    if (activeOnCore > 0) { // were there any active threads on the core?
+      ++__kmp_ncores; // count total active cores
+      ++num_active_cores; // count active cores under obj
+      if (__kmp_nThreadsPerCore < activeOnCore)
+        __kmp_nThreadsPerCore = activeOnCore; // calc maximum
+    }
+    coreObj = coreObj->next_cousin;
+  }
+  return 0;
+}
+
+// Probes the hwloc topology, starting from the first PU, for two optional
+// levels and records the findings in globals:
+//   - __kmp_numa_detected is set to TRUE when the PU's NUMA-node ancestor lies
+//     deeper in the tree than its package ancestor (the socket includes
+//     nodes).  In that case a requested granularity of "node" is remapped to
+//     "numa".
+//   - __kmp_tile_depth is set to the depth of the tile level when the PU's
+//     ancestor at that depth contains more than one core.  The tile level is
+//     taken at the unified L2 cache depth, or at the package depth if that is
+//     the larger value.
+// Returns 0 on success, or 1 if hwloc reports no PU at all.
+static int __kmp_hwloc_check_numa() {
+  hwloc_topology_t &tp = __kmp_hwloc_topology;
+  hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
+  int depth, l2cache_depth, package_depth;
+
+  // Get some PU
+  hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, 0);
+  if (hT == NULL) // something has gone wrong
+    return 1;
+
+  // check NUMA node below PACKAGE
+  hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
+  hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
+  KMP_DEBUG_ASSERT(hS != NULL);
+  // The assertion above compiles away in release builds, so hS is tested
+  // again here: without a package ancestor there is nothing for the node to
+  // be "below", and dereferencing a NULL hS must be avoided.
+  if (hN != NULL && hS != NULL && hN->depth > hS->depth) {
+    __kmp_numa_detected = TRUE; // socket includes node(s)
+    if (__kmp_affinity_gran == affinity_gran_node) {
+      __kmp_affinity_gran = affinity_gran_numa;
+    }
+  }
+
+  package_depth = hwloc_get_type_depth(tp, HWLOC_OBJ_PACKAGE);
+  l2cache_depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
+  // check tile, get object by depth because of multiple caches possible
+  depth = (l2cache_depth < package_depth) ? package_depth : l2cache_depth;
+  hL = hwloc_get_ancestor_obj_by_depth(tp, depth, hT);
+  hC = NULL; // receives the first core found; the value itself is not used
+  if (hL != NULL &&
+      __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1)
+    __kmp_tile_depth = depth; // tile consists of multiple cores
+  return 0;
+}
+
+// Builds the machine topology map from the hwloc topology tree.
+//
+// address2os: out; receives a newly allocated table with one entry per
+//             accessible PU (topology address, OS proc id), or NULL when no
+//             table is produced.
+// msg_id:     out; always reset to kmp_i18n_null by this routine.
+//
+// Returns the number of levels in the produced map: 0 when affinity is not
+// supported or __kmp_affinity_type == affinity_none (no table is returned),
+// 1 when only one thread context is available, otherwise the depth left
+// after radix-1 levels have been removed.
+//
+// Side effects: sets __kmp_ncores, __kmp_nThreadsPerCore, nCoresPerPkg,
+// nPackages and the nXPerY maxima, allocates __kmp_pu_os_idx, and may set
+// __kmp_affinity_gran_levels.
+static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
+                                           kmp_i18n_id_t *const msg_id) {
+  hwloc_topology_t &tp = __kmp_hwloc_topology; // shortcut of a long name
+  *address2os = NULL;
+  *msg_id = kmp_i18n_null;
+
+  // Save the affinity mask for the current thread.
+  kmp_affin_mask_t *oldMask;
+  KMP_CPU_ALLOC(oldMask);
+  __kmp_get_system_affinity(oldMask, TRUE);
+  __kmp_hwloc_check_numa();
+
+  if (!KMP_AFFINITY_CAPABLE()) {
+    // Hack to try and infer the machine topology using only the data
+    // available from hwloc on the current thread, and __kmp_xproc.
+    KMP_ASSERT(__kmp_affinity_type == affinity_none);
+
+    // hwloc_get_obj_by_type() yields NULL when the topology has no object of
+    // the requested type.  Fall back to 1 instead of dereferencing NULL in
+    // the helper, and never let a count of 0 reach the divisions below.
+    hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
+    nCoresPerPkg =
+        (o != NULL) ? __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE) : 1;
+    if (nCoresPerPkg < 1)
+      nCoresPerPkg = 1; // no CORE found under the package
+    o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0);
+    __kmp_nThreadsPerCore =
+        (o != NULL) ? __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU) : 1;
+    if (__kmp_nThreadsPerCore < 1)
+      __kmp_nThreadsPerCore = 1; // no PU found under the core
+    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
+    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
+    if (__kmp_affinity_verbose) {
+      KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
+      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+      if (__kmp_affinity_uniform_topology()) {
+        KMP_INFORM(Uniform, "KMP_AFFINITY");
+      } else {
+        KMP_INFORM(NonUniform, "KMP_AFFINITY");
+      }
+      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+                 __kmp_nThreadsPerCore, __kmp_ncores);
+    }
+    KMP_CPU_FREE(oldMask);
+    return 0;
+  }
+
+  int depth = 3;
+  int levels[5] = {0, 1, 2, 3, 4}; // package, [node,] [tile,] core, thread
+  int labels[3] = {0}; // package [,node] [,tile] - head of labels array
+  if (__kmp_numa_detected)
+    ++depth;
+  if (__kmp_tile_depth)
+    ++depth;
+
+  // Allocate the data structure to be returned.
+  AddrUnsPair *retval =
+      (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
+  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
+  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
+
+  // When affinity is off, this routine will still be called to set
+  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
+  // nCoresPerPkg, & nPackages.  Make sure all these vars are set
+  // correctly, and return if affinity is not enabled.
+
+  hwloc_obj_t socket, node, tile;
+  int nActiveThreads = 0;
+  int socket_id = 0;
+  // re-calculate globals to count only accessible resources
+  __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
+  nNodePerPkg = nTilePerPkg = nTilePerNode = nCorePerNode = nCorePerTile = 0;
+  for (socket = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0); socket != NULL;
+       socket = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, socket),
+      socket_id++) {
+    labels[0] = socket_id;
+    if (__kmp_numa_detected) {
+      int NN;
+      int n_active_nodes = 0;
+      node = NULL;
+      NN = __kmp_hwloc_count_children_by_type(tp, socket, HWLOC_OBJ_NUMANODE,
+                                              &node);
+      for (int node_id = 0; node_id < NN; ++node_id, node = node->next_cousin) {
+        labels[1] = node_id;
+        if (__kmp_tile_depth) {
+          // NUMA + tiles
+          int NT;
+          int n_active_tiles = 0;
+          tile = NULL;
+          NT = __kmp_hwloc_count_children_by_depth(tp, node, __kmp_tile_depth,
+                                                   &tile);
+          for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) {
+            labels[2] = tl_id;
+            int n_active_cores = 0;
+            __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
+                                            n_active_cores, tile, 3, labels);
+            if (n_active_cores) { // were there any active cores on the tile?
+              ++n_active_tiles; // count active tiles per node
+              if (n_active_cores > nCorePerTile)
+                nCorePerTile = n_active_cores; // calc maximum
+            }
+          }
+          if (n_active_tiles) { // were there any active tiles on the node?
+            ++n_active_nodes; // count active nodes per package
+            if (n_active_tiles > nTilePerNode)
+              nTilePerNode = n_active_tiles; // calc maximum
+          }
+        } else {
+          // NUMA, no tiles
+          int n_active_cores = 0;
+          __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
+                                          n_active_cores, node, 2, labels);
+          if (n_active_cores) { // were there any active cores on the node?
+            ++n_active_nodes; // count active nodes per package
+            if (n_active_cores > nCorePerNode)
+              nCorePerNode = n_active_cores; // calc maximum
+          }
+        }
+      }
+      if (n_active_nodes) { // were there any active nodes on the socket?
+        ++nPackages; // count total active packages
+        if (n_active_nodes > nNodePerPkg)
+          nNodePerPkg = n_active_nodes; // calc maximum
+      }
+    } else {
+      if (__kmp_tile_depth) {
+        // no NUMA, tiles
+        int NT;
+        int n_active_tiles = 0;
+        tile = NULL;
+        NT = __kmp_hwloc_count_children_by_depth(tp, socket, __kmp_tile_depth,
+                                                 &tile);
+        for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) {
+          labels[1] = tl_id;
+          int n_active_cores = 0;
+          __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
+                                          n_active_cores, tile, 2, labels);
+          if (n_active_cores) { // were there any active cores on the tile?
+            ++n_active_tiles; // count active tiles per package
+            if (n_active_cores > nCorePerTile)
+              nCorePerTile = n_active_cores; // calc maximum
+          }
+        }
+        if (n_active_tiles) { // were there any active tiles on the socket?
+          ++nPackages; // count total active packages
+          if (n_active_tiles > nTilePerPkg)
+            nTilePerPkg = n_active_tiles; // calc maximum
+        }
+      } else {
+        // no NUMA, no tiles
+        int n_active_cores = 0;
+        __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads, n_active_cores,
+                                        socket, 1, labels);
+        if (n_active_cores) { // were there any active cores on the socket?
+          ++nPackages; // count total active packages
+          if (n_active_cores > nCoresPerPkg)
+            nCoresPerPkg = n_active_cores; // calc maximum
+        }
+      }
+    }
+  }
+
+  // If there's only one thread context to bind to, return now.
+  KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc);
+  KMP_ASSERT(nActiveThreads > 0);
+  if (nActiveThreads == 1) {
+    __kmp_ncores = nPackages = 1;
+    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
+    if (__kmp_affinity_verbose) {
+      char buf[KMP_AFFIN_MASK_PRINT_LEN];
+      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
+
+      KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
+      if (__kmp_affinity_respect_mask) {
+        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
+      } else {
+        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
+      }
+      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+      KMP_INFORM(Uniform, "KMP_AFFINITY");
+      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+                 __kmp_nThreadsPerCore, __kmp_ncores);
+    }
+
+    if (__kmp_affinity_type == affinity_none) {
+      __kmp_free(retval);
+      KMP_CPU_FREE(oldMask);
+      return 0;
+    }
+
+    // Form an Address object which only includes the package level.
+    Address addr(1);
+    addr.labels[0] = retval[0].first.labels[0];
+    retval[0].first = addr;
+
+    if (__kmp_affinity_gran_levels < 0) {
+      __kmp_affinity_gran_levels = 0;
+    }
+
+    if (__kmp_affinity_verbose) {
+      __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
+    }
+
+    *address2os = retval;
+    KMP_CPU_FREE(oldMask);
+    return 1;
+  }
+
+  // Sort the table by physical Id.
+  qsort(retval, nActiveThreads, sizeof(*retval),
+        __kmp_affinity_cmp_Address_labels);
+
+  // Check to see if the machine topology is uniform: the product of the
+  // per-level maxima must equal the number of active threads.
+  int nPUs = nPackages * __kmp_nThreadsPerCore;
+  if (__kmp_numa_detected) {
+    if (__kmp_tile_depth) { // NUMA + tiles
+      nPUs *= (nNodePerPkg * nTilePerNode * nCorePerTile);
+    } else { // NUMA, no tiles
+      nPUs *= (nNodePerPkg * nCorePerNode);
+    }
+  } else {
+    if (__kmp_tile_depth) { // no NUMA, tiles
+      nPUs *= (nTilePerPkg * nCorePerTile);
+    } else { // no NUMA, no tiles
+      nPUs *= nCoresPerPkg;
+    }
+  }
+  unsigned uniform = (nPUs == nActiveThreads);
+
+  // Print the machine topology summary.
+  if (__kmp_affinity_verbose) {
+    char mask[KMP_AFFIN_MASK_PRINT_LEN];
+    __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
+    if (__kmp_affinity_respect_mask) {
+      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
+    } else {
+      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
+    }
+    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+    if (uniform) {
+      KMP_INFORM(Uniform, "KMP_AFFINITY");
+    } else {
+      KMP_INFORM(NonUniform, "KMP_AFFINITY");
+    }
+    if (__kmp_numa_detected) {
+      if (__kmp_tile_depth) { // NUMA + tiles
+        KMP_INFORM(TopologyExtraNoTi, "KMP_AFFINITY", nPackages, nNodePerPkg,
+                   nTilePerNode, nCorePerTile, __kmp_nThreadsPerCore,
+                   __kmp_ncores);
+      } else { // NUMA, no tiles
+        KMP_INFORM(TopologyExtraNode, "KMP_AFFINITY", nPackages, nNodePerPkg,
+                   nCorePerNode, __kmp_nThreadsPerCore, __kmp_ncores);
+      }
+    } else {
+      if (__kmp_tile_depth) { // no NUMA, tiles
+        KMP_INFORM(TopologyExtraTile, "KMP_AFFINITY", nPackages, nTilePerPkg,
+                   nCorePerTile, __kmp_nThreadsPerCore, __kmp_ncores);
+      } else { // no NUMA, no tiles
+        kmp_str_buf_t buf;
+        __kmp_str_buf_init(&buf);
+        __kmp_str_buf_print(&buf, "%d", nPackages);
+        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
+                   __kmp_nThreadsPerCore, __kmp_ncores);
+        __kmp_str_buf_free(&buf);
+      }
+    }
+  }
+
+  if (__kmp_affinity_type == affinity_none) {
+    // Only the table is released here; __kmp_pu_os_idx stays allocated.
+    __kmp_free(retval);
+    KMP_CPU_FREE(oldMask);
+    return 0;
+  }
+
+  int depth_full = depth; // number of levels before compressing
+  // Find any levels with radix 1, and remove them from the map
+  // (except for the package level).
+  depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth,
+                                                 levels);
+  KMP_DEBUG_ASSERT(__kmp_affinity_gran != affinity_gran_default);
+  if (__kmp_affinity_gran_levels < 0) {
+    // Set the granularity level based on what levels are modeled
+    // in the machine topology map.
+    __kmp_affinity_gran_levels = 0; // lowest level (e.g. fine)
+    if (__kmp_affinity_gran > affinity_gran_thread) {
+      for (int i = 1; i <= depth_full; ++i) {
+        if (__kmp_affinity_gran <= i) // only count deeper levels
+          break;
+        if (levels[depth_full - i] > 0)
+          __kmp_affinity_gran_levels++;
+      }
+    }
+    if (__kmp_affinity_gran > affinity_gran_package)
+      __kmp_affinity_gran_levels++; // e.g. granularity = group
+  }
+
+  if (__kmp_affinity_verbose)
+    __kmp_affinity_print_hwloc_tp(retval, nActiveThreads, depth, levels);
+
+  KMP_CPU_FREE(oldMask);
+  *address2os = retval;
+  return depth;
+}
+#endif // KMP_USE_HWLOC
+
+// Fallback used when the machine's processor topology cannot be retrieved, or
+// an error was encountered while retrieving it: forms a "flat" one-level
+// mapping of OS thread ids <-> processor ids in which every OS proc is
+// counted as its own package.
+//
+// address2os: out; receives a newly allocated table, or NULL when no table is
+//             produced.
+// msg_id:     out; always reset to kmp_i18n_null.
+//
+// Returns 0 (no table) when affinity is not supported or
+// __kmp_affinity_type == affinity_none, otherwise 1, the depth of the map.
+static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
+                                          kmp_i18n_id_t *const msg_id) {
+  *msg_id = kmp_i18n_null;
+  *address2os = NULL;
+
+  // Even if __kmp_affinity_type == affinity_none, this routine might still be
+  // called to set __kmp_ncores, as well as
+  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
+  if (!KMP_AFFINITY_CAPABLE()) {
+    KMP_ASSERT(__kmp_affinity_type == affinity_none);
+    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
+    __kmp_ncores = nPackages = __kmp_xproc;
+    if (__kmp_affinity_verbose) {
+      KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
+      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+      KMP_INFORM(Uniform, "KMP_AFFINITY");
+      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+                 __kmp_nThreadsPerCore, __kmp_ncores);
+    }
+    return 0;
+  }
+
+  // When affinity is off, this routine will still be called to set
+  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
+  // Make sure all these vars are set correctly before the affinity_none
+  // return below.
+  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
+  __kmp_ncores = nPackages = __kmp_avail_proc;
+  if (__kmp_affinity_verbose) {
+    char maskText[KMP_AFFIN_MASK_PRINT_LEN];
+    __kmp_affinity_print_mask(maskText, KMP_AFFIN_MASK_PRINT_LEN,
+                              __kmp_affin_fullMask);
+
+    KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
+    if (__kmp_affinity_respect_mask) {
+      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", maskText);
+    } else {
+      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", maskText);
+    }
+    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+    KMP_INFORM(Uniform, "KMP_AFFINITY");
+    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+               __kmp_nThreadsPerCore, __kmp_ncores);
+  }
+
+  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
+  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
+
+  int nFound = 0; // number of procs recorded so far
+  int osId;
+
+  if (__kmp_affinity_type == affinity_none) {
+    // Only the OS index list is needed; no table is returned.
+    KMP_CPU_SET_ITERATE(osId, __kmp_affin_fullMask) {
+      if (KMP_CPU_ISSET(osId, __kmp_affin_fullMask))
+        __kmp_pu_os_idx[nFound++] = osId; // suppose indices are flat
+    }
+    return 0;
+  }
+
+  // Construct the data structure to be returned.
+  *address2os =
+      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
+  KMP_CPU_SET_ITERATE(osId, __kmp_affin_fullMask) {
+    // Only procs included in the machine model get an entry.
+    if (KMP_CPU_ISSET(osId, __kmp_affin_fullMask)) {
+      Address addr(1);
+      addr.labels[0] = osId;
+      __kmp_pu_os_idx[nFound] = osId; // suppose indices are flat
+      (*address2os)[nFound] = AddrUnsPair(addr, osId);
+      ++nFound;
+    }
+  }
+  if (__kmp_affinity_verbose) {
+    KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
+  }
+
+  if (__kmp_affinity_gran_levels < 0) {
+    // Only the package level is modeled in the machine topology map,
+    // so the #levels of granularity is either 0 or 1.
+    __kmp_affinity_gran_levels =
+        (__kmp_affinity_gran > affinity_gran_package) ? 1 : 0;
+  }
+  return 1;
+}
+
+#if KMP_GROUP_AFFINITY
+
+// If multiple Windows* OS processor groups exist, we can create a 2-level
+// topology map with the groups at level 0 and the individual procs at level 1.
+// This facilitates letting the threads float among all procs in a group,
+// if granularity=group (the default when there are multiple groups).
+//
+// address2os: out; receives a newly allocated table, or NULL on failure.
+// msg_id:     out; always reset to kmp_i18n_null.
+//
+// Returns -1 when affinity is not supported (the caller is then expected to
+// use the flat mapping), otherwise 2, the depth of the map.
+static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
+                                                kmp_i18n_id_t *const msg_id) {
+  *msg_id = kmp_i18n_null;
+  *address2os = NULL;
+
+  // If we aren't affinity capable, then return now.
+  // The flat mapping will be used.
+  if (!KMP_AFFINITY_CAPABLE()) {
+    // FIXME set *msg_id
+    return -1;
+  }
+
+  // Construct the data structure to be returned.
+  *address2os =
+      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
+  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
+  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
+
+  int nFound = 0; // number of procs recorded so far
+  int osId;
+  KMP_CPU_SET_ITERATE(osId, __kmp_affin_fullMask) {
+    // Only procs included in the machine model get an entry.
+    if (KMP_CPU_ISSET(osId, __kmp_affin_fullMask)) {
+      __kmp_pu_os_idx[nFound] = osId; // suppose indices are flat
+      Address addr(2);
+      // Level 0 is the group, level 1 the proc's position inside the group;
+      // a group spans as many procs as a DWORD_PTR has bits.
+      addr.labels[0] = osId / (CHAR_BIT * sizeof(DWORD_PTR));
+      addr.labels[1] = osId % (CHAR_BIT * sizeof(DWORD_PTR));
+      (*address2os)[nFound] = AddrUnsPair(addr, osId);
+      ++nFound;
+
+      if (__kmp_affinity_verbose) {
+        KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", osId, addr.labels[0],
+                   addr.labels[1]);
+      }
+    }
+  }
+
+  // A granularity level that was already chosen is left alone.
+  if (__kmp_affinity_gran_levels >= 0)
+    return 2;
+
+  if (__kmp_affinity_gran == affinity_gran_group) {
+    __kmp_affinity_gran_levels = 1;
+    return 2;
+  }
+  if ((__kmp_affinity_gran == affinity_gran_fine) ||
+      (__kmp_affinity_gran == affinity_gran_thread)) {
+    __kmp_affinity_gran_levels = 0;
+    return 2;
+  }
+
+  // Any other granularity cannot be honored with the group topology method.
+  // The name is worked out for a warning that is not emitted yet.
+  const char *gran_str = NULL;
+  if (__kmp_affinity_gran == affinity_gran_core) {
+    gran_str = "core";
+  } else if (__kmp_affinity_gran == affinity_gran_package) {
+    gran_str = "package";
+  } else if (__kmp_affinity_gran == affinity_gran_node) {
+    gran_str = "node";
+  } else {
+    KMP_ASSERT(0);
+  }
+
+  // Warning: can't use affinity granularity \"gran\" with group topology
+  // method, using "thread"
+  __kmp_affinity_gran_levels = 0;
+  return 2;
+}
+
+#endif /* KMP_GROUP_AFFINITY */
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+// Returns the smallest r >= 0 such that (1 << r) >= count, i.e. the number of
+// bits needed to tell 'count' distinct values apart.  Yields 0 for
+// count <= 1.
+static int __kmp_cpuid_mask_width(int count) {
+  int bits;
+  for (bits = 0; count > (1 << bits); ++bits) {
+    // keep widening until 'count' values fit
+  }
+  return bits;
+}
+
+// Per-thread-context record used while building the topology map from legacy
+// APIC ids: the raw values read for one OS proc plus the ids derived from
+// them.
+class apicThreadInfo {
+public:
+  unsigned osId; // param to __kmp_affinity_bind_thread
+  unsigned apicId; // from cpuid after binding
+  unsigned maxCoresPerPkg; // from cpuid after binding
+  unsigned maxThreadsPerPkg; // from cpuid after binding
+  unsigned pkgId; // inferred from above values
+  unsigned coreId; // inferred from above values
+  unsigned threadId; // inferred from above values
+};
+
+// qsort()-style comparator for apicThreadInfo records: orders by package id,
+// then core id, then thread id.  Returns -1, 0 or 1.
+static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
+                                                     const void *b) {
+  const apicThreadInfo *lhs = (const apicThreadInfo *)a;
+  const apicThreadInfo *rhs = (const apicThreadInfo *)b;
+
+  // The first key that differs decides the order.
+  if (lhs->pkgId != rhs->pkgId)
+    return (lhs->pkgId < rhs->pkgId) ? -1 : 1;
+  if (lhs->coreId != rhs->coreId)
+    return (lhs->coreId < rhs->coreId) ? -1 : 1;
+  if (lhs->threadId != rhs->threadId)
+    return (lhs->threadId < rhs->threadId) ? -1 : 1;
+  return 0;
+}
+
+// Build the topology map from legacy APIC ids (IA-32 / Intel(R) 64): bind to
+// each available OS proc in turn and decode cpuid leaves 1 and 4 on it.
+// Returns -1 with *msg_id set on failure, 0 with *address2os == NULL when only
+// the global counts were computed, else the depth of the new *address2os map.
+static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
+                                            kmp_i18n_id_t *const msg_id) {
+  kmp_cpuid buf;
+  *address2os = NULL;
+  *msg_id = kmp_i18n_null;
+
+  // Check if cpuid leaf 4 is supported.
+  __kmp_x86_cpuid(0, 0, &buf);
+  if (buf.eax < 4) {
+    *msg_id = kmp_i18n_str_NoLeaf4Support;
+    return -1;
+  }
+
+  // The algorithm used starts by setting the affinity to each available thread
+  // and retrieving info from the cpuid instruction, so if we are not capable of
+  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
+  // we need to do something else - use the defaults that we calculated from
+  // issuing cpuid without binding to each proc.
+  if (!KMP_AFFINITY_CAPABLE()) {
+    // Hack to try and infer the machine topology using only the data
+    // available from cpuid on the current thread, and __kmp_xproc.
+    KMP_ASSERT(__kmp_affinity_type == affinity_none);
+
+    // Get an upper bound on the number of threads per package using cpuid(1).
+    // On some OS/chip combinations where HT is supported by the chip but is
+    // disabled, this value will be 2 on a single core chip. Usually, it will be
+    // 2 if HT is enabled and 1 if HT is disabled.
+    __kmp_x86_cpuid(1, 0, &buf);
+    int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
+    if (maxThreadsPerPkg == 0) {
+      maxThreadsPerPkg = 1;
+    }
+
+    // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
+    // value.
+    //
+    // The author of cpu_count.cpp treated this only an upper bound on the
+    // number of cores, but I haven't seen any cases where it was greater than
+    // the actual number of cores, so we will treat it as exact in this block of
+    // code.
+    //
+    // First, we need to check if cpuid(4) is supported on this chip. To see if
+    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
+    // greater.
+    __kmp_x86_cpuid(0, 0, &buf);
+    if (buf.eax >= 4) {
+      __kmp_x86_cpuid(4, 0, &buf);
+      nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
+    } else {
+      nCoresPerPkg = 1;
+    }
+
+    // There is no way to reliably tell if HT is enabled without issuing the
+    // cpuid instruction from every thread, and correlating the cpuid info, so
+    // if the machine is not affinity capable, we assume that HT is off. We have
+    // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
+    // does not support HT.
+    //
+    // - Older OSes are usually found on machines with older chips, which do not
+    //   support HT.
+    // - The performance penalty for mistakenly identifying a machine as HT when
+    //   it isn't (which results in blocktime being incorrectly set to 0) is
+    //   greater than the penalty for mistakenly identifying a machine as
+    //   being 1 thread/core when it is really HT enabled (which results in
+    //   blocktime being incorrectly set to a positive value).
+    __kmp_ncores = __kmp_xproc;
+    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
+    __kmp_nThreadsPerCore = 1;
+    if (__kmp_affinity_verbose) {
+      KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
+      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+      if (__kmp_affinity_uniform_topology()) {
+        KMP_INFORM(Uniform, "KMP_AFFINITY");
+      } else {
+        KMP_INFORM(NonUniform, "KMP_AFFINITY");
+      }
+      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+                 __kmp_nThreadsPerCore, __kmp_ncores);
+    }
+    return 0;
+  }
+
+  // From here on, we can assume that it is safe to call
+  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
+  // __kmp_affinity_type = affinity_none.
+
+  // Save the affinity mask for the current thread.
+  kmp_affin_mask_t *oldMask;
+  KMP_CPU_ALLOC(oldMask);
+  KMP_ASSERT(oldMask != NULL);
+  __kmp_get_system_affinity(oldMask, TRUE);
+
+  // Run through each of the available contexts, binding the current thread
+  // to it, and obtaining the pertinent information using the cpuid instr.
+  //
+  // The relevant information is:
+  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
+  //     has a unique Apic Id, which is of the form pkg# : core# : thread#.
+  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
+  //     of this field determines the width of the core# + thread# fields in the
+  //     Apic Id. It is also an upper bound on the number of threads per
+  //     package, but it has been verified that cases occur where it is not
+  //     exact. In particular, on certain OS/chip combinations where Intel(R)
+  //     Hyper-Threading Technology is supported by the chip but has been
+  //     disabled, the value of this field will be 2 (for a single core chip).
+  //     On other OS/chip combinations supporting Intel(R) Hyper-Threading
+  //     Technology, the value of this field will be 1 when Intel(R)
+  //     Hyper-Threading Technology is disabled and 2 when it is enabled.
+  // - Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4). The value
+  //     of this field (+1) determines the width of the core# field in the Apic
+  //     Id. The comments in "cpucount.cpp" say that this value is an upper
+  //     bound, but the IA-32 architecture manual says that it is exactly the
+  //     number of cores per package, and I haven't seen any case where it
+  //     wasn't.
+  //
+  // From this information, deduce the package Id, core Id, and thread Id,
+  // and set the corresponding fields in the apicThreadInfo struct.
+  unsigned i;
+  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
+      __kmp_avail_proc * sizeof(apicThreadInfo));
+  unsigned nApics = 0; // number of threadInfo entries filled in so far
+  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
+    // Skip this proc if it is not included in the machine model.
+    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
+      continue;
+    }
+    KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
+
+    __kmp_affinity_dispatch->bind_thread(i);
+    threadInfo[nApics].osId = i;
+
+    // The apic id and max threads per pkg come from cpuid(1).
+    __kmp_x86_cpuid(1, 0, &buf);
+    if (((buf.edx >> 9) & 1) == 0) {
+      __kmp_set_system_affinity(oldMask, TRUE);
+      __kmp_free(threadInfo);
+      KMP_CPU_FREE(oldMask);
+      *msg_id = kmp_i18n_str_ApicNotPresent;
+      return -1;
+    }
+    threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
+    threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
+    if (threadInfo[nApics].maxThreadsPerPkg == 0) {
+      threadInfo[nApics].maxThreadsPerPkg = 1;
+    }
+
+    // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
+    // value.
+    //
+    // First, we need to check if cpuid(4) is supported on this chip. To see if
+    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
+    // or greater.
+    __kmp_x86_cpuid(0, 0, &buf);
+    if (buf.eax >= 4) {
+      __kmp_x86_cpuid(4, 0, &buf);
+      threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
+    } else {
+      threadInfo[nApics].maxCoresPerPkg = 1;
+    }
+
+    // Infer the pkgId / coreId / threadId using only the info obtained locally.
+    int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
+    threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
+
+    int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
+    int widthT = widthCT - widthC;
+    if (widthT < 0) {
+      // I've never seen this one happen, but I suppose it could, if the cpuid
+      // instruction on a chip was really screwed up. Make sure to restore the
+      // affinity mask before returning.
+      __kmp_set_system_affinity(oldMask, TRUE);
+      __kmp_free(threadInfo);
+      KMP_CPU_FREE(oldMask);
+      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
+      return -1;
+    }
+
+    int maskC = (1 << widthC) - 1;
+    threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;
+
+    int maskT = (1 << widthT) - 1;
+    threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;
+
+    nApics++;
+  }
+
+  // We've collected all the info we need.
+  // Restore the old affinity mask for this thread.
+  __kmp_set_system_affinity(oldMask, TRUE);
+
+  // If there's only one thread context to bind to, form an Address object
+  // with depth 1 and return immediately (or, if affinity is off, set
+  // address2os to NULL and return).
+  //
+  // If it is configured to omit the package level when there is only a single
+  // package, the logic at the end of this routine won't work if there is only
+  // a single thread - it would try to form an Address object with depth 0.
+  KMP_ASSERT(nApics > 0);
+  if (nApics == 1) {
+    __kmp_ncores = nPackages = 1;
+    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
+    if (__kmp_affinity_verbose) {
+      char buf[KMP_AFFIN_MASK_PRINT_LEN];
+      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
+
+      KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
+      if (__kmp_affinity_respect_mask) {
+        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
+      } else {
+        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
+      }
+      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+      KMP_INFORM(Uniform, "KMP_AFFINITY");
+      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+                 __kmp_nThreadsPerCore, __kmp_ncores);
+    }
+
+    if (__kmp_affinity_type == affinity_none) {
+      __kmp_free(threadInfo);
+      KMP_CPU_FREE(oldMask);
+      return 0;
+    }
+
+    *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
+    Address addr(1);
+    addr.labels[0] = threadInfo[0].pkgId;
+    (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
+
+    if (__kmp_affinity_gran_levels < 0) {
+      __kmp_affinity_gran_levels = 0;
+    }
+
+    if (__kmp_affinity_verbose) {
+      __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
+    }
+
+    __kmp_free(threadInfo);
+    KMP_CPU_FREE(oldMask);
+    return 1;
+  }
+
+  // Sort the threadInfo table by physical Id.
+  qsort(threadInfo, nApics, sizeof(*threadInfo),
+        __kmp_affinity_cmp_apicThreadInfo_phys_id);
+
+  // The table is now sorted by pkgId / coreId / threadId, but we really don't
+  // know the radix of any of the fields. pkgId's may be sparsely assigned among
+  // the chips on a system. Although coreId's are usually assigned
+  // [0 .. coresPerPkg-1] and threadId's are usually assigned
+  // [0..threadsPerCore-1], we don't want to make any such assumptions.
+  //
+  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
+  // total # packages) are at this point - we want to determine that now. We
+  // only have an upper bound on the first two figures.
+  //
+  // We also perform a consistency check at this point: the values returned by
+  // the cpuid instruction for any thread bound to a given package had better
+  // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
+  nPackages = 1;
+  nCoresPerPkg = 1;
+  __kmp_nThreadsPerCore = 1;
+  unsigned nCores = 1; // distinct cores seen so far, across all packages
+
+  unsigned pkgCt = 1; // to determine radii
+  unsigned lastPkgId = threadInfo[0].pkgId;
+  unsigned coreCt = 1;
+  unsigned lastCoreId = threadInfo[0].coreId;
+  unsigned threadCt = 1;
+  unsigned lastThreadId = threadInfo[0].threadId;
+
+  // intra-pkg consistency checks
+  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
+  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
+
+  for (i = 1; i < nApics; i++) {
+    if (threadInfo[i].pkgId != lastPkgId) {
+      nCores++;
+      pkgCt++;
+      lastPkgId = threadInfo[i].pkgId;
+      if ((int)coreCt > nCoresPerPkg)
+        nCoresPerPkg = coreCt;
+      coreCt = 1;
+      lastCoreId = threadInfo[i].coreId;
+      if ((int)threadCt > __kmp_nThreadsPerCore)
+        __kmp_nThreadsPerCore = threadCt;
+      threadCt = 1;
+      lastThreadId = threadInfo[i].threadId;
+
+      // This is a different package, so go on to the next iteration without
+      // doing any consistency checks. Reset the consistency check vars, though.
+      prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
+      prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
+      continue;
+    }
+
+    if (threadInfo[i].coreId != lastCoreId) {
+      nCores++;
+      coreCt++;
+      lastCoreId = threadInfo[i].coreId;
+      if ((int)threadCt > __kmp_nThreadsPerCore)
+        __kmp_nThreadsPerCore = threadCt;
+      threadCt = 1;
+      lastThreadId = threadInfo[i].threadId;
+    } else if (threadInfo[i].threadId != lastThreadId) {
+      threadCt++;
+      lastThreadId = threadInfo[i].threadId;
+    } else {
+      __kmp_free(threadInfo);
+      KMP_CPU_FREE(oldMask);
+      *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
+      return -1;
+    }
+
+    // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
+    // fields agree between all the threads bound to a given package.
+    if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
+        (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
+      __kmp_free(threadInfo);
+      KMP_CPU_FREE(oldMask);
+      *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
+      return -1;
+    }
+  }
+  nPackages = pkgCt;
+  if ((int)coreCt > nCoresPerPkg)
+    nCoresPerPkg = coreCt;
+  if ((int)threadCt > __kmp_nThreadsPerCore)
+    __kmp_nThreadsPerCore = threadCt;
+
+  // When affinity is off, this routine will still be called to set
+  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
+  // Make sure all these vars are set correctly, and return now if affinity is
+  // not enabled.
+  __kmp_ncores = nCores;
+  if (__kmp_affinity_verbose) {
+    char buf[KMP_AFFIN_MASK_PRINT_LEN];
+    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
+
+    KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
+    if (__kmp_affinity_respect_mask) {
+      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
+    } else {
+      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
+    }
+    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+    if (__kmp_affinity_uniform_topology()) {
+      KMP_INFORM(Uniform, "KMP_AFFINITY");
+    } else {
+      KMP_INFORM(NonUniform, "KMP_AFFINITY");
+    }
+    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+               __kmp_nThreadsPerCore, __kmp_ncores);
+  }
+  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
+  KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);
+  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
+  for (i = 0; i < nApics; ++i) {
+    __kmp_pu_os_idx[i] = threadInfo[i].osId; // sorted topology order
+  }
+  if (__kmp_affinity_type == affinity_none) {
+    __kmp_free(threadInfo);
+    KMP_CPU_FREE(oldMask);
+    return 0;
+  }
+
+  // Now that we've determined the number of packages, the number of cores per
+  // package, and the number of threads per core, we can construct the data
+  // structure that is to be returned.
+  int pkgLevel = 0; // the package label is always emitted
+  int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1; // -1: level omitted
+  int threadLevel =
+      (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
+  unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
+
+  KMP_ASSERT(depth > 0);
+  *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
+
+  for (i = 0; i < nApics; ++i) {
+    Address addr(depth);
+    unsigned os = threadInfo[i].osId;
+    int d = 0;
+
+    if (pkgLevel >= 0) {
+      addr.labels[d++] = threadInfo[i].pkgId;
+    }
+    if (coreLevel >= 0) {
+      addr.labels[d++] = threadInfo[i].coreId;
+    }
+    if (threadLevel >= 0) {
+      addr.labels[d++] = threadInfo[i].threadId;
+    }
+    (*address2os)[i] = AddrUnsPair(addr, os);
+  }
+
+  if (__kmp_affinity_gran_levels < 0) {
+    // Set the granularity level based on what levels are modeled in the machine
+    // topology map.
+    __kmp_affinity_gran_levels = 0;
+    if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
+      __kmp_affinity_gran_levels++;
+    }
+    if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
+      __kmp_affinity_gran_levels++;
+    }
+    if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
+      __kmp_affinity_gran_levels++;
+    }
+  }
+
+  if (__kmp_affinity_verbose) {
+    __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
+                                  coreLevel, threadLevel);
+  }
+
+  __kmp_free(threadInfo);
+  KMP_CPU_FREE(oldMask);
+  return depth;
+}
+
+// Intel(R) microarchitecture code name Nehalem, Dunnington and later
+// architectures support a newer interface for specifying the x2APIC Ids,
+// based on cpuid leaf 11.
+//
+// Builds the machine topology map by binding the current thread to each
+// available OS proc in turn and decoding the x2APIC id reported by cpuid
+// leaf 11 on it. Once the thread has been rebound, its original affinity mask
+// is restored before returning, on success and on failure alike.
+//
+// address2os: set to NULL on entry; on a positive return it points to a newly
+//             allocated table with one (Address, OS proc id) pair per proc.
+// msg_id:     set to kmp_i18n_null on entry; on failure it identifies the
+//             message describing why the map could not be built.
+//
+// Returns -1 on failure, 0 when only the global counts (__kmp_ncores,
+// __kmp_nThreadsPerCore, nCoresPerPkg, nPackages) were computed because the
+// machine is not affinity capable or __kmp_affinity_type is affinity_none, and
+// otherwise the depth (number of labels per Address) of the returned map.
+static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
+                                              kmp_i18n_id_t *const msg_id) {
+  kmp_cpuid buf;
+  *address2os = NULL;
+  *msg_id = kmp_i18n_null;
+
+  // Check to see if cpuid leaf 11 is supported.
+  __kmp_x86_cpuid(0, 0, &buf);
+  if (buf.eax < 11) {
+    *msg_id = kmp_i18n_str_NoLeaf11Support;
+    return -1;
+  }
+  __kmp_x86_cpuid(11, 0, &buf);
+  if (buf.ebx == 0) {
+    *msg_id = kmp_i18n_str_NoLeaf11Support;
+    return -1;
+  }
+
+  // Find the number of levels in the machine topology. While we're at it, get
+  // the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will try to
+  // get more accurate values later by explicitly counting them, but get
+  // reasonable defaults now, in case we return early.
+  int level;
+  int threadLevel = -1;
+  int coreLevel = -1;
+  int pkgLevel = -1;
+  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
+
+  for (level = 0;; level++) {
+    if (level > 31) {
+      // FIXME: Hack for DPD200163180
+      //
+      // If level is big then something went wrong -> exiting
+      //
+      // There could actually be 32 valid levels in the machine topology, but so
+      // far, the only machine we have seen which does not exit this loop before
+      // iteration 32 has fubar x2APIC settings.
+      //
+      // For now, just reject this case based upon loop trip count.
+      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
+      return -1;
+    }
+    __kmp_x86_cpuid(11, level, &buf);
+    if (buf.ebx == 0) {
+      if (pkgLevel < 0) {
+        // Will infer nPackages from __kmp_xproc
+        pkgLevel = level;
+        level++;
+      }
+      break;
+    }
+    int kind = (buf.ecx >> 8) & 0xff;
+    if (kind == 1) {
+      // SMT level
+      threadLevel = level;
+      coreLevel = -1;
+      pkgLevel = -1;
+      __kmp_nThreadsPerCore = buf.ebx & 0xffff;
+      if (__kmp_nThreadsPerCore == 0) {
+        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
+        return -1;
+      }
+    } else if (kind == 2) {
+      // core level
+      coreLevel = level;
+      pkgLevel = -1;
+      nCoresPerPkg = buf.ebx & 0xffff;
+      if (nCoresPerPkg == 0) {
+        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
+        return -1;
+      }
+    } else {
+      if (level <= 0) {
+        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
+        return -1;
+      }
+      if (pkgLevel >= 0) {
+        continue;
+      }
+      pkgLevel = level;
+      nPackages = buf.ebx & 0xffff;
+      if (nPackages == 0) {
+        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
+        return -1;
+      }
+    }
+  }
+  int depth = level;
+
+  // In the above loop, "level" was counted from the finest level (usually
+  // thread) to the coarsest.  The caller expects that we will place the labels
+  // in (*address2os)[].first.labels[] in the inverse order, so we need to
+  // invert the vars saying which level means what.
+  if (threadLevel >= 0) {
+    threadLevel = depth - threadLevel - 1;
+  }
+  if (coreLevel >= 0) {
+    coreLevel = depth - coreLevel - 1;
+  }
+  KMP_DEBUG_ASSERT(pkgLevel >= 0);
+  pkgLevel = depth - pkgLevel - 1;
+
+  // The algorithm used starts by setting the affinity to each available thread
+  // and retrieving info from the cpuid instruction, so if we are not capable of
+  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
+  // we need to do something else - use the defaults that we calculated from
+  // issuing cpuid without binding to each proc.
+  if (!KMP_AFFINITY_CAPABLE()) {
+    // Hack to try and infer the machine topology using only the data
+    // available from cpuid on the current thread, and __kmp_xproc.
+    KMP_ASSERT(__kmp_affinity_type == affinity_none);
+
+    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
+    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
+    if (__kmp_affinity_verbose) {
+      KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
+      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+      if (__kmp_affinity_uniform_topology()) {
+        KMP_INFORM(Uniform, "KMP_AFFINITY");
+      } else {
+        KMP_INFORM(NonUniform, "KMP_AFFINITY");
+      }
+      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+                 __kmp_nThreadsPerCore, __kmp_ncores);
+    }
+    return 0;
+  }
+
+  // From here on, we can assume that it is safe to call
+  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
+  // __kmp_affinity_type = affinity_none.
+
+  // Save the affinity mask for the current thread.
+  kmp_affin_mask_t *oldMask;
+  KMP_CPU_ALLOC(oldMask);
+  __kmp_get_system_affinity(oldMask, TRUE);
+
+  // Allocate the data structure to be returned.
+  AddrUnsPair *retval =
+      (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
+
+  // Run through each of the available contexts, binding the current thread
+  // to it, and obtaining the pertinent information using the cpuid instr.
+  unsigned int proc;
+  int nApics = 0;
+  KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
+    // Skip this proc if it is not included in the machine model.
+    if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
+      continue;
+    }
+    KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
+
+    __kmp_affinity_dispatch->bind_thread(proc);
+
+    // Extract labels for each level in the machine topology map from Apic ID.
+    Address addr(depth);
+    int prev_shift = 0;
+
+    for (level = 0; level < depth; level++) {
+      __kmp_x86_cpuid(11, level, &buf);
+      unsigned apicId = buf.edx;
+      if (buf.ebx == 0) {
+        if (level != depth - 1) {
+          // This proc reports fewer levels than the one probed first. Undo
+          // the temporary binding and release the partially built table
+          // before reporting the failure.
+          __kmp_set_system_affinity(oldMask, TRUE);
+          __kmp_free(retval);
+          KMP_CPU_FREE(oldMask);
+          *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
+          return -1;
+        }
+        // Everything above the last reported shift is the coarsest label.
+        addr.labels[depth - level - 1] = apicId >> prev_shift;
+        level++;
+        break;
+      }
+      // eax[4:0] is the number of low x2APIC id bits that belong to this
+      // level and the finer ones; strip the finer levels' bits to obtain the
+      // label for this level.
+      int shift = buf.eax & 0x1f;
+      int mask = (1 << shift) - 1;
+      addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
+      prev_shift = shift;
+    }
+    if (level != depth) {
+      // Defensive check; same cleanup as the inconsistency case above.
+      __kmp_set_system_affinity(oldMask, TRUE);
+      __kmp_free(retval);
+      KMP_CPU_FREE(oldMask);
+      *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
+      return -1;
+    }
+
+    retval[nApics] = AddrUnsPair(addr, proc);
+    nApics++;
+  }
+
+  // We've collected all the info we need.
+  // Restore the old affinity mask for this thread.
+  __kmp_set_system_affinity(oldMask, TRUE);
+
+  // If there's only one thread context to bind to, return now.
+  KMP_ASSERT(nApics > 0);
+  if (nApics == 1) {
+    __kmp_ncores = nPackages = 1;
+    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
+    if (__kmp_affinity_verbose) {
+      char buf[KMP_AFFIN_MASK_PRINT_LEN];
+      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
+
+      KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
+      if (__kmp_affinity_respect_mask) {
+        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
+      } else {
+        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
+      }
+      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+      KMP_INFORM(Uniform, "KMP_AFFINITY");
+      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
+                 __kmp_nThreadsPerCore, __kmp_ncores);
+    }
+
+    if (__kmp_affinity_type == affinity_none) {
+      __kmp_free(retval);
+      KMP_CPU_FREE(oldMask);
+      return 0;
+    }
+
+    // Form an Address object which only includes the package level.
+    Address addr(1);
+    addr.labels[0] = retval[0].first.labels[pkgLevel];
+    retval[0].first = addr;
+
+    if (__kmp_affinity_gran_levels < 0) {
+      __kmp_affinity_gran_levels = 0;
+    }
+
+    if (__kmp_affinity_verbose) {
+      __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
+    }
+
+    *address2os = retval;
+    KMP_CPU_FREE(oldMask);
+    return 1;
+  }
+
+  // Sort the table by physical Id.
+  qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
+
+  // Find the radix at each of the levels.
+  unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
+  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
+  unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
+  unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
+  for (level = 0; level < depth; level++) {
+    totals[level] = 1;
+    maxCt[level] = 1;
+    counts[level] = 1;
+    last[level] = retval[0].first.labels[level];
+  }
+
+  // From here on, the iteration variable "level" indexes
+  // (*address2os)[].first.labels[] directly, so it runs from the coarsest
+  // level (index 0) to the finest - the reverse of the cpuid leaf 11
+  // enumeration order used in the previous loops.
+  for (proc = 1; (int)proc < nApics; proc++) {
+    int level;
+    for (level = 0; level < depth; level++) {
+      if (retval[proc].first.labels[level] != last[level]) {
+        int j;
+        for (j = level + 1; j < depth; j++) {
+          totals[j]++;
+          counts[j] = 1;
+          // The line below causes printing incorrect topology information in
+          // case the max value for some level (maxCt[level]) is encountered
+          // earlier than some less value while going through the array. For
+          // example, let pkg0 has 4 cores and pkg1 has 2 cores. Then
+          // maxCt[1] == 2
+          // whereas it must be 4.
+          // TODO!!! Check if it can be commented safely
+          // maxCt[j] = 1;
+          last[j] = retval[proc].first.labels[j];
+        }
+        totals[level]++;
+        counts[level]++;
+        if (counts[level] > maxCt[level]) {
+          maxCt[level] = counts[level];
+        }
+        last[level] = retval[proc].first.labels[level];
+        break;
+      } else if (level == depth - 1) {
+        // Every label matches the previous entry: two procs share one id.
+        __kmp_free(last);
+        __kmp_free(maxCt);
+        __kmp_free(counts);
+        __kmp_free(totals);
+        __kmp_free(retval);
+        KMP_CPU_FREE(oldMask);
+        *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
+        return -1;
+      }
+    }
+  }
+
+  // When affinity is off, this routine will still be called to set
+  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
+  // Make sure all these vars are set correctly, and return if affinity is not
+  // enabled.
+  if (threadLevel >= 0) {
+    __kmp_nThreadsPerCore = maxCt[threadLevel];
+  } else {
+    __kmp_nThreadsPerCore = 1;
+  }
+  nPackages = totals[pkgLevel];
+
+  if (coreLevel >= 0) {
+    __kmp_ncores = totals[coreLevel];
+    nCoresPerPkg = maxCt[coreLevel];
+  } else {
+    __kmp_ncores = nPackages;
+    nCoresPerPkg = 1;
+  }
+
+  // Check to see if the machine topology is uniform
+  unsigned prod = maxCt[0];
+  for (level = 1; level < depth; level++) {
+    prod *= maxCt[level];
+  }
+  bool uniform = (prod == totals[level - 1]);
+
+  // Print the machine topology summary.
+  if (__kmp_affinity_verbose) {
+    char mask[KMP_AFFIN_MASK_PRINT_LEN];
+    __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
+
+    KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
+    if (__kmp_affinity_respect_mask) {
+      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
+    } else {
+      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
+    }
+    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+    if (uniform) {
+      KMP_INFORM(Uniform, "KMP_AFFINITY");
+    } else {
+      KMP_INFORM(NonUniform, "KMP_AFFINITY");
+    }
+
+    kmp_str_buf_t buf;
+    __kmp_str_buf_init(&buf);
+
+    __kmp_str_buf_print(&buf, "%d", totals[0]);
+    for (level = 1; level <= pkgLevel; level++) {
+      __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
+    }
+    KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
+               __kmp_nThreadsPerCore, __kmp_ncores);
+
+    __kmp_str_buf_free(&buf);
+  }
+  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
+  KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
+  // Record the OS proc ids in sorted topology order.
+  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
+  for (proc = 0; (int)proc < nApics; ++proc) {
+    __kmp_pu_os_idx[proc] = retval[proc].second;
+  }
+  if (__kmp_affinity_type == affinity_none) {
+    __kmp_free(last);
+    __kmp_free(maxCt);
+    __kmp_free(counts);
+    __kmp_free(totals);
+    __kmp_free(retval);
+    KMP_CPU_FREE(oldMask);
+    return 0;
+  }
+
+  // Find any levels with radix 1, and remove them from the map
+  // (except for the package level).
+  int new_depth = 0;
+  for (level = 0; level < depth; level++) {
+    if ((maxCt[level] == 1) && (level != pkgLevel)) {
+      continue;
+    }
+    new_depth++;
+  }
+
+  // If we are removing any levels, allocate a new vector to return,
+  // and copy the relevant information to it.
+  if (new_depth != depth) {
+    AddrUnsPair *new_retval =
+        (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
+    for (proc = 0; (int)proc < nApics; proc++) {
+      Address addr(new_depth);
+      new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
+    }
+    int new_level = 0;
+    int newPkgLevel = -1;
+    int newCoreLevel = -1;
+    int newThreadLevel = -1;
+    for (level = 0; level < depth; level++) {
+      if ((maxCt[level] == 1) && (level != pkgLevel)) {
+        // Remove this level. Never remove the package level
+        continue;
+      }
+      if (level == pkgLevel) {
+        newPkgLevel = new_level;
+      }
+      if (level == coreLevel) {
+        newCoreLevel = new_level;
+      }
+      if (level == threadLevel) {
+        newThreadLevel = new_level;
+      }
+      for (proc = 0; (int)proc < nApics; proc++) {
+        new_retval[proc].first.labels[new_level] =
+            retval[proc].first.labels[level];
+      }
+      new_level++;
+    }
+
+    __kmp_free(retval);
+    retval = new_retval;
+    depth = new_depth;
+    pkgLevel = newPkgLevel;
+    coreLevel = newCoreLevel;
+    threadLevel = newThreadLevel;
+  }
+
+  if (__kmp_affinity_gran_levels < 0) {
+    // Set the granularity level based on what levels are modeled
+    // in the machine topology map.
+    __kmp_affinity_gran_levels = 0;
+    if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
+      __kmp_affinity_gran_levels++;
+    }
+    if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
+      __kmp_affinity_gran_levels++;
+    }
+    if (__kmp_affinity_gran > affinity_gran_package) {
+      __kmp_affinity_gran_levels++;
+    }
+  }
+
+  if (__kmp_affinity_verbose) {
+    __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, coreLevel,
+                                  threadLevel);
+  }
+
+  __kmp_free(last);
+  __kmp_free(maxCt);
+  __kmp_free(counts);
+  __kmp_free(totals);
+  KMP_CPU_FREE(oldMask);
+  *address2os = retval;
+  return depth;
+}
+
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+#define osIdIndex 0 // least significant sort key; the comparator stops here
+#define threadIdIndex 1
+#define coreIdIndex 2
+#define pkgIdIndex 3
+#define nodeIdIndex 4
+
+typedef unsigned *ProcCpuInfo; // presumably indexed by the *IdIndex values
+static unsigned maxIndex = pkgIdIndex; // most significant sort key index
+
+// qsort()-style comparator for two pointers to unsigned-array records.
+// Fields are compared from index maxIndex (most significant) down to
+// osIdIndex (least significant); the first differing field decides.
+static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
+                                                  const void *b) {
+  const unsigned *lhs = *(unsigned *const *)a;
+  const unsigned *rhs = *(unsigned *const *)b;
+  unsigned field = maxIndex;
+  while (true) {
+    if (lhs[field] != rhs[field])
+      return (lhs[field] < rhs[field]) ? -1 : 1;
+    // Stop explicitly at the last field: "field" is unsigned, so it cannot be
+    // decremented below osIdIndex to end the loop.
+    if (field == osIdIndex)
+      return 0;
+    field--;
+  }
+}
+
+#if KMP_USE_HIER_SCHED
+// Populate the per-layer hierarchy tables: the maximum number of units in each
+// layer (__kmp_hier_max_units) and the number of hardware threads spanned by
+// one unit of that layer (__kmp_hier_threads_per).
+static void __kmp_dispatch_set_hierarchy_values() {
+  // An L2 is modeled as belonging to a single core, except on Intel(R) Xeon
+  // Phi(TM) coprocessor formerly code-named Knights Landing (and later mic
+  // types), where two cores are counted per L2.
+  int cores_per_l2 = 1;
+#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
+  if (__kmp_mic_type >= mic3)
+    cores_per_l2 = 2;
+#endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
+
+  // Maximum number of units per layer: one L1 per core, one L3 and one NUMA
+  // node per package, and a single LOOP unit covering everything.
+  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] =
+      nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
+  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores;
+  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] =
+      __kmp_ncores / cores_per_l2;
+  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages;
+  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages;
+  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1;
+
+  // Number of hardware threads per THREAD/L1/L2/L3/NUMA/LOOP unit.
+  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;
+  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
+      __kmp_nThreadsPerCore;
+  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
+      cores_per_l2 * __kmp_nThreadsPerCore;
+  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] =
+      nCoresPerPkg * __kmp_nThreadsPerCore;
+  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] =
+      nCoresPerPkg * __kmp_nThreadsPerCore;
+  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] =
+      nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
+}
+
+// Map a thread id to the index of the hierarchy unit of the given layer type
+// that the thread belongs to (this thread's L1, this thread's L2, etc.).
+// The thread layer is indexed by tid itself and the loop layer has a single
+// unit, index 0.
+int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) {
+  KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST);
+  if (type == kmp_hier_layer_e::LAYER_THREAD)
+    return tid;
+  if (type == kmp_hier_layer_e::LAYER_LOOP)
+    return 0;
+
+  const int index = type + 1;
+  KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0);
+  // Thread ids at or beyond the number of hardware threads wrap around.
+  const int hw_threads =
+      __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
+  const int wrapped = (tid >= hw_threads) ? (tid % hw_threads) : tid;
+  const int unit = wrapped / __kmp_hier_threads_per[index];
+  return unit % __kmp_hier_max_units[index];
+}
+
+// Return how many units of layer t1 make up one unit of layer t2. t1 must not
+// be a higher layer than t2, and neither may be LAYER_LAST.
+int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) {
+  const int i1 = t1 + 1;
+  const int i2 = t2 + 1;
+  KMP_DEBUG_ASSERT(i1 <= i2);
+  KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST);
+  KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST);
+  KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0);
+  // Dividing threads-per-t2 by threads-per-t1 leaves the number of t1 units
+  // contained in a single t2 unit.
+  const int threads_per_t2 = __kmp_hier_threads_per[i2];
+  const int threads_per_t1 = __kmp_hier_threads_per[i1];
+  return threads_per_t2 / threads_per_t1;
+}
+#endif // KMP_USE_HIER_SCHED
+
+// Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
+// affinity map.
+//
+// Parameters:
+//   address2os - out: set to NULL on entry; on success with affinity enabled
+//                it receives a newly allocated table with one AddrUnsPair
+//                (topology address, OS proc id) per available proc.
+//   line       - out: line number of the file being parsed when an error is
+//                detected in the second pass; 0 for errors detected before
+//                that pass and after a successful parse.
+//   msg_id     - out: kmp_i18n_null on entry, otherwise the id of the message
+//                describing why -1 was returned.
+//   f          - the open cpuinfo file; it is scanned twice, so it must be
+//                seekable.
+//
+// Returns -1 on error, 0 if __kmp_affinity_type is affinity_none (the topology
+// globals are still set, but no map is built), otherwise the depth of the
+// addresses stored in *address2os (1 when only one proc is available).
+//
+// Side effects: updates maxIndex, __kmp_ncores, __kmp_nThreadsPerCore,
+// nCoresPerPkg and nPackages, allocates __kmp_pu_os_idx when more than one
+// proc is available, and may set __kmp_affinity_gran_levels.
+static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os,
+                                             int *line,
+                                             kmp_i18n_id_t *const msg_id,
+                                             FILE *f) {
+  *address2os = NULL;
+  *msg_id = kmp_i18n_null;
+
+  // Scan the file, and count the number of "processor" (osId) fields,
+  // and find the highest value of <n> for a node_<n> field.
+  char buf[256];
+  unsigned num_records = 0;
+  while (!feof(f)) {
+    buf[sizeof(buf) - 1] = 1;
+    if (!fgets(buf, sizeof(buf), f)) {
+      // Read errors presumably because of EOF
+      break;
+    }
+
+    char s1[] = "processor";
+    if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
+      num_records++;
+      continue;
+    }
+
+    // FIXME - this will match "node_<n> <garbage>"
+    unsigned level;
+    if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
+      if (nodeIdIndex + level >= maxIndex) {
+        maxIndex = nodeIdIndex + level;
+      }
+      continue;
+    }
+  }
+
+  // Check for empty file / no valid processor records, or too many. The number
+  // of records can't exceed the number of valid bits in the affinity mask.
+  if (num_records == 0) {
+    *line = 0;
+    *msg_id = kmp_i18n_str_NoProcRecords;
+    return -1;
+  }
+  if (num_records > (unsigned)__kmp_xproc) {
+    *line = 0;
+    *msg_id = kmp_i18n_str_TooManyProcRecords;
+    return -1;
+  }
+
+  // Set the file pointer back to the beginning, so that we can scan the file
+  // again, this time performing a full parse of the data. Allocate a vector of
+  // ProcCpuInfo object, where we will place the data. Adding an extra element
+  // at the end allows us to remove a lot of extra checks for termination
+  // conditions.
+  if (fseek(f, 0, SEEK_SET) != 0) {
+    *line = 0;
+    *msg_id = kmp_i18n_str_CantRewindCpuinfo;
+    return -1;
+  }
+
+  // Allocate the array of records to store the proc info in.  The dummy
+  // element at the end makes the logic in filling them out easier to code.
+  unsigned **threadInfo =
+      (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *));
+  unsigned i;
+  for (i = 0; i <= num_records; i++) {
+    threadInfo[i] =
+        (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
+  }
+
+// Free every record (including the dummy one) and the record table itself.
+// Note that this clobbers the loop variable i.
+#define CLEANUP_THREAD_INFO                                                    \
+  for (i = 0; i <= num_records; i++) {                                         \
+    __kmp_free(threadInfo[i]);                                                 \
+  }                                                                            \
+  __kmp_free(threadInfo);
+
+  // A value of UINT_MAX means that we didn't find the field
+  unsigned __index;
+
+// Mark every field of record p as "not found".
+#define INIT_PROC_INFO(p)                                                      \
+  for (__index = 0; __index <= maxIndex; __index++) {                          \
+    (p)[__index] = UINT_MAX;                                                   \
+  }
+
+  for (i = 0; i <= num_records; i++) {
+    INIT_PROC_INFO(threadInfo[i]);
+  }
+
+  // Second pass: fill in threadInfo[num_avail] field by field; a blank line
+  // (or EOF) terminates the record.
+  unsigned num_avail = 0;
+  *line = 0;
+  while (!feof(f)) {
+    // Create an inner scoping level, so that all the goto targets at the end of
+    // the loop appear in an outer scoping level. This avoids warnings about
+    // jumping past an initialization to a target in the same block.
+    {
+      // fgets() overwrites this sentinel with the terminating NUL only when
+      // the line fills the whole buffer.
+      buf[sizeof(buf) - 1] = 1;
+      bool long_line = false;
+      if (!fgets(buf, sizeof(buf), f)) {
+        // Read errors presumably because of EOF
+        // If there is valid data in threadInfo[num_avail], then fake
+        // a blank line to ensure that the last address gets parsed.
+        bool valid = false;
+        for (i = 0; i <= maxIndex; i++) {
+          if (threadInfo[num_avail][i] != UINT_MAX) {
+            valid = true;
+          }
+        }
+        if (!valid) {
+          break;
+        }
+        buf[0] = 0;
+      } else if (!buf[sizeof(buf) - 1]) {
+        // The line is longer than the buffer.  Set a flag and don't
+        // emit an error if we were going to ignore the line, anyway.
+        long_line = true;
+
+#define CHECK_LINE                                                             \
+  if (long_line) {                                                             \
+    CLEANUP_THREAD_INFO;                                                       \
+    *msg_id = kmp_i18n_str_LongLineCpuinfo;                                    \
+    return -1;                                                                 \
+  }
+      }
+      (*line)++;
+
+      char s1[] = "processor";
+      if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
+        CHECK_LINE;
+        char *p = strchr(buf + sizeof(s1) - 1, ':');
+        unsigned val;
+        if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
+          goto no_val;
+        if (threadInfo[num_avail][osIdIndex] != UINT_MAX)
+#if KMP_ARCH_AARCH64
+          // Handle the old AArch64 /proc/cpuinfo layout differently,
+          // it contains all of the 'processor' entries listed in a
+          // single 'Processor' section, therefore the normal looking
+          // for duplicates in that section will always fail.
+          num_avail++;
+#else
+          goto dup_field;
+#endif
+        threadInfo[num_avail][osIdIndex] = val;
+#if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
+        // On non-x86 Linux the package and core ids are read from sysfs
+        // instead of from "physical id" / "core id" lines of the file.
+        char path[256];
+        KMP_SNPRINTF(
+            path, sizeof(path),
+            "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
+            threadInfo[num_avail][osIdIndex]);
+        __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
+
+        KMP_SNPRINTF(path, sizeof(path),
+                     "/sys/devices/system/cpu/cpu%u/topology/core_id",
+                     threadInfo[num_avail][osIdIndex]);
+        __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
+        continue;
+#else
+      }
+      char s2[] = "physical id";
+      if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
+        CHECK_LINE;
+        char *p = strchr(buf + sizeof(s2) - 1, ':');
+        unsigned val;
+        if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
+          goto no_val;
+        if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX)
+          goto dup_field;
+        threadInfo[num_avail][pkgIdIndex] = val;
+        continue;
+      }
+      char s3[] = "core id";
+      if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
+        CHECK_LINE;
+        char *p = strchr(buf + sizeof(s3) - 1, ':');
+        unsigned val;
+        if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
+          goto no_val;
+        if (threadInfo[num_avail][coreIdIndex] != UINT_MAX)
+          goto dup_field;
+        threadInfo[num_avail][coreIdIndex] = val;
+        continue;
+#endif // KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
+      }
+      char s4[] = "thread id";
+      if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
+        CHECK_LINE;
+        char *p = strchr(buf + sizeof(s4) - 1, ':');
+        unsigned val;
+        if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
+          goto no_val;
+        if (threadInfo[num_avail][threadIdIndex] != UINT_MAX)
+          goto dup_field;
+        threadInfo[num_avail][threadIdIndex] = val;
+        continue;
+      }
+      unsigned level;
+      if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
+        CHECK_LINE;
+        // Look for the ':' from the start of the line. The "node_<n> id" key
+        // has no fixed length, and the sscanf above also accepts lines that
+        // are shorter than "thread id", so skipping sizeof(s4) - 1 characters
+        // could step past the terminating NUL and find a stale ':' left in buf
+        // by an earlier, longer line.
+        char *p = strchr(buf, ':');
+        unsigned val;
+        if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
+          goto no_val;
+        KMP_ASSERT(nodeIdIndex + level <= maxIndex);
+        if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX)
+          goto dup_field;
+        threadInfo[num_avail][nodeIdIndex + level] = val;
+        continue;
+      }
+
+      // We didn't recognize the leading token on the line. There are lots of
+      // leading tokens that we don't recognize - if the line isn't empty, go on
+      // to the next line.
+      if ((*buf != 0) && (*buf != '\n')) {
+        // If the line is longer than the buffer, read characters
+        // until we find a newline.
+        if (long_line) {
+          int ch;
+          while (((ch = fgetc(f)) != EOF) && (ch != '\n'))
+            ;
+        }
+        continue;
+      }
+
+      // A newline has signalled the end of the processor record.
+      // Check that there aren't too many procs specified.
+      if ((int)num_avail == __kmp_xproc) {
+        CLEANUP_THREAD_INFO;
+        *msg_id = kmp_i18n_str_TooManyEntries;
+        return -1;
+      }
+
+      // Check for missing fields.  The osId field must be there, and we
+      // currently require that the physical id field is specified, also.
+      if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
+        CLEANUP_THREAD_INFO;
+        *msg_id = kmp_i18n_str_MissingProcField;
+        return -1;
+      }
+      // NOTE(review): this tests record 0 rather than threadInfo[num_avail],
+      // so the physical id is effectively only required for the first record -
+      // confirm whether that is intended.
+      if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
+        CLEANUP_THREAD_INFO;
+        *msg_id = kmp_i18n_str_MissingPhysicalIDField;
+        return -1;
+      }
+
+      // Skip this proc if it is not included in the machine model.
+      if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex],
+                         __kmp_affin_fullMask)) {
+        INIT_PROC_INFO(threadInfo[num_avail]);
+        continue;
+      }
+
+      // We have a successful parse of this proc's info.
+      // Increment the counter, and prepare for the next proc.
+      num_avail++;
+      KMP_ASSERT(num_avail <= num_records);
+      INIT_PROC_INFO(threadInfo[num_avail]);
+    }
+    continue;
+
+  no_val:
+    CLEANUP_THREAD_INFO;
+    *msg_id = kmp_i18n_str_MissingValCpuinfo;
+    return -1;
+
+  dup_field:
+    CLEANUP_THREAD_INFO;
+    *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
+    return -1;
+  }
+  *line = 0;
+
+#if KMP_MIC && REDUCE_TEAM_SIZE
+  unsigned teamSize = 0;
+#endif // KMP_MIC && REDUCE_TEAM_SIZE
+
+  // check for num_records == __kmp_xproc ???
+
+  // If there's only one thread context to bind to, form an Address object with
+  // depth 1 and return immediately (or, if affinity is off, set address2os to
+  // NULL and return).
+  //
+  // If it is configured to omit the package level when there is only a single
+  // package, the logic at the end of this routine won't work if there is only a
+  // single thread - it would try to form an Address object with depth 0.
+  KMP_ASSERT(num_avail > 0);
+  KMP_ASSERT(num_avail <= num_records);
+  if (num_avail == 1) {
+    __kmp_ncores = 1;
+    __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
+    if (__kmp_affinity_verbose) {
+      if (!KMP_AFFINITY_CAPABLE()) {
+        KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
+        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+        KMP_INFORM(Uniform, "KMP_AFFINITY");
+      } else {
+        char buf[KMP_AFFIN_MASK_PRINT_LEN];
+        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+                                  __kmp_affin_fullMask);
+        KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
+        if (__kmp_affinity_respect_mask) {
+          KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
+        } else {
+          KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
+        }
+        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+        KMP_INFORM(Uniform, "KMP_AFFINITY");
+      }
+      int index;
+      kmp_str_buf_t buf;
+      __kmp_str_buf_init(&buf);
+      __kmp_str_buf_print(&buf, "1");
+      for (index = maxIndex - 1; index > pkgIdIndex; index--) {
+        __kmp_str_buf_print(&buf, " x 1");
+      }
+      KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
+      __kmp_str_buf_free(&buf);
+    }
+
+    if (__kmp_affinity_type == affinity_none) {
+      CLEANUP_THREAD_INFO;
+      return 0;
+    }
+
+    *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
+    Address addr(1);
+    addr.labels[0] = threadInfo[0][pkgIdIndex];
+    (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
+
+    if (__kmp_affinity_gran_levels < 0) {
+      __kmp_affinity_gran_levels = 0;
+    }
+
+    if (__kmp_affinity_verbose) {
+      __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
+    }
+
+    CLEANUP_THREAD_INFO;
+    return 1;
+  }
+
+  // Sort the threadInfo table by physical Id.
+  qsort(threadInfo, num_avail, sizeof(*threadInfo),
+        __kmp_affinity_cmp_ProcCpuInfo_phys_id);
+
+  // The table is now sorted by pkgId / coreId / threadId, but we really don't
+  // know the radix of any of the fields. pkgId's may be sparsely assigned among
+  // the chips on a system. Although coreId's are usually assigned
+  // [0 .. coresPerPkg-1] and threadId's are usually assigned
+  // [0..threadsPerCore-1], we don't want to make any such assumptions.
+  //
+  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
+  // total # packages) are at this point - we want to determine that now. We
+  // only have an upper bound on the first two figures.
+  //
+  // Per level: counts[] is the number of children seen so far under the
+  // current parent, maxCt[] the largest such count, totals[] the total number
+  // of distinct nodes, and lastId[] the id seen in the previous record.
+  unsigned *counts =
+      (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
+  unsigned *maxCt =
+      (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
+  unsigned *totals =
+      (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
+  unsigned *lastId =
+      (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
+
+  bool assign_thread_ids = false;
+  unsigned threadIdCt;
+  unsigned index;
+
+restart_radix_check:
+  threadIdCt = 0;
+
+  // Initialize the counter arrays with data from threadInfo[0].
+  if (assign_thread_ids) {
+    if (threadInfo[0][threadIdIndex] == UINT_MAX) {
+      threadInfo[0][threadIdIndex] = threadIdCt++;
+    } else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
+      threadIdCt = threadInfo[0][threadIdIndex] + 1;
+    }
+  }
+  for (index = 0; index <= maxIndex; index++) {
+    counts[index] = 1;
+    maxCt[index] = 1;
+    totals[index] = 1;
+    lastId[index] = threadInfo[0][index];
+  }
+
+  // Run through the rest of the OS procs.
+  for (i = 1; i < num_avail; i++) {
+    // Find the most significant index whose id differs from the id for the
+    // previous OS proc.
+    for (index = maxIndex; index >= threadIdIndex; index--) {
+      if (assign_thread_ids && (index == threadIdIndex)) {
+        // Auto-assign the thread id field if it wasn't specified.
+        if (threadInfo[i][threadIdIndex] == UINT_MAX) {
+          threadInfo[i][threadIdIndex] = threadIdCt++;
+        }
+        // Apparently the thread id field was specified for some entries and not
+        // others. Start the thread id counter off at the next higher thread id.
+        else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
+          threadIdCt = threadInfo[i][threadIdIndex] + 1;
+        }
+      }
+      if (threadInfo[i][index] != lastId[index]) {
+        // Run through all indices which are less significant, and reset the
+        // counts to 1. At all levels up to and including index, we need to
+        // increment the totals and record the last id.
+        unsigned index2;
+        for (index2 = threadIdIndex; index2 < index; index2++) {
+          totals[index2]++;
+          if (counts[index2] > maxCt[index2]) {
+            maxCt[index2] = counts[index2];
+          }
+          counts[index2] = 1;
+          lastId[index2] = threadInfo[i][index2];
+        }
+        counts[index]++;
+        totals[index]++;
+        lastId[index] = threadInfo[i][index];
+
+        if (assign_thread_ids && (index > threadIdIndex)) {
+
+#if KMP_MIC && REDUCE_TEAM_SIZE
+          // The default team size is the total #threads in the machine
+          // minus 1 thread for every core that has 3 or more threads.
+          teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
+#endif // KMP_MIC && REDUCE_TEAM_SIZE
+
+          // Restart the thread counter, as we are on a new core.
+          threadIdCt = 0;
+
+          // Auto-assign the thread id field if it wasn't specified.
+          if (threadInfo[i][threadIdIndex] == UINT_MAX) {
+            threadInfo[i][threadIdIndex] = threadIdCt++;
+          }
+
+          // Apparently the thread id field was specified for some entries and
+          // not others. Start the thread id counter off at the next higher
+          // thread id.
+          else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
+            threadIdCt = threadInfo[i][threadIdIndex] + 1;
+          }
+        }
+        break;
+      }
+    }
+    // The inner loop ran off the bottom without finding a differing id: this
+    // record has the same ids as the previous one at every level.
+    if (index < threadIdIndex) {
+      // If thread ids were specified, it is an error if they are not unique.
+      // Also, check that we haven't already restarted the loop (to be safe -
+      // shouldn't need to).
+      if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
+        __kmp_free(lastId);
+        __kmp_free(totals);
+        __kmp_free(maxCt);
+        __kmp_free(counts);
+        CLEANUP_THREAD_INFO;
+        *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
+        return -1;
+      }
+
+      // If the thread ids were not specified and we see entries that are
+      // duplicates, start the loop over and assign the thread ids manually.
+      assign_thread_ids = true;
+      goto restart_radix_check;
+    }
+  }
+
+#if KMP_MIC && REDUCE_TEAM_SIZE
+  // The default team size is the total #threads in the machine
+  // minus 1 thread for every core that has 3 or more threads.
+  teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
+#endif // KMP_MIC && REDUCE_TEAM_SIZE
+
+  // Fold the counts of the last group at each level into the maxima.
+  for (index = threadIdIndex; index <= maxIndex; index++) {
+    if (counts[index] > maxCt[index]) {
+      maxCt[index] = counts[index];
+    }
+  }
+
+  __kmp_nThreadsPerCore = maxCt[threadIdIndex];
+  nCoresPerPkg = maxCt[coreIdIndex];
+  nPackages = totals[pkgIdIndex];
+
+  // Check to see if the machine topology is uniform
+  unsigned prod = totals[maxIndex];
+  for (index = threadIdIndex; index < maxIndex; index++) {
+    prod *= maxCt[index];
+  }
+  bool uniform = (prod == totals[threadIdIndex]);
+
+  // When affinity is off, this routine will still be called to set
+  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
+  // Make sure all these vars are set correctly, and return now if affinity is
+  // not enabled.
+  __kmp_ncores = totals[coreIdIndex];
+
+  if (__kmp_affinity_verbose) {
+    if (!KMP_AFFINITY_CAPABLE()) {
+      KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
+      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+      if (uniform) {
+        KMP_INFORM(Uniform, "KMP_AFFINITY");
+      } else {
+        KMP_INFORM(NonUniform, "KMP_AFFINITY");
+      }
+    } else {
+      char buf[KMP_AFFIN_MASK_PRINT_LEN];
+      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+                                __kmp_affin_fullMask);
+      KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
+      if (__kmp_affinity_respect_mask) {
+        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
+      } else {
+        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
+      }
+      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
+      if (uniform) {
+        KMP_INFORM(Uniform, "KMP_AFFINITY");
+      } else {
+        KMP_INFORM(NonUniform, "KMP_AFFINITY");
+      }
+    }
+    kmp_str_buf_t buf;
+    __kmp_str_buf_init(&buf);
+
+    __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
+    for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
+      __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
+    }
+    KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
+               maxCt[threadIdIndex], __kmp_ncores);
+
+    __kmp_str_buf_free(&buf);
+  }
+
+#if KMP_MIC && REDUCE_TEAM_SIZE
+  // Set the default team size.
+  if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
+    __kmp_dflt_team_nth = teamSize;
+    KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting "
+                  "__kmp_dflt_team_nth = %d\n",
+                  __kmp_dflt_team_nth));
+  }
+#endif // KMP_MIC && REDUCE_TEAM_SIZE
+
+  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
+  KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc);
+  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
+  for (i = 0; i < num_avail; ++i) { // fill the os indices
+    __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex];
+  }
+
+  if (__kmp_affinity_type == affinity_none) {
+    __kmp_free(lastId);
+    __kmp_free(totals);
+    __kmp_free(maxCt);
+    __kmp_free(counts);
+    CLEANUP_THREAD_INFO;
+    return 0;
+  }
+
+  // Count the number of levels which have more nodes at that level than at the
+  // parent's level (with there being an implicit root node of the top level).
+  // This is equivalent to saying that there is at least one node at this level
+  // which has a sibling. These levels are in the map, and the package level is
+  // always in the map.
+  bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
+  for (index = threadIdIndex; index < maxIndex; index++) {
+    KMP_ASSERT(totals[index] >= totals[index + 1]);
+    inMap[index] = (totals[index] > totals[index + 1]);
+  }
+  inMap[maxIndex] = (totals[maxIndex] > 1);
+  inMap[pkgIdIndex] = true;
+
+  int depth = 0;
+  for (index = threadIdIndex; index <= maxIndex; index++) {
+    if (inMap[index]) {
+      depth++;
+    }
+  }
+  KMP_ASSERT(depth > 0);
+
+  // Construct the data structure that is to be returned.
+  *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * num_avail);
+  int pkgLevel = -1;
+  int coreLevel = -1;
+  int threadLevel = -1;
+
+  for (i = 0; i < num_avail; ++i) {
+    Address addr(depth);
+    unsigned os = threadInfo[i][osIdIndex];
+    int src_index;
+    int dst_index = 0;
+
+    // Copy the ids of the levels that are in the map, most significant level
+    // first, and remember where the package / core / thread levels land.
+    for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
+      if (!inMap[src_index]) {
+        continue;
+      }
+      addr.labels[dst_index] = threadInfo[i][src_index];
+      if (src_index == pkgIdIndex) {
+        pkgLevel = dst_index;
+      } else if (src_index == coreIdIndex) {
+        coreLevel = dst_index;
+      } else if (src_index == threadIdIndex) {
+        threadLevel = dst_index;
+      }
+      dst_index++;
+    }
+    (*address2os)[i] = AddrUnsPair(addr, os);
+  }
+
+  if (__kmp_affinity_gran_levels < 0) {
+    // Set the granularity level based on what levels are modeled
+    // in the machine topology map.
+    unsigned src_index;
+    __kmp_affinity_gran_levels = 0;
+    for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
+      if (!inMap[src_index]) {
+        continue;
+      }
+      switch (src_index) {
+      case threadIdIndex:
+        if (__kmp_affinity_gran > affinity_gran_thread) {
+          __kmp_affinity_gran_levels++;
+        }
+
+        break;
+      case coreIdIndex:
+        if (__kmp_affinity_gran > affinity_gran_core) {
+          __kmp_affinity_gran_levels++;
+        }
+        break;
+
+      case pkgIdIndex:
+        if (__kmp_affinity_gran > affinity_gran_package) {
+          __kmp_affinity_gran_levels++;
+        }
+        break;
+      }
+    }
+  }
+
+  if (__kmp_affinity_verbose) {
+    __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
+                                  coreLevel, threadLevel);
+  }
+
+  __kmp_free(inMap);
+  __kmp_free(lastId);
+  __kmp_free(totals);
+  __kmp_free(maxCt);
+  __kmp_free(counts);
+  CLEANUP_THREAD_INFO;
+  return depth;
+}
+
+// Build and return a table of affinity masks indexed by OS proc id. Procs
+// whose addresses are close to each other (as decided by Address::isClose with
+// __kmp_affinity_gran_levels) form a group, and every member of a group gets
+// the same mask: the union of the group's OS proc ids.
+//
+// address2os (numAddrs > 0 entries) is sorted in place and the first entry of
+// each group is flagged as its leader. On return *maxIndex holds the largest
+// OS proc id found and *numUnique the number of groups.
+static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex,
+                                            unsigned *numUnique,
+                                            AddrUnsPair *address2os,
+                                            unsigned numAddrs) {
+  KMP_ASSERT(numAddrs > 0);
+  const unsigned depth = address2os[0].first.depth;
+
+  // The table is indexed by OS proc id, so it is sized by the largest id.
+  unsigned maxOsId = 0;
+  for (unsigned n = 0; n < numAddrs; ++n) {
+    if (address2os[n].second > maxOsId) {
+      maxOsId = address2os[n].second;
+    }
+  }
+  kmp_affin_mask_t *osId2Mask;
+  KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1));
+
+  // Sorting by physical order places all threads of the same
+  // core/package/node in consecutive entries.
+  qsort(address2os, numAddrs, sizeof(*address2os),
+        __kmp_affinity_cmp_Address_labels);
+
+  KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
+  if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
+    KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
+  }
+  if (__kmp_affinity_gran_levels >= (int)depth) {
+    const bool warn =
+        __kmp_affinity_verbose ||
+        (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none));
+    if (warn) {
+      KMP_WARNING(AffThreadsMayMigrate);
+    }
+  }
+
+  // Walk the sorted table one group at a time. The union of the current
+  // group's OS proc ids is accumulated in sum; when the group ends (an entry
+  // that is not close to the leader, or the end of the table) sum is copied
+  // into the table slot of every member.
+  unsigned unique = 0;
+  unsigned groupBegin = 0; // index of the first entry of the current group
+  unsigned leader = 0;
+  Address *leaderAddr = &(address2os[0].first);
+  kmp_affin_mask_t *sum;
+  KMP_CPU_ALLOC_ON_STACK(sum);
+  KMP_CPU_ZERO(sum);
+  KMP_CPU_SET(address2os[0].second, sum);
+  for (unsigned i = 1;; i++) {
+    const bool more = (i < numAddrs);
+    if (more &&
+        leaderAddr->isClose(address2os[i].first, __kmp_affinity_gran_levels)) {
+      // Still within the leader's group: add this proc to the union.
+      KMP_CPU_SET(address2os[i].second, sum);
+      continue;
+    }
+
+    // The group [groupBegin, i) is complete: publish its mask and mark its
+    // first entry as the leader.
+    for (unsigned j = groupBegin; j < i; j++) {
+      unsigned osId = address2os[j].second;
+      KMP_DEBUG_ASSERT(osId <= maxOsId);
+      kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
+      KMP_CPU_COPY(mask, sum);
+      address2os[j].first.leader = (j == leader);
+    }
+    groupBegin = i;
+    unique++;
+    if (!more) {
+      break;
+    }
+
+    // Entry i opens the next group and becomes its leader.
+    leader = i;
+    leaderAddr = &(address2os[i].first);
+    KMP_CPU_ZERO(sum);
+    KMP_CPU_SET(address2os[i].second, sum);
+  }
+  KMP_CPU_FREE_FROM_STACK(sum);
+
+  *maxIndex = maxOsId;
+  *numUnique = unique;
+  return osId2Mask;
+}
+
+// Stuff for the affinity proclist parsers.  It's easier to declare these vars
+// as file-static than to try and pass them through the calling sequence of
+// the recursive-descent OMP_PLACES parser.
+// Growable array of masks collected so far (see ADD_MASK).
+static kmp_affin_mask_t *newMasks;
+// Number of entries currently allocated in newMasks.
+static int numNewMasks;
+// Index of the next free entry in newMasks.
+static int nextNewMask;
+
+// Append a copy of _mask to the newMasks array. When the array is full
+// (nextNewMask >= numNewMasks) its capacity is doubled first: a new array is
+// allocated, the existing masks are copied over and the old array is freed.
+#define ADD_MASK(_mask)                                                        \
+  {                                                                            \
+    if (nextNewMask >= numNewMasks) {                                          \
+      int i;                                                                   \
+      numNewMasks *= 2;                                                        \
+      kmp_affin_mask_t *temp;                                                  \
+      KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks);                         \
+      for (i = 0; i < numNewMasks / 2; i++) {                                  \
+        kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);                    \
+        kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i);                       \
+        KMP_CPU_COPY(dest, src);                                               \
+      }                                                                        \
+      KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2);                  \
+      newMasks = temp;                                                         \
+    }                                                                          \
+    KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));               \
+    nextNewMask++;                                                             \
+  }
+
+// Append the mask stored for OS proc id _osId in the _osId2Mask table to
+// newMasks. The id is ignored if it is greater than _maxOsId or if its own bit
+// is not set in its table entry; in that case a warning is issued when
+// verbose mode is on, or when warnings are enabled and the affinity type is
+// not affinity_none.
+#define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId)                             \
+  {                                                                            \
+    if (((_osId) > _maxOsId) ||                                                \
+        (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) {     \
+      if (__kmp_affinity_verbose ||                                            \
+          (__kmp_affinity_warnings &&                                          \
+           (__kmp_affinity_type != affinity_none))) {                          \
+        KMP_WARNING(AffIgnoreInvalidProcID, _osId);                            \
+      }                                                                        \
+    } else {                                                                   \
+      ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));                            \
+    }                                                                          \
+  }
+
+// Parse an explicit proclist (ids, {sets}, start-end[:stride] ranges) into a
+// freshly allocated array *out_masks of *out_numMasks masks (NULL / 0 if none).
+static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
+                                            unsigned int *out_numMasks,
+                                            const char *proclist,
+                                            kmp_affin_mask_t *osId2Mask,
+                                            int maxOsId) {
+  int i;
+  const char *scan = proclist;
+  const char *next = proclist;
+
+  // Start with a small scratch mask vector in newMasks; ADD_MASK doubles its
+  // capacity (allocate, copy, free) whenever it fills up.
+  numNewMasks = 2;
+  KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
+  nextNewMask = 0;
+  kmp_affin_mask_t *sumMask; // union of the procs named inside one {set}
+  KMP_CPU_ALLOC(sumMask);
+  int setSize = 0; // number of valid procs in the current {set}
+
+  for (;;) {
+    int start, end, stride;
+
+    SKIP_WS(scan);
+    next = scan;
+    if (*next == '\0') {
+      break;
+    }
+
+    if (*next == '{') {
+      int num;
+      setSize = 0;
+      next++; // skip '{'
+      SKIP_WS(next);
+      scan = next;
+
+      // Read the first integer in the set.
+      KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist");
+      SKIP_DIGITS(next);
+      num = __kmp_str_to_int(scan, *next);
+      KMP_ASSERT2(num >= 0, "bad explicit proc list");
+
+      // Copy the mask for that osId to the sum (union) mask.
+      if ((num > maxOsId) ||
+          (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
+        if (__kmp_affinity_verbose ||
+            (__kmp_affinity_warnings &&
+             (__kmp_affinity_type != affinity_none))) {
+          KMP_WARNING(AffIgnoreInvalidProcID, num);
+        }
+        KMP_CPU_ZERO(sumMask);
+      } else {
+        KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
+        setSize = 1;
+      }
+
+      for (;;) {
+        // Check for end of set.
+        SKIP_WS(next);
+        if (*next == '}') {
+          next++; // skip '}'
+          break;
+        }
+
+        // Skip optional comma.
+        if (*next == ',') {
+          next++;
+        }
+        SKIP_WS(next);
+
+        // Read the next integer in the set.
+        scan = next;
+        KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
+
+        SKIP_DIGITS(next);
+        num = __kmp_str_to_int(scan, *next);
+        KMP_ASSERT2(num >= 0, "bad explicit proc list");
+
+        // Add the mask for that osId to the sum mask.
+        if ((num > maxOsId) ||
+            (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
+          if (__kmp_affinity_verbose ||
+              (__kmp_affinity_warnings &&
+               (__kmp_affinity_type != affinity_none))) {
+            KMP_WARNING(AffIgnoreInvalidProcID, num);
+          }
+        } else {
+          KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
+          setSize++;
+        }
+      }
+      if (setSize > 0) { // a set with no valid proc produces no mask
+        ADD_MASK(sumMask);
+      }
+
+      SKIP_WS(next);
+      if (*next == ',') {
+        next++;
+      }
+      scan = next;
+      continue;
+    }
+
+    // Read the first integer.
+    KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
+    SKIP_DIGITS(next);
+    start = __kmp_str_to_int(scan, *next);
+    KMP_ASSERT2(start >= 0, "bad explicit proc list");
+    SKIP_WS(next);
+
+    // If this isn't a range, then add a mask to the list and go on.
+    if (*next != '-') {
+      ADD_MASK_OSID(start, osId2Mask, maxOsId);
+
+      // Skip optional comma.
+      if (*next == ',') {
+        next++;
+      }
+      scan = next;
+      continue;
+    }
+
+    // This is a range.  Skip over the '-' and read in the 2nd int.
+    next++; // skip '-'
+    SKIP_WS(next);
+    scan = next;
+    KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
+    SKIP_DIGITS(next);
+    end = __kmp_str_to_int(scan, *next);
+    KMP_ASSERT2(end >= 0, "bad explicit proc list");
+
+    // Check for a stride parameter
+    stride = 1;
+    SKIP_WS(next);
+    if (*next == ':') {
+      // A stride is specified.  Skip over the ':' and read the 3rd int.
+      int sign = +1;
+      next++; // skip ':'
+      SKIP_WS(next);
+      scan = next;
+      if (*next == '-') {
+        sign = -1;
+        next++;
+        SKIP_WS(next);
+        scan = next;
+      }
+      KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
+      SKIP_DIGITS(next);
+      stride = __kmp_str_to_int(scan, *next);
+      KMP_ASSERT2(stride >= 0, "bad explicit proc list");
+      stride *= sign;
+    }
+
+    // Do some range checks.
+    KMP_ASSERT2(stride != 0, "bad explicit proc list");
+    if (stride > 0) {
+      KMP_ASSERT2(start <= end, "bad explicit proc list");
+    } else {
+      KMP_ASSERT2(start >= end, "bad explicit proc list");
+    }
+    KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
+
+    // Add the mask for each OS proc # to the list.
+    if (stride > 0) {
+      do {
+        ADD_MASK_OSID(start, osId2Mask, maxOsId);
+        start += stride;
+      } while (start <= end);
+    } else {
+      do {
+        ADD_MASK_OSID(start, osId2Mask, maxOsId);
+        start += stride;
+      } while (start >= end);
+    }
+
+    // Skip optional comma.
+    SKIP_WS(next);
+    if (*next == ',') {
+      next++;
+    }
+    scan = next;
+  }
+
+  *out_numMasks = nextNewMask;
+  KMP_CPU_FREE(sumMask); // released here so the empty-list return cannot leak
+  if (nextNewMask == 0) {
+    *out_masks = NULL;
+    KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
+    return;
+  }
+  KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
+  for (i = 0; i < nextNewMask; i++) {
+    kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
+    kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
+    KMP_CPU_COPY(dest, src);
+  }
+  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
+}
+
+/*-----------------------------------------------------------------------------
+Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
+places.  Again, here is the grammar:
+
+place_list := place
+place_list := place , place_list
+place := num
+place := place : num
+place := place : num : signed
+place := { subplace_list }
+place := ! place                  // (lowest priority)
+subplace_list := subplace
+subplace_list := subplace , subplace_list
+subplace := num
+subplace := num : num
+subplace := num : num : signed
+signed := num
+signed := + signed
+signed := - signed
+-----------------------------------------------------------------------------*/
+static void __kmp_process_subplace_list(const char **scan,
+                                        kmp_affin_mask_t *osId2Mask,
+                                        int maxOsId, kmp_affin_mask_t *tempMask,
+                                        int *setSize) {
+  const char *next; // one past the number currently being parsed
+  // Each pass parses one subplace at *scan and ORs its procs into tempMask,
+  for (;;) { // adding the number of procs accepted to *setSize.
+    int start, count, stride, i;
+
+    // Read in the starting proc id
+    SKIP_WS(*scan);
+    KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
+    next = *scan;
+    SKIP_DIGITS(next);
+    start = __kmp_str_to_int(*scan, *next);
+    KMP_ASSERT(start >= 0);
+    *scan = next;
+
+    // valid follow sets are ',' ':' and '}'
+    SKIP_WS(*scan);
+    if (**scan == '}' || **scan == ',') {
+      if ((start > maxOsId) ||
+          (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
+        if (__kmp_affinity_verbose ||
+            (__kmp_affinity_warnings &&
+             (__kmp_affinity_type != affinity_none))) {
+          KMP_WARNING(AffIgnoreInvalidProcID, start);
+        }
+      } else {
+        KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
+        (*setSize)++;
+      }
+      if (**scan == '}') {
+        break; // the closing '}' is left for the caller to consume
+      }
+      (*scan)++; // skip ','
+      continue;
+    }
+    KMP_ASSERT2(**scan == ':', "bad explicit places list");
+    (*scan)++; // skip ':'
+
+    // Read count parameter
+    SKIP_WS(*scan);
+    KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
+    next = *scan;
+    SKIP_DIGITS(next);
+    count = __kmp_str_to_int(*scan, *next);
+    KMP_ASSERT(count >= 0);
+    *scan = next;
+
+    // valid follow sets are ',' ':' and '}'
+    SKIP_WS(*scan);
+    if (**scan == '}' || **scan == ',') {
+      for (i = 0; i < count; i++) { // no stride given: consecutive proc ids
+        if ((start > maxOsId) ||
+            (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
+          if (__kmp_affinity_verbose ||
+              (__kmp_affinity_warnings &&
+               (__kmp_affinity_type != affinity_none))) {
+            KMP_WARNING(AffIgnoreInvalidProcID, start);
+          }
+          break; // don't proliferate warnings for large count
+        } else {
+          KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
+          start++;
+          (*setSize)++;
+        }
+      }
+      if (**scan == '}') {
+        break; // the closing '}' is left for the caller to consume
+      }
+      (*scan)++; // skip ','
+      continue;
+    }
+    KMP_ASSERT2(**scan == ':', "bad explicit places list");
+    (*scan)++; // skip ':'
+
+    // Read stride parameter
+    int sign = +1;
+    for (;;) { // any run of '+' / '-' signs; each '-' flips the sign
+      SKIP_WS(*scan);
+      if (**scan == '+') {
+        (*scan)++; // skip '+'
+        continue;
+      }
+      if (**scan == '-') {
+        sign *= -1;
+        (*scan)++; // skip '-'
+        continue;
+      }
+      break;
+    }
+    SKIP_WS(*scan);
+    KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
+    next = *scan;
+    SKIP_DIGITS(next);
+    stride = __kmp_str_to_int(*scan, *next);
+    KMP_ASSERT(stride >= 0);
+    *scan = next;
+    stride *= sign;
+
+    // valid follow sets are ',' and '}'
+    SKIP_WS(*scan);
+    if (**scan == '}' || **scan == ',') {
+      for (i = 0; i < count; i++) {
+        // A negative stride can step below 0; never index osId2Mask with that.
+        if ((start < 0) || (start > maxOsId) ||
+            (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
+          if (__kmp_affinity_verbose ||
+              (__kmp_affinity_warnings &&
+               (__kmp_affinity_type != affinity_none))) {
+            KMP_WARNING(AffIgnoreInvalidProcID, start);
+          }
+          break; // don't proliferate warnings for large count
+        } else {
+          KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
+          start += stride;
+          (*setSize)++;
+        }
+      }
+      if (**scan == '}') {
+        break; // the closing '}' is left for the caller to consume
+      }
+      (*scan)++; // skip ','
+      continue;
+    }
+
+    KMP_ASSERT2(0, "bad explicit places list");
+  }
+}
+
+static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
+                                int maxOsId, kmp_affin_mask_t *tempMask,
+                                int *setSize) {
+  // A place opens with '{' (subplace list), '!' (complement) or a digit.
+  SKIP_WS(*scan);
+  const char lead = **scan;
+  if (lead == '!') {
+    // Complement: parse the operand place first, then invert the mask.
+    (*scan)++; // skip '!'
+    __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
+    KMP_CPU_COMPLEMENT(maxOsId, tempMask);
+  } else if (lead == '{') {
+    // Braced list: the subplace parser has to stop on the closing '}'.
+    (*scan)++; // skip '{'
+    __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize);
+    KMP_ASSERT2(**scan == '}', "bad explicit places list");
+    (*scan)++; // skip '}'
+  } else if ((lead < '0') || (lead > '9')) {
+    KMP_ASSERT2(0, "bad explicit places list");
+  } else {
+    const char *numEnd = *scan; // will point one past the last digit
+    SKIP_DIGITS(numEnd);
+    int num = __kmp_str_to_int(*scan, *numEnd);
+    KMP_ASSERT(num >= 0);
+    if ((num <= maxOsId) &&
+        KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num))) {
+      KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
+      (*setSize)++;
+    } else if (__kmp_affinity_verbose ||
+               (__kmp_affinity_warnings &&
+                (__kmp_affinity_type != affinity_none))) {
+      KMP_WARNING(AffIgnoreInvalidProcID, num);
+    }
+    *scan = numEnd; // skip num
+  }
+}
+
+// Parse an OMP_PLACES list into a new mask array (*out_masks, *out_numMasks).
+void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
+                                      unsigned int *out_numMasks,
+                                      const char *placelist,
+                                      kmp_affin_mask_t *osId2Mask,
+                                      int maxOsId) {
+  int i, j, count, stride, sign;
+  const char *scan = placelist;
+  const char *next = placelist;
+
+  numNewMasks = 2; // initial capacity of newMasks; ADD_MASK grows it
+  KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
+  nextNewMask = 0;
+
+  // tempMask is modified based on the previous or initial
+  //   place to form the current place
+  // previousMask contains the previous place
+  kmp_affin_mask_t *tempMask;
+  kmp_affin_mask_t *previousMask;
+  KMP_CPU_ALLOC(tempMask);
+  KMP_CPU_ZERO(tempMask);
+  KMP_CPU_ALLOC(previousMask);
+  KMP_CPU_ZERO(previousMask);
+  int setSize = 0; // number of procs accepted into tempMask
+
+  for (;;) {
+    __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
+
+    // valid follow sets are ',' ':' and EOL
+    SKIP_WS(scan);
+    if (*scan == '\0' || *scan == ',') {
+      if (setSize > 0) { // a place with no valid proc is dropped
+        ADD_MASK(tempMask);
+      }
+      KMP_CPU_ZERO(tempMask);
+      setSize = 0;
+      if (*scan == '\0') {
+        break;
+      }
+      scan++; // skip ','
+      continue;
+    }
+
+    KMP_ASSERT2(*scan == ':', "bad explicit places list");
+    scan++; // skip ':'
+
+    // Read count parameter
+    SKIP_WS(scan);
+    KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
+    next = scan;
+    SKIP_DIGITS(next);
+    count = __kmp_str_to_int(scan, *next);
+    KMP_ASSERT(count >= 0);
+    scan = next;
+
+    // valid follow sets are ',' ':' and EOL
+    SKIP_WS(scan);
+    if (*scan == '\0' || *scan == ',') {
+      stride = +1;
+    } else {
+      KMP_ASSERT2(*scan == ':', "bad explicit places list");
+      scan++; // skip ':'
+
+      // Read stride parameter
+      sign = +1;
+      for (;;) { // any run of '+' / '-' signs; each '-' flips the sign
+        SKIP_WS(scan);
+        if (*scan == '+') {
+          scan++; // skip '+'
+          continue;
+        }
+        if (*scan == '-') {
+          sign *= -1;
+          scan++; // skip '-'
+          continue;
+        }
+        break;
+      }
+      SKIP_WS(scan);
+      KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
+      next = scan;
+      SKIP_DIGITS(next);
+      stride = __kmp_str_to_int(scan, *next);
+      KMP_ASSERT(stride >= 0); // same always-on check as the other parsers
+      scan = next;
+      stride *= sign;
+    }
+
+    // Add places determined by initial_place : count : stride
+    for (i = 0; i < count; i++) {
+      if (setSize == 0) {
+        break;
+      }
+      // Add the current place, then build the next place (tempMask) from that
+      KMP_CPU_COPY(previousMask, tempMask);
+      ADD_MASK(previousMask);
+      KMP_CPU_ZERO(tempMask);
+      setSize = 0;
+      KMP_CPU_SET_ITERATE(j, previousMask) {
+        if (!KMP_CPU_ISSET(j, previousMask)) {
+          continue;
+        }
+        if ((j + stride > maxOsId) || (j + stride < 0) ||
+            (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
+            (!KMP_CPU_ISSET(j + stride,
+                            KMP_CPU_INDEX(osId2Mask, j + stride)))) {
+          if ((__kmp_affinity_verbose ||
+               (__kmp_affinity_warnings &&
+                (__kmp_affinity_type != affinity_none))) &&
+              i < count - 1) {
+            KMP_WARNING(AffIgnoreInvalidProcID, j + stride);
+          }
+          continue;
+        }
+        KMP_CPU_SET(j + stride, tempMask);
+        setSize++;
+      }
+    }
+    KMP_CPU_ZERO(tempMask);
+    setSize = 0;
+
+    // valid follow sets are ',' and EOL
+    SKIP_WS(scan);
+    if (*scan == '\0') {
+      break;
+    }
+    if (*scan == ',') {
+      scan++; // skip ','
+      continue;
+    }
+
+    KMP_ASSERT2(0, "bad explicit places list");
+  }
+
+  *out_numMasks = nextNewMask;
+  KMP_CPU_FREE(tempMask); // freed before the empty-list return to avoid a leak
+  KMP_CPU_FREE(previousMask);
+  if (nextNewMask == 0) {
+    *out_masks = NULL;
+    KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
+    return;
+  }
+  KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
+  for (i = 0; i < nextNewMask; i++) {
+    kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
+    kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
+    KMP_CPU_COPY(dest, src);
+  }
+  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
+}
+
+#undef ADD_MASK
+#undef ADD_MASK_OSID
+
+#if KMP_USE_HWLOC
+static int __kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) {
+  // Remove from __kmp_affin_fullMask every PU enumerated for object o.
+  hwloc_obj_t hT = NULL;
+  const int nPU = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT);
+  int numCleared = 0;
+  for (int k = 0; k < nPU;
+       ++k, hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT)) {
+    KMP_DEBUG_ASSERT(hT);
+    unsigned osIdx = hT->os_index;
+    if (!KMP_CPU_ISSET(osIdx, __kmp_affin_fullMask))
+      continue; // not in the mask, so there is nothing to clear
+    KMP_CPU_CLR(osIdx, __kmp_affin_fullMask);
+    KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", osIdx));
+    ++numCleared;
+  }
+  return numCleared; // number of PUs actually cleared from the mask
+}
+
+static int __kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o) {
+  // Does any PU enumerated for o still belong to __kmp_affin_fullMask?
+  hwloc_obj_t hT = NULL;
+  int left = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT);
+  for (; left > 0; --left) {
+    KMP_DEBUG_ASSERT(hT);
+    unsigned osIdx = hT->os_index;
+    if (KMP_CPU_ISSET(osIdx, __kmp_affin_fullMask))
+      return 1; // at least one PU is still in the mask
+    hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT);
+  }
+  return 0; // none of the PUs is in the mask
+}
+#endif // KMP_USE_HWLOC
+
+static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) {
+  AddrUnsPair *newAddr;
+  if (__kmp_hws_requested == 0)
+    goto _exit; // no topology limiting actions requested, exit
+#if KMP_USE_HWLOC
+  if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
+    // Number of subobjects calculated dynamically, this works fine for
+    // any non-uniform topology.
+    // L2 cache objects are determined by depth, other objects - by type.
+    hwloc_topology_t tp = __kmp_hwloc_topology;
+    int nS = 0, nN = 0, nL = 0, nC = 0,
+        nT = 0; // logical index including skipped
+    int nCr = 0, nTr = 0; // number of requested units
+    int nPkg = 0, nCo = 0, n_new = 0, n_old = 0, nCpP = 0, nTpC = 0; // counters
+    hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
+    int L2depth, idx;
+
+    // check support of extensions ----------------------------------
+    int numa_support = 0, tile_support = 0;
+    if (__kmp_pu_os_idx)
+      hT = hwloc_get_pu_obj_by_os_index(tp,
+                                        __kmp_pu_os_idx[__kmp_avail_proc - 1]);
+    else
+      hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, __kmp_avail_proc - 1);
+    if (hT == NULL) { // something's gone wrong
+      KMP_WARNING(AffHWSubsetUnsupported);
+      goto _exit;
+    }
+    // check NUMA node
+    hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
+    hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
+    if (hN != NULL && hN->depth > hS->depth) {
+      numa_support = 1; // 1 in case socket includes node(s)
+    } else if (__kmp_hws_node.num > 0) {
+      // don't support sockets inside NUMA node (no such HW found for testing)
+      KMP_WARNING(AffHWSubsetUnsupported);
+      goto _exit;
+    }
+    // check L2 cahce, get object by depth because of multiple caches
+    L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
+    hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT);
+    if (hL != NULL &&
+        __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1) {
+      tile_support = 1; // no sense to count L2 if it includes single core
+    } else if (__kmp_hws_tile.num > 0) {
+      if (__kmp_hws_core.num == 0) {
+        __kmp_hws_core = __kmp_hws_tile; // replace L2 with core
+        __kmp_hws_tile.num = 0;
+      } else {
+        // L2 and core are both requested, but represent same object
+        KMP_WARNING(AffHWSubsetInvalid);
+        goto _exit;
+      }
+    }
+    // end of check of extensions -----------------------------------
+
+    // fill in unset items, validate settings -----------------------
+    if (__kmp_hws_socket.num == 0)
+      __kmp_hws_socket.num = nPackages; // use all available sockets
+    if (__kmp_hws_socket.offset >= nPackages) {
+      KMP_WARNING(AffHWSubsetManySockets);
+      goto _exit;
+    }
+    if (numa_support) {
+      hN = NULL;
+      int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE,
+                                                  &hN); // num nodes in socket
+      if (__kmp_hws_node.num == 0)
+        __kmp_hws_node.num = NN; // use all available nodes
+      if (__kmp_hws_node.offset >= NN) {
+        KMP_WARNING(AffHWSubsetManyNodes);
+        goto _exit;
+      }
+      if (tile_support) {
+        // get num tiles in node
+        int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
+        if (__kmp_hws_tile.num == 0) {
+          __kmp_hws_tile.num = NL + 1;
+        } // use all available tiles, some node may have more tiles, thus +1
+        if (__kmp_hws_tile.offset >= NL) {
+          KMP_WARNING(AffHWSubsetManyTiles);
+          goto _exit;
+        }
+        int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE,
+                                                    &hC); // num cores in tile
+        if (__kmp_hws_core.num == 0)
+          __kmp_hws_core.num = NC; // use all available cores
+        if (__kmp_hws_core.offset >= NC) {
+          KMP_WARNING(AffHWSubsetManyCores);
+          goto _exit;
+        }
+      } else { // tile_support
+        int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE,
+                                                    &hC); // num cores in node
+        if (__kmp_hws_core.num == 0)
+          __kmp_hws_core.num = NC; // use all available cores
+        if (__kmp_hws_core.offset >= NC) {
+          KMP_WARNING(AffHWSubsetManyCores);
+          goto _exit;
+        }
+      } // tile_support
+    } else { // numa_support
+      if (tile_support) {
+        // get num tiles in socket
+        int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL);
+        if (__kmp_hws_tile.num == 0)
+          __kmp_hws_tile.num = NL; // use all available tiles
+        if (__kmp_hws_tile.offset >= NL) {
+          KMP_WARNING(AffHWSubsetManyTiles);
+          goto _exit;
+        }
+        int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE,
+                                                    &hC); // num cores in tile
+        if (__kmp_hws_core.num == 0)
+          __kmp_hws_core.num = NC; // use all available cores
+        if (__kmp_hws_core.offset >= NC) {
+          KMP_WARNING(AffHWSubsetManyCores);
+          goto _exit;
+        }
+      } else { // tile_support
+        int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE,
+                                                    &hC); // num cores in socket
+        if (__kmp_hws_core.num == 0)
+          __kmp_hws_core.num = NC; // use all available cores
+        if (__kmp_hws_core.offset >= NC) {
+          KMP_WARNING(AffHWSubsetManyCores);
+          goto _exit;
+        }
+      } // tile_support
+    }
+    if (__kmp_hws_proc.num == 0)
+      __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all available procs
+    if (__kmp_hws_proc.offset >= __kmp_nThreadsPerCore) {
+      KMP_WARNING(AffHWSubsetManyProcs);
+      goto _exit;
+    }
+    // end of validation --------------------------------------------
+
+    if (pAddr) // pAddr is NULL in case of affinity_none
+      newAddr = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) *
+                                              __kmp_avail_proc); // max size
+    // main loop to form HW subset ----------------------------------
+    hS = NULL;
+    int NP = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PACKAGE);
+    for (int s = 0; s < NP; ++s) {
+      // Check Socket -----------------------------------------------
+      hS = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hS);
+      if (!__kmp_hwloc_obj_has_PUs(tp, hS))
+        continue; // skip socket if all PUs are out of fullMask
+      ++nS; // only count objects those have PUs in affinity mask
+      if (nS <= __kmp_hws_socket.offset ||
+          nS > __kmp_hws_socket.num + __kmp_hws_socket.offset) {
+        n_old += __kmp_hwloc_skip_PUs_obj(tp, hS); // skip socket
+        continue; // move to next socket
+      }
+      nCr = 0; // count number of cores per socket
+      // socket requested, go down the topology tree
+      // check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile)
+      if (numa_support) {
+        nN = 0;
+        hN = NULL;
+        // num nodes in current socket
+        int NN =
+            __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, &hN);
+        for (int n = 0; n < NN; ++n) {
+          // Check NUMA Node ----------------------------------------
+          if (!__kmp_hwloc_obj_has_PUs(tp, hN)) {
+            hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
+            continue; // skip node if all PUs are out of fullMask
+          }
+          ++nN;
+          if (nN <= __kmp_hws_node.offset ||
+              nN > __kmp_hws_node.num + __kmp_hws_node.offset) {
+            // skip node as not requested
+            n_old += __kmp_hwloc_skip_PUs_obj(tp, hN); // skip node
+            hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
+            continue; // move to next node
+          }
+          // node requested, go down the topology tree
+          if (tile_support) {
+            nL = 0;
+            hL = NULL;
+            int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
+            for (int l = 0; l < NL; ++l) {
+              // Check L2 (tile) ------------------------------------
+              if (!__kmp_hwloc_obj_has_PUs(tp, hL)) {
+                hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
+                continue; // skip tile if all PUs are out of fullMask
+              }
+              ++nL;
+              if (nL <= __kmp_hws_tile.offset ||
+                  nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) {
+                // skip tile as not requested
+                n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile
+                hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
+                continue; // move to next tile
+              }
+              // tile requested, go down the topology tree
+              nC = 0;
+              hC = NULL;
+              // num cores in current tile
+              int NC = __kmp_hwloc_count_children_by_type(tp, hL,
+                                                          HWLOC_OBJ_CORE, &hC);
+              for (int c = 0; c < NC; ++c) {
+                // Check Core ---------------------------------------
+                if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
+                  hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+                  continue; // skip core if all PUs are out of fullMask
+                }
+                ++nC;
+                if (nC <= __kmp_hws_core.offset ||
+                    nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
+                  // skip node as not requested
+                  n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
+                  hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+                  continue; // move to next node
+                }
+                // core requested, go down to PUs
+                nT = 0;
+                nTr = 0;
+                hT = NULL;
+                // num procs in current core
+                int NT = __kmp_hwloc_count_children_by_type(tp, hC,
+                                                            HWLOC_OBJ_PU, &hT);
+                for (int t = 0; t < NT; ++t) {
+                  // Check PU ---------------------------------------
+                  idx = hT->os_index;
+                  if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
+                    hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+                    continue; // skip PU if not in fullMask
+                  }
+                  ++nT;
+                  if (nT <= __kmp_hws_proc.offset ||
+                      nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
+                    // skip PU
+                    KMP_CPU_CLR(idx, __kmp_affin_fullMask);
+                    ++n_old;
+                    KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
+                    hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+                    continue; // move to next node
+                  }
+                  ++nTr;
+                  if (pAddr) // collect requested thread's data
+                    newAddr[n_new] = (*pAddr)[n_old];
+                  ++n_new;
+                  ++n_old;
+                  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+                } // threads loop
+                if (nTr > 0) {
+                  ++nCr; // num cores per socket
+                  ++nCo; // total num cores
+                  if (nTr > nTpC)
+                    nTpC = nTr; // calc max threads per core
+                }
+                hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+              } // cores loop
+              hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
+            } // tiles loop
+          } else { // tile_support
+            // no tiles, check cores
+            nC = 0;
+            hC = NULL;
+            // num cores in current node
+            int NC =
+                __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, &hC);
+            for (int c = 0; c < NC; ++c) {
+              // Check Core ---------------------------------------
+              if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
+                hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+                continue; // skip core if all PUs are out of fullMask
+              }
+              ++nC;
+              if (nC <= __kmp_hws_core.offset ||
+                  nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
+                // skip node as not requested
+                n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
+                hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+                continue; // move to next node
+              }
+              // core requested, go down to PUs
+              nT = 0;
+              nTr = 0;
+              hT = NULL;
+              int NT =
+                  __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
+              for (int t = 0; t < NT; ++t) {
+                // Check PU ---------------------------------------
+                idx = hT->os_index;
+                if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
+                  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+                  continue; // skip PU if not in fullMask
+                }
+                ++nT;
+                if (nT <= __kmp_hws_proc.offset ||
+                    nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
+                  // skip PU
+                  KMP_CPU_CLR(idx, __kmp_affin_fullMask);
+                  ++n_old;
+                  KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
+                  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+                  continue; // move to next node
+                }
+                ++nTr;
+                if (pAddr) // collect requested thread's data
+                  newAddr[n_new] = (*pAddr)[n_old];
+                ++n_new;
+                ++n_old;
+                hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+              } // threads loop
+              if (nTr > 0) {
+                ++nCr; // num cores per socket
+                ++nCo; // total num cores
+                if (nTr > nTpC)
+                  nTpC = nTr; // calc max threads per core
+              }
+              hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+            } // cores loop
+          } // tiles support
+          hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
+        } // nodes loop
+      } else { // numa_support
+        // no NUMA support
+        if (tile_support) {
+          nL = 0;
+          hL = NULL;
+          // num tiles in current socket
+          int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL);
+          for (int l = 0; l < NL; ++l) {
+            // Check L2 (tile) ------------------------------------
+            if (!__kmp_hwloc_obj_has_PUs(tp, hL)) {
+              hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
+              continue; // skip tile if all PUs are out of fullMask
+            }
+            ++nL;
+            if (nL <= __kmp_hws_tile.offset ||
+                nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) {
+              // skip tile as not requested
+              n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile
+              hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
+              continue; // move to next tile
+            }
+            // tile requested, go down the topology tree
+            nC = 0;
+            hC = NULL;
+            // num cores per tile
+            int NC =
+                __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC);
+            for (int c = 0; c < NC; ++c) {
+              // Check Core ---------------------------------------
+              if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
+                hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+                continue; // skip core if all PUs are out of fullMask
+              }
+              ++nC;
+              if (nC <= __kmp_hws_core.offset ||
+                  nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
+                // skip core as not requested
+                n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
+                hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+                continue; // move to next node
+              }
+              // core requested, go down to PUs
+              nT = 0;
+              nTr = 0;
+              hT = NULL;
+              // num procs per core
+              int NT =
+                  __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
+              for (int t = 0; t < NT; ++t) {
+                // Check PU ---------------------------------------
+                idx = hT->os_index;
+                if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
+                  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+                  continue; // skip PU if not in fullMask
+                }
+                ++nT;
+                if (nT <= __kmp_hws_proc.offset ||
+                    nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
+                  // skip PU
+                  KMP_CPU_CLR(idx, __kmp_affin_fullMask);
+                  ++n_old;
+                  KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
+                  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+                  continue; // move to next node
+                }
+                ++nTr;
+                if (pAddr) // collect requested thread's data
+                  newAddr[n_new] = (*pAddr)[n_old];
+                ++n_new;
+                ++n_old;
+                hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+              } // threads loop
+              if (nTr > 0) {
+                ++nCr; // num cores per socket
+                ++nCo; // total num cores
+                if (nTr > nTpC)
+                  nTpC = nTr; // calc max threads per core
+              }
+              hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+            } // cores loop
+            hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
+          } // tiles loop
+        } else { // tile_support
+          // no tiles, check cores
+          nC = 0;
+          hC = NULL;
+          // num cores in socket
+          int NC =
+              __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, &hC);
+          for (int c = 0; c < NC; ++c) {
+            // Check Core -------------------------------------------
+            if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
+              hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+              continue; // skip core if all PUs are out of fullMask
+            }
+            ++nC;
+            if (nC <= __kmp_hws_core.offset ||
+                nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
+              // skip core as not requested
+              n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
+              hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+              continue; // move to next node
+            }
+            // core requested, go down to PUs
+            nT = 0;
+            nTr = 0;
+            hT = NULL;
+            // num procs per core
+            int NT =
+                __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
+            for (int t = 0; t < NT; ++t) {
+              // Check PU ---------------------------------------
+              idx = hT->os_index;
+              if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
+                hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+                continue; // skip PU if not in fullMask
+              }
+              ++nT;
+              if (nT <= __kmp_hws_proc.offset ||
+                  nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
+                // skip PU
+                KMP_CPU_CLR(idx, __kmp_affin_fullMask);
+                ++n_old;
+                KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
+                hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+                continue; // move to next node
+              }
+              ++nTr;
+              if (pAddr) // collect requested thread's data
+                newAddr[n_new] = (*pAddr)[n_old];
+              ++n_new;
+              ++n_old;
+              hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
+            } // threads loop
+            if (nTr > 0) {
+              ++nCr; // num cores per socket
+              ++nCo; // total num cores
+              if (nTr > nTpC)
+                nTpC = nTr; // calc max threads per core
+            }
+            hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
+          } // cores loop
+        } // tiles support
+      } // numa_support
+      if (nCr > 0) { // found cores?
+        ++nPkg; // num sockets
+        if (nCr > nCpP)
+          nCpP = nCr; // calc max cores per socket
+      }
+    } // sockets loop
+
+    // check the subset is valid
+    KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc);
+    KMP_DEBUG_ASSERT(nPkg > 0);
+    KMP_DEBUG_ASSERT(nCpP > 0);
+    KMP_DEBUG_ASSERT(nTpC > 0);
+    KMP_DEBUG_ASSERT(nCo > 0);
+    KMP_DEBUG_ASSERT(nPkg <= nPackages);
+    KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg);
+    KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore);
+    KMP_DEBUG_ASSERT(nCo <= __kmp_ncores);
+
+    nPackages = nPkg; // correct num sockets
+    nCoresPerPkg = nCpP; // correct num cores per socket
+    __kmp_nThreadsPerCore = nTpC; // correct num threads per core
+    __kmp_avail_proc = n_new; // correct num procs
+    __kmp_ncores = nCo; // correct num cores
+    // hwloc topology method end
+  } else
+#endif // KMP_USE_HWLOC
+  {
+    int n_old = 0, n_new = 0, proc_num = 0;
+    if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) {
+      KMP_WARNING(AffHWSubsetNoHWLOC);
+      goto _exit;
+    }
+    if (__kmp_hws_socket.num == 0)
+      __kmp_hws_socket.num = nPackages; // use all available sockets
+    if (__kmp_hws_core.num == 0)
+      __kmp_hws_core.num = nCoresPerPkg; // use all available cores
+    if (__kmp_hws_proc.num == 0 || __kmp_hws_proc.num > __kmp_nThreadsPerCore)
+      __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts
+    if (!__kmp_affinity_uniform_topology()) {
+      KMP_WARNING(AffHWSubsetNonUniform);
+      goto _exit; // don't support non-uniform topology
+    }
+    if (depth > 3) {
+      KMP_WARNING(AffHWSubsetNonThreeLevel);
+      goto _exit; // don't support not-3-level topology
+    }
+    if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) {
+      KMP_WARNING(AffHWSubsetManySockets);
+      goto _exit;
+    }
+    if (__kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg) {
+      KMP_WARNING(AffHWSubsetManyCores);
+      goto _exit;
+    }
+    // Form the requested subset
+    if (pAddr) // pAddr is NULL in case of affinity_none
+      newAddr = (AddrUnsPair *)__kmp_allocate(
+          sizeof(AddrUnsPair) * __kmp_hws_socket.num * __kmp_hws_core.num *
+          __kmp_hws_proc.num);
+    for (int i = 0; i < nPackages; ++i) {
+      if (i < __kmp_hws_socket.offset ||
+          i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) {
+        // skip not-requested socket
+        n_old += nCoresPerPkg * __kmp_nThreadsPerCore;
+        if (__kmp_pu_os_idx != NULL) {
+          // walk through skipped socket
+          for (int j = 0; j < nCoresPerPkg; ++j) {
+            for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
+              KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
+              ++proc_num;
+            }
+          }
+        }
+      } else {
+        // walk through requested socket
+        for (int j = 0; j < nCoresPerPkg; ++j) {
+          if (j < __kmp_hws_core.offset ||
+              j >= __kmp_hws_core.offset +
+                       __kmp_hws_core.num) { // skip not-requested core
+            n_old += __kmp_nThreadsPerCore;
+            if (__kmp_pu_os_idx != NULL) {
+              for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
+                KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
+                ++proc_num;
+              }
+            }
+          } else {
+            // walk through requested core
+            for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
+              if (k < __kmp_hws_proc.num) {
+                if (pAddr) // collect requested thread's data
+                  newAddr[n_new] = (*pAddr)[n_old];
+                n_new++;
+              } else {
+                if (__kmp_pu_os_idx != NULL)
+                  KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
+              }
+              n_old++;
+              ++proc_num;
+            }
+          }
+        }
+      }
+    }
+    KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore);
+    KMP_DEBUG_ASSERT(n_new ==
+                     __kmp_hws_socket.num * __kmp_hws_core.num *
+                         __kmp_hws_proc.num);
+    nPackages = __kmp_hws_socket.num; // correct nPackages
+    nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg
+    __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore
+    __kmp_avail_proc = n_new; // correct avail_proc
+    __kmp_ncores = nPackages * __kmp_hws_core.num; // correct ncores
+  } // non-hwloc topology method
+  if (pAddr) {
+    __kmp_free(*pAddr);
+    *pAddr = newAddr; // replace old topology with new one
+  }
+  if (__kmp_affinity_verbose) {
+    char m[KMP_AFFIN_MASK_PRINT_LEN];
+    __kmp_affinity_print_mask(m, KMP_AFFIN_MASK_PRINT_LEN,
+                              __kmp_affin_fullMask);
+    if (__kmp_affinity_respect_mask) {
+      KMP_INFORM(InitOSProcSetRespect, "KMP_HW_SUBSET", m);
+    } else {
+      KMP_INFORM(InitOSProcSetNotRespect, "KMP_HW_SUBSET", m);
+    }
+    KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc);
+    kmp_str_buf_t buf;
+    __kmp_str_buf_init(&buf);
+    __kmp_str_buf_print(&buf, "%d", nPackages);
+    KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg,
+               __kmp_nThreadsPerCore, __kmp_ncores);
+    __kmp_str_buf_free(&buf);
+  }
+_exit:
+  if (__kmp_pu_os_idx != NULL) {
+    __kmp_free(__kmp_pu_os_idx);
+    __kmp_pu_os_idx = NULL;
+  }
+}
+
+// Finds the deepest level at which at least one cluster/core has more than one
+// processing unit bound to it. Every entry of the table is inspected: for each
+// level j in [1, bottom_level] whose label is non-zero, j - 1 is a candidate,
+// and the largest candidate is returned. Returns 0 if no label is non-zero.
+static int __kmp_affinity_find_core_level(const AddrUnsPair *address2os,
+                                          int nprocs, int bottom_level) {
+  int deepest = 0;
+
+  for (int proc = 0; proc < nprocs; ++proc) {
+    for (int lvl = bottom_level; lvl >= 1; --lvl) {
+      const bool nonzero_label = address2os[proc].first.labels[lvl] > 0;
+      if (nonzero_label && deepest < lvl - 1) {
+        deepest = lvl - 1;
+      }
+    }
+  }
+  return deepest;
+}
+
+// Counts the clusters/cores at level core_level among the first nprocs entries
+// of the table. For each entry the levels are walked upwards from bottom_level;
+// the walk stops early if the *next* entry has a non-zero label at the current
+// level. An entry whose walk reaches core_level is counted as closing a core.
+static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os,
+                                         int nprocs, int bottom_level,
+                                         int core_level) {
+  int count = 0;
+  // Declared outside the loops: its final value is examined after the scan.
+  int level = bottom_level;
+
+  for (int proc = 0; proc < nprocs; ++proc) {
+    const bool has_next = (proc + 1) < nprocs;
+    level = bottom_level;
+    while (level > core_level) {
+      if (has_next && address2os[proc + 1].first.labels[level] > 0) {
+        break;
+      }
+      --level;
+    }
+    if (level == core_level) {
+      ++count;
+    }
+  }
+  if (level > core_level) {
+    // In case of ( nprocs < __kmp_avail_proc ) we may end too deep and miss one
+    // core. May occur when called from __kmp_affinity_find_core().
+    ++count;
+  }
+  return count;
+}
+
+// Returns the zero-based index of the cluster/core that processing unit `proc`
+// is bound to: the number of cores counted over table entries [0, proc],
+// minus one.
+static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc,
+                                    int bottom_level, int core_level) {
+  const int cores_seen = __kmp_affinity_compute_ncores(
+      address2os, proc + 1, bottom_level, core_level);
+  return cores_seen - 1;
+}
+
+// Returns the maximal number of processing units bound to a single
+// cluster/core at the given level. When core_level is not above bottom_level
+// the answer is 1; otherwise it is the largest (label + 1) found at level
+// core_level + 1 over the first nprocs entries of the table.
+static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os,
+                                            int nprocs, int bottom_level,
+                                            int core_level) {
+  if (core_level >= bottom_level) {
+    return 1;
+  }
+
+  int widest = 0;
+  const int child_level = core_level + 1;
+  for (int proc = 0; proc < nprocs; ++proc) {
+    // Labels are zero-based, so the count is the label plus one.
+    int units = address2os[proc].first.labels[child_level] + 1;
+    if (widest < units) {
+      widest = units;
+    }
+  }
+  return widest;
+}
+
+static AddrUnsPair *address2os = NULL; // filled by the topology map builders
+static int *procarr = NULL; // OS proc ids per core slot, set up for "balanced"
+static int __kmp_aff_depth = 0; // topology depth saved for "balanced"
+
+#if KMP_USE_HIER_SCHED
+#define KMP_EXIT_AFF_NONE                                                      \
+  KMP_ASSERT(__kmp_affinity_type == affinity_none);                            \
+  KMP_ASSERT(address2os == NULL);                                              \
+  __kmp_apply_thread_places(NULL, 0);                                          \
+  __kmp_create_affinity_none_places();                                         \
+  __kmp_dispatch_set_hierarchy_values();                                       \
+  return;
+#else
+#define KMP_EXIT_AFF_NONE                                                      \
+  KMP_ASSERT(__kmp_affinity_type == affinity_none);                            \
+  KMP_ASSERT(address2os == NULL);                                              \
+  __kmp_apply_thread_places(NULL, 0);                                          \
+  __kmp_create_affinity_none_places();                                         \
+  return;
+#endif
+
+// Builds a place list (mask array) holding exactly one element: a copy of
+// __kmp_affin_fullMask, i.e. the initial process's affinity mask. Requires the
+// full mask to exist and the affinity type to be affinity_none.
+static void __kmp_create_affinity_none_places() {
+  KMP_ASSERT(__kmp_affin_fullMask != NULL);
+  KMP_ASSERT(__kmp_affinity_type == affinity_none);
+
+  // One place only.
+  __kmp_affinity_num_masks = 1;
+  KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
+
+  // That place mirrors the full mask.
+  kmp_affin_mask_t *only_place = KMP_CPU_INDEX(__kmp_affinity_masks, 0);
+  KMP_CPU_COPY(only_place, __kmp_affin_fullMask);
+}
+
+// qsort() comparator for AddrUnsPair entries, ordering them by the childNums
+// vectors of their addresses. The first __kmp_affinity_compact comparisons
+// take the levels from the deepest one upwards; the remaining comparisons
+// take the leftover levels starting from index 0. Returns -1, 0 or 1.
+static int __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) {
+  const Address &lhs = ((const AddrUnsPair *)a)->first;
+  const Address &rhs = ((const AddrUnsPair *)b)->first;
+  const unsigned depth = lhs.depth;
+  KMP_DEBUG_ASSERT(depth == rhs.depth);
+  KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
+  KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
+
+  unsigned pos = 0;
+
+  // Phase 1: the innermost levels, deepest first.
+  while (pos < (unsigned)__kmp_affinity_compact) {
+    const int lvl = depth - pos - 1;
+    if (lhs.childNums[lvl] != rhs.childNums[lvl])
+      return (lhs.childNums[lvl] < rhs.childNums[lvl]) ? -1 : 1;
+    ++pos;
+  }
+
+  // Phase 2: the remaining levels, outermost first.
+  while (pos < depth) {
+    const int lvl = pos - __kmp_affinity_compact;
+    if (lhs.childNums[lvl] != rhs.childNums[lvl])
+      return (lhs.childNums[lvl] < rhs.childNums[lvl]) ? -1 : 1;
+    ++pos;
+  }
+  return 0;
+}
+
+static void __kmp_aux_affinity_initialize(void) {
+  if (__kmp_affinity_masks != NULL) {
+    KMP_ASSERT(__kmp_affin_fullMask != NULL);
+    return;
+  }
+
+  // Create the "full" mask - this defines all of the processors that we
+  // consider to be in the machine model. If respect is set, then it is the
+  // initialization thread's affinity mask. Otherwise, it is all processors that
+  // we know about on the machine.
+  if (__kmp_affin_fullMask == NULL) {
+    KMP_CPU_ALLOC(__kmp_affin_fullMask);
+  }
+  if (KMP_AFFINITY_CAPABLE()) {
+    if (__kmp_affinity_respect_mask) {
+      __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
+
+      // Count the number of available processors.
+      unsigned i;
+      __kmp_avail_proc = 0;
+      KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
+        if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
+          continue;
+        }
+        __kmp_avail_proc++;
+      }
+      if (__kmp_avail_proc > __kmp_xproc) {
+        if (__kmp_affinity_verbose ||
+            (__kmp_affinity_warnings &&
+             (__kmp_affinity_type != affinity_none))) {
+          KMP_WARNING(ErrorInitializeAffinity);
+        }
+        __kmp_affinity_type = affinity_none;
+        KMP_AFFINITY_DISABLE();
+        return;
+      }
+    } else {
+      __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
+      __kmp_avail_proc = __kmp_xproc;
+    }
+  }
+
+  if (__kmp_affinity_gran == affinity_gran_tile &&
+      // check if user's request is valid
+      __kmp_affinity_dispatch->get_api_type() == KMPAffinity::NATIVE_OS) {
+    KMP_WARNING(AffTilesNoHWLOC, "KMP_AFFINITY");
+    __kmp_affinity_gran = affinity_gran_package;
+  }
+
+  int depth = -1;
+  kmp_i18n_id_t msg_id = kmp_i18n_null;
+
+  // For backward compatibility, setting KMP_CPUINFO_FILE =>
+  // KMP_TOPOLOGY_METHOD=cpuinfo
+  if ((__kmp_cpuinfo_file != NULL) &&
+      (__kmp_affinity_top_method == affinity_top_method_all)) {
+    __kmp_affinity_top_method = affinity_top_method_cpuinfo;
+  }
+
+  if (__kmp_affinity_top_method == affinity_top_method_all) {
+    // In the default code path, errors are not fatal - we just try using
+    // another method. We only emit a warning message if affinity is on, or the
+    // verbose flag is set, and the nowarnings flag was not set.
+    const char *file_name = NULL;
+    int line = 0;
+#if KMP_USE_HWLOC
+    if (depth < 0 &&
+        __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
+      if (__kmp_affinity_verbose) {
+        KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
+      }
+      if (!__kmp_hwloc_error) {
+        depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
+        if (depth == 0) {
+          KMP_EXIT_AFF_NONE;
+        } else if (depth < 0 && __kmp_affinity_verbose) {
+          KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
+        }
+      } else if (__kmp_affinity_verbose) {
+        KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
+      }
+    }
+#endif
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+    if (depth < 0) {
+      if (__kmp_affinity_verbose) {
+        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
+      }
+
+      file_name = NULL;
+      depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
+      if (depth == 0) {
+        KMP_EXIT_AFF_NONE;
+      }
+
+      if (depth < 0) {
+        if (__kmp_affinity_verbose) {
+          if (msg_id != kmp_i18n_null) {
+            KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY",
+                       __kmp_i18n_catgets(msg_id),
+                       KMP_I18N_STR(DecodingLegacyAPIC));
+          } else {
+            KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
+                       KMP_I18N_STR(DecodingLegacyAPIC));
+          }
+        }
+
+        file_name = NULL;
+        depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
+        if (depth == 0) {
+          KMP_EXIT_AFF_NONE;
+        }
+      }
+    }
+
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+#if KMP_OS_LINUX
+
+    if (depth < 0) {
+      if (__kmp_affinity_verbose) {
+        if (msg_id != kmp_i18n_null) {
+          KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY",
+                     __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
+        } else {
+          KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
+        }
+      }
+
+      FILE *f = fopen("/proc/cpuinfo", "r");
+      if (f == NULL) {
+        msg_id = kmp_i18n_str_CantOpenCpuinfo;
+      } else {
+        file_name = "/proc/cpuinfo";
+        depth =
+            __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
+        fclose(f);
+        if (depth == 0) {
+          KMP_EXIT_AFF_NONE;
+        }
+      }
+    }
+
+#endif /* KMP_OS_LINUX */
+
+#if KMP_GROUP_AFFINITY
+
+    if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
+      if (__kmp_affinity_verbose) {
+        KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
+      }
+
+      depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
+      KMP_ASSERT(depth != 0);
+    }
+
+#endif /* KMP_GROUP_AFFINITY */
+
+    if (depth < 0) {
+      if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
+        if (file_name == NULL) {
+          KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
+        } else if (line == 0) {
+          KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
+        } else {
+          KMP_INFORM(UsingFlatOSFileLine, file_name, line,
+                     __kmp_i18n_catgets(msg_id));
+        }
+      }
+      // FIXME - print msg if msg_id = kmp_i18n_null ???
+
+      file_name = "";
+      depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
+      if (depth == 0) {
+        KMP_EXIT_AFF_NONE;
+      }
+      KMP_ASSERT(depth > 0);
+      KMP_ASSERT(address2os != NULL);
+    }
+  }
+
+#if KMP_USE_HWLOC
+  else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
+    KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
+    if (__kmp_affinity_verbose) {
+      KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
+    }
+    depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
+    if (depth == 0) {
+      KMP_EXIT_AFF_NONE;
+    }
+  }
+#endif // KMP_USE_HWLOC
+
+// If the user has specified that a particular topology discovery method is to
+// be used, then we abort if that method fails. The exception is group
+// affinity, which might have been implicitly set.
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
+    if (__kmp_affinity_verbose) {
+      KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
+    }
+
+    depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
+    if (depth == 0) {
+      KMP_EXIT_AFF_NONE;
+    }
+    if (depth < 0) {
+      KMP_ASSERT(msg_id != kmp_i18n_null);
+      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
+    }
+  } else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
+    if (__kmp_affinity_verbose) {
+      KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
+    }
+
+    depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
+    if (depth == 0) {
+      KMP_EXIT_AFF_NONE;
+    }
+    if (depth < 0) {
+      KMP_ASSERT(msg_id != kmp_i18n_null);
+      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
+    }
+  }
+
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+  else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
+    const char *filename;
+    if (__kmp_cpuinfo_file != NULL) {
+      filename = __kmp_cpuinfo_file;
+    } else {
+      filename = "/proc/cpuinfo";
+    }
+
+    if (__kmp_affinity_verbose) {
+      KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
+    }
+
+    FILE *f = fopen(filename, "r");
+    if (f == NULL) {
+      int code = errno;
+      if (__kmp_cpuinfo_file != NULL) {
+        __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code),
+                    KMP_HNT(NameComesFrom_CPUINFO_FILE), __kmp_msg_null);
+      } else {
+        __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code),
+                    __kmp_msg_null);
+      }
+    }
+    int line = 0;
+    depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
+    fclose(f);
+    if (depth < 0) {
+      KMP_ASSERT(msg_id != kmp_i18n_null);
+      if (line > 0) {
+        KMP_FATAL(FileLineMsgExiting, filename, line,
+                  __kmp_i18n_catgets(msg_id));
+      } else {
+        KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
+      }
+    }
+    if (__kmp_affinity_type == affinity_none) {
+      KMP_ASSERT(depth == 0);
+      KMP_EXIT_AFF_NONE;
+    }
+  }
+
+#if KMP_GROUP_AFFINITY
+
+  else if (__kmp_affinity_top_method == affinity_top_method_group) {
+    if (__kmp_affinity_verbose) {
+      KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
+    }
+
+    depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
+    KMP_ASSERT(depth != 0);
+    if (depth < 0) {
+      KMP_ASSERT(msg_id != kmp_i18n_null);
+      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
+    }
+  }
+
+#endif /* KMP_GROUP_AFFINITY */
+
+  else if (__kmp_affinity_top_method == affinity_top_method_flat) {
+    if (__kmp_affinity_verbose) {
+      KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
+    }
+
+    depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
+    if (depth == 0) {
+      KMP_EXIT_AFF_NONE;
+    }
+    // should not fail
+    KMP_ASSERT(depth > 0);
+    KMP_ASSERT(address2os != NULL);
+  }
+
+#if KMP_USE_HIER_SCHED
+  __kmp_dispatch_set_hierarchy_values();
+#endif
+
+  if (address2os == NULL) {
+    if (KMP_AFFINITY_CAPABLE() &&
+        (__kmp_affinity_verbose ||
+         (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) {
+      KMP_WARNING(ErrorInitializeAffinity);
+    }
+    __kmp_affinity_type = affinity_none;
+    __kmp_create_affinity_none_places();
+    KMP_AFFINITY_DISABLE();
+    return;
+  }
+
+  if (__kmp_affinity_gran == affinity_gran_tile
+#if KMP_USE_HWLOC
+      && __kmp_tile_depth == 0
+#endif
+      ) {
+    // tiles requested but not detected, warn user on this
+    KMP_WARNING(AffTilesNoTiles, "KMP_AFFINITY");
+  }
+
+  __kmp_apply_thread_places(&address2os, depth);
+
+  // Create the table of masks, indexed by thread Id.
+  unsigned maxIndex;
+  unsigned numUnique;
+  kmp_affin_mask_t *osId2Mask =
+      __kmp_create_masks(&maxIndex, &numUnique, address2os, __kmp_avail_proc);
+  if (__kmp_affinity_gran_levels == 0) {
+    KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
+  }
+
+  // Set the childNums vector in all Address objects. This must be done before
+  // we can sort using __kmp_affinity_cmp_Address_child_num(), which takes into
+  // account the setting of __kmp_affinity_compact.
+  __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
+
+  switch (__kmp_affinity_type) {
+
+  case affinity_explicit:
+    KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
+    if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) {
+      __kmp_affinity_process_proclist(
+          &__kmp_affinity_masks, &__kmp_affinity_num_masks,
+          __kmp_affinity_proclist, osId2Mask, maxIndex);
+    } else {
+      __kmp_affinity_process_placelist(
+          &__kmp_affinity_masks, &__kmp_affinity_num_masks,
+          __kmp_affinity_proclist, osId2Mask, maxIndex);
+    }
+    if (__kmp_affinity_num_masks == 0) {
+      if (__kmp_affinity_verbose ||
+          (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
+        KMP_WARNING(AffNoValidProcID);
+      }
+      __kmp_affinity_type = affinity_none;
+      __kmp_create_affinity_none_places();
+      return;
+    }
+    break;
+
+  // The other affinity types rely on sorting the Addresses according to some
+  // permutation of the machine topology tree. Set __kmp_affinity_compact and
+  // __kmp_affinity_offset appropriately, then jump to a common code fragment
+  // to do the sort and create the array of affinity masks.
+
+  case affinity_logical:
+    __kmp_affinity_compact = 0;
+    if (__kmp_affinity_offset) {
+      __kmp_affinity_offset =
+          __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
+    }
+    goto sortAddresses;
+
+  case affinity_physical:
+    if (__kmp_nThreadsPerCore > 1) {
+      __kmp_affinity_compact = 1;
+      if (__kmp_affinity_compact >= depth) {
+        __kmp_affinity_compact = 0;
+      }
+    } else {
+      __kmp_affinity_compact = 0;
+    }
+    if (__kmp_affinity_offset) {
+      __kmp_affinity_offset =
+          __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
+    }
+    goto sortAddresses;
+
+  case affinity_scatter:
+    if (__kmp_affinity_compact >= depth) {
+      __kmp_affinity_compact = 0;
+    } else {
+      __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
+    }
+    goto sortAddresses;
+
+  case affinity_compact:
+    if (__kmp_affinity_compact >= depth) {
+      __kmp_affinity_compact = depth - 1;
+    }
+    goto sortAddresses;
+
+  case affinity_balanced:
+    if (depth <= 1) {
+      if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
+        KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
+      }
+      __kmp_affinity_type = affinity_none;
+      __kmp_create_affinity_none_places();
+      return;
+    } else if (!__kmp_affinity_uniform_topology()) {
+      // Save the depth for further usage
+      __kmp_aff_depth = depth;
+
+      int core_level = __kmp_affinity_find_core_level(
+          address2os, __kmp_avail_proc, depth - 1);
+      int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc,
+                                                 depth - 1, core_level);
+      int maxprocpercore = __kmp_affinity_max_proc_per_core(
+          address2os, __kmp_avail_proc, depth - 1, core_level);
+
+      int nproc = ncores * maxprocpercore;
+      if ((nproc < 2) || (nproc < __kmp_avail_proc)) {
+        if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
+          KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
+        }
+        __kmp_affinity_type = affinity_none;
+        return;
+      }
+
+      procarr = (int *)__kmp_allocate(sizeof(int) * nproc);
+      for (int i = 0; i < nproc; i++) {
+        procarr[i] = -1;
+      }
+
+      int lastcore = -1;
+      int inlastcore = 0;
+      for (int i = 0; i < __kmp_avail_proc; i++) {
+        int proc = address2os[i].second;
+        int core =
+            __kmp_affinity_find_core(address2os, i, depth - 1, core_level);
+
+        if (core == lastcore) {
+          inlastcore++;
+        } else {
+          inlastcore = 0;
+        }
+        lastcore = core;
+
+        procarr[core * maxprocpercore + inlastcore] = proc;
+      }
+    }
+    if (__kmp_affinity_compact >= depth) {
+      __kmp_affinity_compact = depth - 1;
+    }
+
+  sortAddresses:
+    // Allocate the gtid->affinity mask table.
+    if (__kmp_affinity_dups) {
+      __kmp_affinity_num_masks = __kmp_avail_proc;
+    } else {
+      __kmp_affinity_num_masks = numUnique;
+    }
+
+    if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
+        (__kmp_affinity_num_places > 0) &&
+        ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) {
+      __kmp_affinity_num_masks = __kmp_affinity_num_places;
+    }
+
+    KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
+
+    // Sort the address2os table according to the current setting of
+    // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
+    qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
+          __kmp_affinity_cmp_Address_child_num);
+    {
+      int i;
+      unsigned j;
+      for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
+        if ((!__kmp_affinity_dups) && (!address2os[i].first.leader)) {
+          continue;
+        }
+        unsigned osId = address2os[i].second;
+        kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
+        kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j);
+        KMP_ASSERT(KMP_CPU_ISSET(osId, src));
+        KMP_CPU_COPY(dest, src);
+        if (++j >= __kmp_affinity_num_masks) {
+          break;
+        }
+      }
+      KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
+    }
+    break;
+
+  default:
+    KMP_ASSERT2(0, "Unexpected affinity setting");
+  }
+
+  KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1);
+  machine_hierarchy.init(address2os, __kmp_avail_proc);
+}
+#undef KMP_EXIT_AFF_NONE
+
+// Entry point for affinity initialization; wraps
+// __kmp_aux_affinity_initialize().
+void __kmp_affinity_initialize(void) {
+  // Historically "the machine is not affinity capable" was encoded as
+  // __kmp_affinity_type == affinity_none; it is now represented explicitly as
+  // affinity_disabled.  The code above still tests for affinity_none in many
+  // places, so instead of changing every check we present affinity_none to
+  // the real initialization routine and put affinity_disabled back afterwards.
+  const bool was_disabled = (__kmp_affinity_type == affinity_disabled);
+
+  // A machine that is not affinity capable must already be marked disabled.
+  if (!KMP_AFFINITY_CAPABLE()) {
+    KMP_ASSERT(was_disabled);
+  }
+
+  if (!was_disabled) {
+    __kmp_aux_affinity_initialize();
+    return;
+  }
+
+  __kmp_affinity_type = affinity_none;
+  __kmp_aux_affinity_initialize();
+  __kmp_affinity_type = affinity_disabled;
+}
+
+// Tear down the affinity state: release the place masks, the full mask, the
+// proc list and the topology tables, reset the related settings, and finally
+// destroy the affinity API object.
+void __kmp_affinity_uninitialize(void) {
+  // The mask array is freed first because KMP_CPU_FREE_ARRAY is passed the
+  // mask count, which is reset to 0 further down.
+  if (__kmp_affinity_masks) {
+    KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
+    __kmp_affinity_masks = NULL;
+  }
+  if (__kmp_affin_fullMask) {
+    KMP_CPU_FREE(__kmp_affin_fullMask);
+    __kmp_affin_fullMask = NULL;
+  }
+
+  // Put the scalar settings back to their initial values.
+  __kmp_affinity_num_masks = 0;
+  __kmp_affinity_type = affinity_default;
+  __kmp_affinity_num_places = 0;
+
+  // Heap tables obtained from the runtime allocator.
+  if (__kmp_affinity_proclist) {
+    __kmp_free(__kmp_affinity_proclist);
+    __kmp_affinity_proclist = NULL;
+  }
+  if (address2os) {
+    __kmp_free(address2os);
+    address2os = NULL;
+  }
+  if (procarr) {
+    __kmp_free(procarr);
+    procarr = NULL;
+  }
+
+#if KMP_USE_HWLOC
+  if (__kmp_hwloc_topology) {
+    hwloc_topology_destroy(__kmp_hwloc_topology);
+    __kmp_hwloc_topology = NULL;
+  }
+#endif
+
+  // Done last: the mask deallocations above go through the KMP_CPU_* macros.
+  KMPAffinity::destroy_api();
+}
+
+// Choose the initial affinity mask and place for thread `gtid`, record them
+// in its kmp_info_t, and bind the thread to that mask.  `isa_root` selects the
+// root-thread handling of the place fields.  Returns without doing anything
+// when the machine is not affinity capable.
+void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
+  if (!KMP_AFFINITY_CAPABLE()) {
+    return;
+  }
+
+  // Make sure the thread owns an empty mask to copy into.
+  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
+  if (th->th.th_affin_mask != NULL) {
+    KMP_CPU_ZERO(th->th.th_affin_mask);
+  } else {
+    KMP_CPU_ALLOC(th->th.th_affin_mask);
+  }
+
+  // Decide between the "full" mask (all OS proc ids, or the mask of the
+  // initialization thread when __kmp_affinity_respect_mask is set) and one of
+  // the per-place masks.  `place` starts as the value recorded when the full
+  // mask is chosen and is overwritten when a per-place mask is chosen.
+  int place;
+  bool use_full_mask;
+  if (KMP_AFFINITY_NON_PROC_BIND) {
+    use_full_mask = (__kmp_affinity_type == affinity_none) ||
+                    (__kmp_affinity_type == affinity_balanced);
+    place = 0;
+  } else {
+    use_full_mask = (!isa_root) ||
+                    (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false);
+    place = KMP_PLACE_ALL;
+  }
+
+  kmp_affin_mask_t *mask;
+  if (use_full_mask) {
+#if KMP_GROUP_AFFINITY
+    // With several processor groups nothing is bound here.
+    if (__kmp_num_proc_groups > 1) {
+      return;
+    }
+#endif
+    KMP_ASSERT(__kmp_affin_fullMask != NULL);
+    mask = __kmp_affin_fullMask;
+  } else {
+    // The place could come from a hash function or a counter that doesn't
+    // always start at 0.  Use gtid for now.
+    KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
+    place = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
+    mask = KMP_CPU_INDEX(__kmp_affinity_masks, place);
+  }
+
+  // Record the place.  Root threads also get th_new_place; root threads and
+  // every thread under a Non-OMP_PROC_BIND affinity method get the entire
+  // place list as their place partition.
+  th->th.th_current_place = place;
+  if (isa_root) {
+    th->th.th_new_place = place;
+  }
+  if (isa_root || KMP_AFFINITY_NON_PROC_BIND) {
+    th->th.th_first_place = 0;
+    th->th.th_last_place = __kmp_affinity_num_masks - 1;
+  }
+
+  if (place != KMP_PLACE_ALL) {
+    KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
+                   gtid, place));
+  } else {
+    KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
+                   gtid));
+  }
+
+  KMP_CPU_COPY(th->th.th_affin_mask, mask);
+
+  // Printing is limited to these cases to avoid duplicate output (the other
+  // cases will be correctly printed on barrier).
+  if (__kmp_affinity_verbose) {
+    const bool print_now =
+        (__kmp_affinity_type == affinity_none) ||
+        ((place != KMP_PLACE_ALL) && (__kmp_affinity_type != affinity_balanced));
+    if (print_now) {
+      char buf[KMP_AFFIN_MASK_PRINT_LEN];
+      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+                                th->th.th_affin_mask);
+      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
+                 __kmp_gettid(), gtid, buf);
+    }
+  }
+
+  bool abort_on_error = true;
+#if KMP_OS_WINDOWS
+  // On Windows* OS, the process affinity mask might have changed. If the user
+  // didn't request affinity and this call fails, just continue silently.
+  // See CQ171393.
+  if (__kmp_affinity_type == affinity_none) {
+    abort_on_error = false;
+  }
+#endif
+  __kmp_set_system_affinity(th->th.th_affin_mask, abort_on_error);
+}
+
+// Move thread `gtid` to the place stored in its th_new_place: copy that
+// place's mask into th_affin_mask, record it as th_current_place, and bind
+// the thread to it (aborting on a system error).  Returns without doing
+// anything when the machine is not affinity capable.
+void __kmp_affinity_set_place(int gtid) {
+  if (!KMP_AFFINITY_CAPABLE()) {
+    return;
+  }
+
+  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
+
+  KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current "
+                 "place = %d)\n",
+                 gtid, th->th.th_new_place, th->th.th_current_place));
+
+  // Check that the new place is within this thread's partition.
+  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
+  KMP_ASSERT(th->th.th_new_place >= 0);
+  // th_new_place is used below as an index into __kmp_affinity_masks, so it
+  // has to be strictly smaller than the number of masks.
+  KMP_ASSERT((unsigned)th->th.th_new_place < __kmp_affinity_num_masks);
+  if (th->th.th_first_place <= th->th.th_last_place) {
+    KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
+               (th->th.th_new_place <= th->th.th_last_place));
+  } else {
+    // NOTE(review): this is the first > last (wrapped) partition case.  As
+    // written the condition holds for every value of th_new_place; confirm
+    // whether ">= first || <= last" was intended before tightening it.
+    KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
+               (th->th.th_new_place >= th->th.th_last_place));
+  }
+
+  // Copy the thread mask to the kmp_info_t structure,
+  // and set this thread's affinity.
+  kmp_affin_mask_t *mask =
+      KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place);
+  KMP_CPU_COPY(th->th.th_affin_mask, mask);
+  th->th.th_current_place = th->th.th_new_place;
+
+  if (__kmp_affinity_verbose) {
+    char buf[KMP_AFFIN_MASK_PRINT_LEN];
+    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+                              th->th.th_affin_mask);
+    KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
+               __kmp_gettid(), gtid, buf);
+  }
+  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
+}
+
+// Bind the calling thread to the user-supplied mask `*mask`.
+// Returns -1 when the machine is not affinity capable; otherwise returns the
+// result of __kmp_set_system_affinity (0 means the thread's stored mask was
+// updated as well).  When __kmp_env_consistency_check is set, an invalid mask
+// is reported with KMP_FATAL.
+int __kmp_aux_set_affinity(void **mask) {
+  if (!KMP_AFFINITY_CAPABLE()) {
+    return -1;
+  }
+
+  const int gtid = __kmp_entry_gtid();
+  KA_TRACE(1000, (""); {
+    char buf[KMP_AFFIN_MASK_PRINT_LEN];
+    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+                              (kmp_affin_mask_t *)(*mask));
+    __kmp_debug_printf(
+        "kmp_set_affinity: setting affinity mask for thread %d = %s\n", gtid,
+        buf);
+  });
+
+  if (__kmp_env_consistency_check) {
+    if (!mask || !*mask) {
+      KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
+    } else {
+      kmp_affin_mask_t *requested = (kmp_affin_mask_t *)(*mask);
+      int procs_in_mask = 0;
+      unsigned os_proc;
+
+      // Every proc in the requested mask must also be in the full mask, and
+      // the requested mask must not be empty.
+      KMP_CPU_SET_ITERATE(os_proc, requested) {
+        if (!KMP_CPU_ISSET(os_proc, __kmp_affin_fullMask)) {
+          KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
+        }
+        if (KMP_CPU_ISSET(os_proc, requested)) {
+          procs_in_mask++;
+        }
+      }
+      if (procs_in_mask == 0) {
+        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
+      }
+
+#if KMP_GROUP_AFFINITY
+      if (__kmp_get_proc_group(requested) < 0) {
+        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
+      }
+#endif /* KMP_GROUP_AFFINITY */
+    }
+  }
+
+  kmp_info_t *th = __kmp_threads[gtid];
+  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
+
+  // Bind first; the thread's stored mask is refreshed only on success.
+  kmp_affin_mask_t *new_mask = (kmp_affin_mask_t *)(*mask);
+  const int retval = __kmp_set_system_affinity(new_mask, FALSE);
+  if (retval == 0) {
+    KMP_CPU_COPY(th->th.th_affin_mask, new_mask);
+  }
+
+  // The thread no longer sits on a known place; its partition becomes the
+  // entire place list.
+  th->th.th_first_place = 0;
+  th->th.th_last_place = __kmp_affinity_num_masks - 1;
+  th->th.th_new_place = KMP_PLACE_UNDEFINED;
+  th->th.th_current_place = KMP_PLACE_UNDEFINED;
+
+  // Turn off 4.0 affinity for the current thread at this parallel level.
+  th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
+
+  return retval;
+}
+
+// Store the calling thread's affinity mask into the user-supplied `*mask`.
+// Returns -1 when the machine is not affinity capable.  On Windows the mask
+// kept in the thread structure is copied and 0 is returned; elsewhere the
+// mask is read with __kmp_get_system_affinity and that call's result is
+// returned.
+int __kmp_aux_get_affinity(void **mask) {
+  if (!KMP_AFFINITY_CAPABLE()) {
+    return -1;
+  }
+
+  const int gtid = __kmp_entry_gtid();
+  kmp_info_t *th = __kmp_threads[gtid];
+  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
+
+  KA_TRACE(1000, (""); {
+    char buf[KMP_AFFIN_MASK_PRINT_LEN];
+    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+                              th->th.th_affin_mask);
+    __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n",
+                 gtid, buf);
+  });
+
+  if (__kmp_env_consistency_check && (!mask || !*mask)) {
+    KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
+  }
+
+#if KMP_OS_WINDOWS
+
+  KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
+  return 0;
+
+#else
+
+  kmp_affin_mask_t *out_mask = (kmp_affin_mask_t *)(*mask);
+  const int retval = __kmp_get_system_affinity(out_mask, FALSE);
+  KA_TRACE(1000, (""); {
+    char buf[KMP_AFFIN_MASK_PRINT_LEN];
+    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+                              (kmp_affin_mask_t *)(*mask));
+    __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n",
+                 gtid, buf);
+  });
+  return retval;
+
+#endif /* KMP_OS_WINDOWS */
+}
+
+// Upper bound (exclusive) on the proc ids accepted by the mask_proc routines.
+// Returns 0 when the machine is not affinity capable, __kmp_xproc normally,
+// and the number of bits across all processor groups when group affinity is
+// compiled in and more than one group exists.
+int __kmp_aux_get_affinity_max_proc() {
+  if (!KMP_AFFINITY_CAPABLE()) {
+    return 0;
+  }
+
+  int max_proc = __kmp_xproc;
+#if KMP_GROUP_AFFINITY
+  if (__kmp_num_proc_groups > 1) {
+    max_proc = (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT);
+  }
+#endif
+  return max_proc;
+}
+
+// Set bit `proc` in the user-supplied mask `*mask`.
+// Returns 0 on success, -1 when the machine is not affinity capable or `proc`
+// is outside [0, __kmp_aux_get_affinity_max_proc()), and -2 when `proc` is
+// not part of __kmp_affin_fullMask.
+int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {
+  if (!KMP_AFFINITY_CAPABLE()) {
+    return -1;
+  }
+
+  KA_TRACE(1000, (""); {
+    int gtid = __kmp_entry_gtid();
+    char buf[KMP_AFFIN_MASK_PRINT_LEN];
+    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+                              (kmp_affin_mask_t *)(*mask));
+    __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in "
+                       "affinity mask for thread %d = %s\n",
+                       proc, gtid, buf);
+  });
+
+  if (__kmp_env_consistency_check && (!mask || !*mask)) {
+    KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
+  }
+
+  const bool proc_in_range =
+      (proc >= 0) && (proc < __kmp_aux_get_affinity_max_proc());
+  if (!proc_in_range) {
+    return -1;
+  }
+  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
+    return -2;
+  }
+
+  kmp_affin_mask_t *user_mask = (kmp_affin_mask_t *)(*mask);
+  KMP_CPU_SET(proc, user_mask);
+  return 0;
+}
+
+// Clear bit `proc` in the user-supplied mask `*mask`.
+// Returns 0 on success, -1 when the machine is not affinity capable or `proc`
+// is outside [0, __kmp_aux_get_affinity_max_proc()), and -2 when `proc` is
+// not part of __kmp_affin_fullMask.
+int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {
+  if (!KMP_AFFINITY_CAPABLE()) {
+    return -1;
+  }
+
+  KA_TRACE(1000, (""); {
+    int gtid = __kmp_entry_gtid();
+    char buf[KMP_AFFIN_MASK_PRINT_LEN];
+    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+                              (kmp_affin_mask_t *)(*mask));
+    __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in "
+                       "affinity mask for thread %d = %s\n",
+                       proc, gtid, buf);
+  });
+
+  if (__kmp_env_consistency_check && (!mask || !*mask)) {
+    KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
+  }
+
+  const bool proc_in_range =
+      (proc >= 0) && (proc < __kmp_aux_get_affinity_max_proc());
+  if (!proc_in_range) {
+    return -1;
+  }
+  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
+    return -2;
+  }
+
+  kmp_affin_mask_t *user_mask = (kmp_affin_mask_t *)(*mask);
+  KMP_CPU_CLR(proc, user_mask);
+  return 0;
+}
+
+// Query bit `proc` of the user-supplied mask `*mask`.
+// Returns -1 when the machine is not affinity capable or `proc` is outside
+// [0, __kmp_aux_get_affinity_max_proc()), 0 when `proc` is not part of
+// __kmp_affin_fullMask, and otherwise the result of KMP_CPU_ISSET on the
+// user mask.
+int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
+  if (!KMP_AFFINITY_CAPABLE()) {
+    return -1;
+  }
+
+  KA_TRACE(1000, (""); {
+    int gtid = __kmp_entry_gtid();
+    char buf[KMP_AFFIN_MASK_PRINT_LEN];
+    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
+                              (kmp_affin_mask_t *)(*mask));
+    __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
+                       "affinity mask for thread %d = %s\n",
+                       proc, gtid, buf);
+  });
+
+  if (__kmp_env_consistency_check && (!mask || !*mask)) {
+    KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
+  }
+
+  const bool proc_in_range =
+      (proc >= 0) && (proc < __kmp_aux_get_affinity_max_proc());
+  if (!proc_in_range) {
+    return -1;
+  }
+  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
+    return 0;
+  }
+
+  kmp_affin_mask_t *user_mask = (kmp_affin_mask_t *)(*mask);
+  return KMP_CPU_ISSET(proc, user_mask);
+}
+
+// Dynamic affinity settings - Affinity balanced
+//
+// Computes the affinity mask for thread `th` when `nthreads` threads are being
+// distributed, writes it into th->th.th_affin_mask, and binds the thread to it
+// with __kmp_set_system_affinity (abort on error).  The thread's position is
+// taken from th->th.th_info.ds.ds_tid.  A uniform topology is handled
+// arithmetically from address2os; a non-uniform one uses the procarr table.
+void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
+  KMP_DEBUG_ASSERT(th);
+  // fine_gran == true: the mask gets a single OS proc.  It is turned off when
+  // the granularity setting covers several procs (see the switch below).
+  bool fine_gran = true;
+  int tid = th->th.th_info.ds.ds_tid;
+
+  switch (__kmp_affinity_gran) {
+  case affinity_gran_fine:
+  case affinity_gran_thread:
+    break;
+  case affinity_gran_core:
+    // Core granularity only differs from fine when a core has several threads.
+    if (__kmp_nThreadsPerCore > 1) {
+      fine_gran = false;
+    }
+    break;
+  case affinity_gran_package:
+    // Package granularity only differs from fine when a package has several
+    // cores.
+    if (nCoresPerPkg > 1) {
+      fine_gran = false;
+    }
+    break;
+  default:
+    fine_gran = false;
+  }
+
+  if (__kmp_affinity_uniform_topology()) {
+    int coreID;
+    int threadID;
+    // Number of hyper threads per core in HT machine
+    int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
+    // Number of cores
+    int ncores = __kmp_ncores;
+    // With one thread per core and several packages, use the package as the
+    // unit that threads are distributed over instead of the core.
+    if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) {
+      __kmp_nth_per_core = __kmp_avail_proc / nPackages;
+      ncores = nPackages;
+    }
+    // How many threads will be bound to each core
+    int chunk = nthreads / ncores;
+    // How many cores will have an additional thread bound to it - "big cores"
+    int big_cores = nthreads % ncores;
+    // Number of threads on the big cores
+    int big_nth = (chunk + 1) * big_cores;
+    // The first big_nth tids are laid out chunk + 1 per core; the remaining
+    // tids are laid out chunk per core.  threadID wraps around the number of
+    // procs per core.
+    if (tid < big_nth) {
+      coreID = tid / (chunk + 1);
+      threadID = (tid % (chunk + 1)) % __kmp_nth_per_core;
+    } else { // tid >= big_nth
+      coreID = (tid - big_cores) / chunk;
+      threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
+    }
+
+    KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
+                      "Illegal set affinity operation when not capable");
+
+    kmp_affin_mask_t *mask = th->th.th_affin_mask;
+    KMP_CPU_ZERO(mask);
+
+    if (fine_gran) {
+      // Single proc: the threadID-th entry of the chosen core in address2os.
+      int osID = address2os[coreID * __kmp_nth_per_core + threadID].second;
+      KMP_CPU_SET(osID, mask);
+    } else {
+      // Coarse granularity: every proc of the chosen core.
+      for (int i = 0; i < __kmp_nth_per_core; i++) {
+        int osID;
+        osID = address2os[coreID * __kmp_nth_per_core + i].second;
+        KMP_CPU_SET(osID, mask);
+      }
+    }
+    if (__kmp_affinity_verbose) {
+      char buf[KMP_AFFIN_MASK_PRINT_LEN];
+      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
+      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
+                 __kmp_gettid(), tid, buf);
+    }
+    __kmp_set_system_affinity(mask, TRUE);
+  } else { // Non-uniform topology
+
+    kmp_affin_mask_t *mask = th->th.th_affin_mask;
+    KMP_CPU_ZERO(mask);
+
+    // Recompute the core layout from address2os, using the depth stored in
+    // __kmp_aff_depth.
+    int core_level = __kmp_affinity_find_core_level(
+        address2os, __kmp_avail_proc, __kmp_aff_depth - 1);
+    int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc,
+                                               __kmp_aff_depth - 1, core_level);
+    int nth_per_core = __kmp_affinity_max_proc_per_core(
+        address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
+
+    // For performance gain consider the special case nthreads ==
+    // __kmp_avail_proc
+    if (nthreads == __kmp_avail_proc) {
+      if (fine_gran) {
+        // One thread per proc: thread tid takes the tid-th address2os entry.
+        int osID = address2os[tid].second;
+        KMP_CPU_SET(osID, mask);
+      } else {
+        // Coarse granularity: add every proc that maps to the same core as
+        // the tid-th address2os entry.
+        int core = __kmp_affinity_find_core(address2os, tid,
+                                            __kmp_aff_depth - 1, core_level);
+        for (int i = 0; i < __kmp_avail_proc; i++) {
+          int osID = address2os[i].second;
+          if (__kmp_affinity_find_core(address2os, i, __kmp_aff_depth - 1,
+                                       core_level) == core) {
+            KMP_CPU_SET(osID, mask);
+          }
+        }
+      }
+    } else if (nthreads <= ncores) {
+
+      // Thread tid takes the tid-th core that has at least one proc.
+      // procarr is read as ncores rows of nth_per_core slots; a slot holds an
+      // OS proc id or -1 when empty.  `core` counts the non-empty cores seen
+      // so far.
+      int core = 0;
+      for (int i = 0; i < ncores; i++) {
+        // Check if this core from procarr[] is in the mask
+        int in_mask = 0;
+        for (int j = 0; j < nth_per_core; j++) {
+          if (procarr[i * nth_per_core + j] != -1) {
+            in_mask = 1;
+            break;
+          }
+        }
+        if (in_mask) {
+          if (tid == core) {
+            for (int j = 0; j < nth_per_core; j++) {
+              int osID = procarr[i * nth_per_core + j];
+              if (osID != -1) {
+                KMP_CPU_SET(osID, mask);
+                // For fine granularity it is enough to set the first available
+                // osID for this core
+                if (fine_gran) {
+                  break;
+                }
+              }
+            }
+            break;
+          } else {
+            core++;
+          }
+        }
+      }
+    } else { // nthreads > ncores
+      // Array to save the number of processors at each core
+      int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
+      // Array to save the number of cores with "x" available processors;
+      int *ncores_with_x_procs =
+          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
+      // Array to save the number of cores with # procs from x to nth_per_core
+      int *ncores_with_x_to_max_procs =
+          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
+
+      for (int i = 0; i <= nth_per_core; i++) {
+        ncores_with_x_procs[i] = 0;
+        ncores_with_x_to_max_procs[i] = 0;
+      }
+
+      // Count the non-empty slots of every core and build the histogram.
+      for (int i = 0; i < ncores; i++) {
+        int cnt = 0;
+        for (int j = 0; j < nth_per_core; j++) {
+          if (procarr[i * nth_per_core + j] != -1) {
+            cnt++;
+          }
+        }
+        nproc_at_core[i] = cnt;
+        ncores_with_x_procs[cnt]++;
+      }
+
+      // Suffix sums of the histogram: entry i counts the cores that have at
+      // least i procs.
+      for (int i = 0; i <= nth_per_core; i++) {
+        for (int j = i; j <= nth_per_core; j++) {
+          ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
+        }
+      }
+
+      // Max number of processors
+      int nproc = nth_per_core * ncores;
+      // An array to keep number of threads per each context
+      int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
+      for (int i = 0; i < nproc; i++) {
+        newarr[i] = 0;
+      }
+
+      // Hand out the nthreads threads to contexts.  `nth` is the number of
+      // threads still to place.  While flag == 0 (first sweep of the while
+      // loop) a thread is only put on a context that has none yet; in later
+      // sweeps (flag != 0) the first valid context of a core gets one more.
+      int nth = nthreads;
+      int flag = 0;
+      while (nth > 0) {
+        for (int j = 1; j <= nth_per_core; j++) {
+          // cnt starts as the number of cores with at least j procs and is
+          // decremented per placement; the core scan stops when it hits 0.
+          int cnt = ncores_with_x_to_max_procs[j];
+          for (int i = 0; i < ncores; i++) {
+            // Skip the core with 0 processors
+            if (nproc_at_core[i] == 0) {
+              continue;
+            }
+            // Place at most one thread on core i in this step.
+            for (int k = 0; k < nth_per_core; k++) {
+              if (procarr[i * nth_per_core + k] != -1) {
+                if (newarr[i * nth_per_core + k] == 0) {
+                  newarr[i * nth_per_core + k] = 1;
+                  cnt--;
+                  nth--;
+                  break;
+                } else {
+                  if (flag != 0) {
+                    newarr[i * nth_per_core + k]++;
+                    cnt--;
+                    nth--;
+                    break;
+                  }
+                }
+              }
+            }
+            if (cnt == 0 || nth == 0) {
+              break;
+            }
+          }
+          if (nth == 0) {
+            break;
+          }
+        }
+        flag = 1;
+      }
+      // Find this thread's context: walk the contexts accumulating their
+      // thread counts and stop at the first one whose running total exceeds
+      // tid.
+      int sum = 0;
+      for (int i = 0; i < nproc; i++) {
+        sum += newarr[i];
+        if (sum > tid) {
+          if (fine_gran) {
+            int osID = procarr[i];
+            KMP_CPU_SET(osID, mask);
+          } else {
+            // Coarse granularity: every valid proc of the context's core.
+            int coreID = i / nth_per_core;
+            for (int ii = 0; ii < nth_per_core; ii++) {
+              int osID = procarr[coreID * nth_per_core + ii];
+              if (osID != -1) {
+                KMP_CPU_SET(osID, mask);
+              }
+            }
+          }
+          break;
+        }
+      }
+      __kmp_free(newarr);
+    }
+
+    if (__kmp_affinity_verbose) {
+      char buf[KMP_AFFIN_MASK_PRINT_LEN];
+      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
+      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
+                 __kmp_gettid(), tid, buf);
+    }
+    __kmp_set_system_affinity(mask, TRUE);
+  }
+}
+
+#if KMP_OS_LINUX
+// Bind the calling thread to __kmp_affin_fullMask.
+//
+// This entry is not needed on Windows because there is a
+// GetProcessAffinityMask() api.
+//
+// Intended usage:
+// 1) The user gets the current affinity mask
+// 2) Then sets the affinity by calling this function
+// 3) Error check the return value
+// 4) Use non-OpenMP parallelization
+// 5) Reset the affinity to what was stored in step 1)
+//
+// Returns 0 on success,
+//   -1 if we cannot bind thread
+//   >0 (errno) if an error happened during binding
+#ifdef __cplusplus
+extern "C"
+#endif
+int kmp_set_thread_affinity_mask_initial() {
+  const int gtid = __kmp_get_gtid();
+
+  // Do not touch non-omp threads
+  if (gtid < 0) {
+    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
+                  "non-omp thread, returning\n"));
+    return -1;
+  }
+
+  // Both affinity capability and middle initialization are required.
+  if (!(KMP_AFFINITY_CAPABLE() && __kmp_init_middle)) {
+    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
+                  "affinity not initialized, returning\n"));
+    return -1;
+  }
+
+  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
+                "set full mask for thread %d\n",
+                gtid));
+  KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
+
+  // FALSE: report a failure through the return value.
+  const int status = __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
+  return status;
+}
+#endif
+
+#endif // KMP_AFFINITY_SUPPORTED
diff --git a/final/runtime/src/kmp_affinity.h b/final/runtime/src/kmp_affinity.h
new file mode 100644
index 0000000..c00ad36
--- /dev/null
+++ b/final/runtime/src/kmp_affinity.h
@@ -0,0 +1,827 @@
+/*
+ * kmp_affinity.h -- header for affinity management
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_AFFINITY_H
+#define KMP_AFFINITY_H
+
+#include "kmp.h"
+#include "kmp_os.h"
+
+#if KMP_AFFINITY_SUPPORTED
+#if KMP_USE_HWLOC
+// Affinity back end built on the hwloc library.  Masks are hwloc bitmaps and
+// thread binding goes through hwloc_get_cpubind / hwloc_set_cpubind on the
+// global __kmp_hwloc_topology.
+class KMPHwlocAffinity : public KMPAffinity {
+public:
+  // CPU set stored in an hwloc bitmap that this object allocates and frees.
+  class Mask : public KMPAffinity::Mask {
+    hwloc_cpuset_t mask;
+
+  public:
+    // Allocate the bitmap and start with no bit set.
+    Mask() {
+      mask = hwloc_bitmap_alloc();
+      this->zero();
+    }
+    ~Mask() { hwloc_bitmap_free(mask); }
+    void set(int i) override { hwloc_bitmap_set(mask, i); }
+    bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
+    void clear(int i) override { hwloc_bitmap_clr(mask, i); }
+    void zero() override { hwloc_bitmap_zero(mask); }
+    // The copy/and/or operations cast their argument to this Mask type, so
+    // the argument must be a KMPHwlocAffinity::Mask.
+    void copy(const KMPAffinity::Mask *src) override {
+      const Mask *convert = static_cast<const Mask *>(src);
+      hwloc_bitmap_copy(mask, convert->mask);
+    }
+    void bitwise_and(const KMPAffinity::Mask *rhs) override {
+      const Mask *convert = static_cast<const Mask *>(rhs);
+      hwloc_bitmap_and(mask, mask, convert->mask);
+    }
+    void bitwise_or(const KMPAffinity::Mask *rhs) override {
+      const Mask *convert = static_cast<const Mask *>(rhs);
+      hwloc_bitmap_or(mask, mask, convert->mask);
+    }
+    void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
+    // Iteration over the set bits: begin() is the first set index, next()
+    // the set index following `previous`, and end() the sentinel -1.
+    int begin() const override { return hwloc_bitmap_first(mask); }
+    int end() const override { return -1; }
+    int next(int previous) const override {
+      return hwloc_bitmap_next(mask, previous);
+    }
+    // Read the calling thread's binding into this mask.  Returns 0 on
+    // success; on failure either aborts through __kmp_fatal (when
+    // abort_on_error is set) or returns errno.
+    int get_system_affinity(bool abort_on_error) override {
+      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
+                  "Illegal get affinity operation when not capable");
+      int retval =
+          hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
+      if (retval >= 0) {
+        return 0;
+      }
+      int error = errno;
+      if (abort_on_error) {
+        __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
+      }
+      return error;
+    }
+    // Bind the calling thread to this mask.  Returns 0 on success; on
+    // failure either aborts through __kmp_fatal (when abort_on_error is set)
+    // or returns errno.
+    int set_system_affinity(bool abort_on_error) const override {
+      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
+                  "Illegal set affinity operation when not capable");
+      int retval =
+          hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
+      if (retval >= 0) {
+        return 0;
+      }
+      int error = errno;
+      if (abort_on_error) {
+        __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
+      }
+      return error;
+    }
+    // Windows only: returns 1 when there is a single processor group;
+    // otherwise the index of the one group whose two 32-bit words contain set
+    // bits, or -1 when no group or more than one group has bits set.  On
+    // other systems this always returns -1.
+    int get_proc_group() const override {
+      int group = -1;
+#if KMP_OS_WINDOWS
+      if (__kmp_num_proc_groups == 1) {
+        return 1;
+      }
+      for (int i = 0; i < __kmp_num_proc_groups; i++) {
+        // On windows, the long type is always 32 bits
+        unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
+        unsigned long second_32_bits =
+            hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
+        if (first_32_bits == 0 && second_32_bits == 0) {
+          continue;
+        }
+        // A second non-empty group means the mask spans groups.
+        if (group >= 0) {
+          return -1;
+        }
+        group = i;
+      }
+#endif /* KMP_OS_WINDOWS */
+      return group;
+    }
+  };
+  // Create and load the hwloc topology if that has not happened yet, then
+  // enable affinity only when hwloc can get and set the calling thread's
+  // binding, can discover processing units, and no hwloc error was recorded.
+  // `var` is the name used in the warning messages.
+  void determine_capable(const char *var) override {
+    const hwloc_topology_support *topology_support = NULL;
+    if (__kmp_hwloc_topology == NULL) {
+      if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
+        __kmp_hwloc_error = TRUE;
+        if (__kmp_affinity_verbose)
+          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
+      } else if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
+        // Loading is only attempted on a topology that was initialized.
+        __kmp_hwloc_error = TRUE;
+        if (__kmp_affinity_verbose)
+          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
+      }
+    }
+    // After an hwloc error the topology may be unusable, and affinity gets
+    // disabled below in any case, so the support query is skipped.
+    if (!__kmp_hwloc_error) {
+      topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
+    }
+    // Is the system capable of setting/getting this thread's affinity?
+    // Also, is topology discovery possible? (pu indicates ability to discover
+    // processing units). And finally, were there no errors when calling any
+    // hwloc_* API functions?
+    if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
+        topology_support->cpubind->get_thisthread_cpubind &&
+        topology_support->discovery->pu && !__kmp_hwloc_error) {
+      // enables affinity according to KMP_AFFINITY_CAPABLE() macro
+      KMP_AFFINITY_ENABLE(TRUE);
+    } else {
+      // indicate that hwloc didn't work and disable affinity
+      __kmp_hwloc_error = TRUE;
+      KMP_AFFINITY_DISABLE();
+    }
+  }
+  // Bind the calling thread to the single proc `which` (abort on error).
+  void bind_thread(int which) override {
+    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
+                "Illegal set affinity operation when not capable");
+    KMPAffinity::Mask *mask;
+    KMP_CPU_ALLOC_ON_STACK(mask);
+    KMP_CPU_ZERO(mask);
+    KMP_CPU_SET(which, mask);
+    __kmp_set_system_affinity(mask, TRUE);
+    KMP_CPU_FREE_FROM_STACK(mask);
+  }
+  // Mask allocation helpers.  Arrays are allocated as Mask[] and must be
+  // indexed and freed through the matching *_mask_array functions, which cast
+  // back to Mask before doing pointer arithmetic or delete[].
+  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
+  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
+  KMPAffinity::Mask *allocate_mask_array(int num) override {
+    return new Mask[num];
+  }
+  void deallocate_mask_array(KMPAffinity::Mask *array) override {
+    Mask *hwloc_array = static_cast<Mask *>(array);
+    delete[] hwloc_array;
+  }
+  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
+                                      int index) override {
+    Mask *hwloc_array = static_cast<Mask *>(array);
+    return &(hwloc_array[index]);
+  }
+  api_type get_api_type() const override { return HWLOC; }
+};
+#endif /* KMP_USE_HWLOC */
+
+#if KMP_OS_LINUX
+/* On some of the older OS's that we build on, these constants aren't present
+   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
+   all systems of the same arch where they are defined, and they cannot
+   change: they are set in stone forever.
+
+   For every supported architecture: define the sched_setaffinity /
+   sched_getaffinity syscall numbers when the system headers do not, and fail
+   the build when the headers define a different number.  Any other
+   architecture is rejected by the final #else branch. */
+#include <sys/syscall.h>
+#if KMP_ARCH_X86 || KMP_ARCH_ARM
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity 241
+#elif __NR_sched_setaffinity != 241
+#error Wrong code for setaffinity system call.
+#endif /* __NR_sched_setaffinity */
+#ifndef __NR_sched_getaffinity
+#define __NR_sched_getaffinity 242
+#elif __NR_sched_getaffinity != 242
+#error Wrong code for getaffinity system call.
+#endif /* __NR_sched_getaffinity */
+#elif KMP_ARCH_AARCH64
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity 122
+#elif __NR_sched_setaffinity != 122
+#error Wrong code for setaffinity system call.
+#endif /* __NR_sched_setaffinity */
+#ifndef __NR_sched_getaffinity
+#define __NR_sched_getaffinity 123
+#elif __NR_sched_getaffinity != 123
+#error Wrong code for getaffinity system call.
+#endif /* __NR_sched_getaffinity */
+#elif KMP_ARCH_X86_64
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity 203
+#elif __NR_sched_setaffinity != 203
+#error Wrong code for setaffinity system call.
+#endif /* __NR_sched_setaffinity */
+#ifndef __NR_sched_getaffinity
+#define __NR_sched_getaffinity 204
+#elif __NR_sched_getaffinity != 204
+#error Wrong code for getaffinity system call.
+#endif /* __NR_sched_getaffinity */
+#elif KMP_ARCH_PPC64
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity 222
+#elif __NR_sched_setaffinity != 222
+#error Wrong code for setaffinity system call.
+#endif /* __NR_sched_setaffinity */
+#ifndef __NR_sched_getaffinity
+#define __NR_sched_getaffinity 223
+#elif __NR_sched_getaffinity != 223
+#error Wrong code for getaffinity system call.
+#endif /* __NR_sched_getaffinity */
+#elif KMP_ARCH_MIPS
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity 4239
+#elif __NR_sched_setaffinity != 4239
+#error Wrong code for setaffinity system call.
+#endif /* __NR_sched_setaffinity */
+#ifndef __NR_sched_getaffinity
+#define __NR_sched_getaffinity 4240
+#elif __NR_sched_getaffinity != 4240
+#error Wrong code for getaffinity system call.
+#endif /* __NR_sched_getaffinity */
+#elif KMP_ARCH_MIPS64
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity 5195
+#elif __NR_sched_setaffinity != 5195
+#error Wrong code for setaffinity system call.
+#endif /* __NR_sched_setaffinity */
+#ifndef __NR_sched_getaffinity
+#define __NR_sched_getaffinity 5196
+#elif __NR_sched_getaffinity != 5196
+#error Wrong code for getaffinity system call.
+#endif /* __NR_sched_getaffinity */
+#else
+#error Unknown or unsupported architecture
+#endif /* KMP_ARCH_* */
+// Affinity backend that talks to the Linux kernel directly through the raw
+// sched_getaffinity/sched_setaffinity system calls.
+class KMPNativeAffinity : public KMPAffinity {
+  // Bit vector stored in __kmp_affin_mask_size bytes.
+  class Mask : public KMPAffinity::Mask {
+    typedef unsigned char mask_t;
+    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
+
+  public:
+    mask_t *mask;
+    Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
+    ~Mask() {
+      if (mask)
+        __kmp_free(mask);
+    }
+    void set(int i) override {
+      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
+    }
+    bool is_set(int i) const override {
+      return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
+    }
+    void clear(int i) override {
+      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
+    }
+    void zero() override {
+      for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
+        mask[i] = 0;
+    }
+    void copy(const KMPAffinity::Mask *src) override {
+      const Mask *convert = static_cast<const Mask *>(src);
+      for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
+        mask[i] = convert->mask[i];
+    }
+    void bitwise_and(const KMPAffinity::Mask *rhs) override {
+      const Mask *convert = static_cast<const Mask *>(rhs);
+      for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
+        mask[i] &= convert->mask[i];
+    }
+    void bitwise_or(const KMPAffinity::Mask *rhs) override {
+      const Mask *convert = static_cast<const Mask *>(rhs);
+      for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
+        mask[i] |= convert->mask[i];
+    }
+    void bitwise_not() override {
+      for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
+        mask[i] = ~(mask[i]);
+    }
+    int begin() const override { // first set bit, or end() when none is set
+      int retval = 0;
+      while (retval < end() && !is_set(retval))
+        ++retval;
+      return retval;
+    }
+    int end() const override { return __kmp_affin_mask_size * BITS_PER_MASK_T; }
+    int next(int previous) const override { // next set bit after 'previous'
+      int retval = previous + 1;
+      while (retval < end() && !is_set(retval))
+        ++retval;
+      return retval;
+    }
+    // Fills this mask from the kernel. Returns 0 on success, else errno.
+    int get_system_affinity(bool abort_on_error) override {
+      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
+                  "Illegal get affinity operation when not capable");
+      int retval =
+          syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
+      if (retval >= 0)
+        return 0;
+      int error = errno;
+      if (abort_on_error) {
+        __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
+      }
+      return error;
+    }
+    // Hands this mask to the kernel. Returns 0 on success, else errno.
+    int set_system_affinity(bool abort_on_error) const override {
+      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
+                  "Illegal set affinity operation when not capable");
+      int retval =
+          syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
+      if (retval >= 0)
+        return 0;
+      int error = errno;
+      if (abort_on_error) {
+        __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
+      }
+      return error;
+    }
+  };
+  void determine_capable(const char *env_var) override {
+    __kmp_affinity_determine_capable(env_var);
+  }
+  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
+  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
+  void deallocate_mask(KMPAffinity::Mask *m) override {
+    KMPNativeAffinity::Mask *native_mask =
+        static_cast<KMPNativeAffinity::Mask *>(m);
+    delete native_mask;
+  }
+  KMPAffinity::Mask *allocate_mask_array(int num) override {
+    return new Mask[num];
+  }
+  void deallocate_mask_array(KMPAffinity::Mask *array) override {
+    Mask *linux_array = static_cast<Mask *>(array);
+    delete[] linux_array;
+  }
+  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
+                                      int index) override {
+    Mask *linux_array = static_cast<Mask *>(array);
+    return &(linux_array[index]);
+  }
+  api_type get_api_type() const override { return NATIVE_OS; }
+};
+#endif /* KMP_OS_LINUX */
+
+#if KMP_OS_WINDOWS
+// Affinity backend built on the Win32 thread-affinity/processor-group calls.
+class KMPNativeAffinity : public KMPAffinity {
+  class Mask : public KMPAffinity::Mask {
+    typedef ULONG_PTR mask_t;
+    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
+    mask_t *mask;
+
+  public:
+    Mask() {
+      mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
+    }
+    ~Mask() {
+      if (mask)
+        __kmp_free(mask);
+    }
+    void set(int i) override {
+      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
+    }
+    bool is_set(int i) const override {
+      return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
+    }
+    void clear(int i) override {
+      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
+    }
+    void zero() override {
+      for (int i = 0; i < __kmp_num_proc_groups; ++i)
+        mask[i] = 0;
+    }
+    void copy(const KMPAffinity::Mask *src) override {
+      const Mask *convert = static_cast<const Mask *>(src);
+      for (int i = 0; i < __kmp_num_proc_groups; ++i)
+        mask[i] = convert->mask[i];
+    }
+    void bitwise_and(const KMPAffinity::Mask *rhs) override {
+      const Mask *convert = static_cast<const Mask *>(rhs);
+      for (int i = 0; i < __kmp_num_proc_groups; ++i)
+        mask[i] &= convert->mask[i];
+    }
+    void bitwise_or(const KMPAffinity::Mask *rhs) override {
+      const Mask *convert = static_cast<const Mask *>(rhs);
+      for (int i = 0; i < __kmp_num_proc_groups; ++i)
+        mask[i] |= convert->mask[i];
+    }
+    void bitwise_not() override {
+      for (int i = 0; i < __kmp_num_proc_groups; ++i)
+        mask[i] = ~(mask[i]);
+    }
+    int begin() const override { // first set bit, or end() when none is set
+      int retval = 0;
+      while (retval < end() && !is_set(retval))
+        ++retval;
+      return retval;
+    }
+    int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
+    int next(int previous) const override { // next set bit after 'previous'
+      int retval = previous + 1;
+      while (retval < end() && !is_set(retval))
+        ++retval;
+      return retval;
+    }
+    int set_system_affinity(bool abort_on_error) const override {
+      if (__kmp_num_proc_groups > 1) {
+        // Check for a valid mask.
+        GROUP_AFFINITY ga;
+        int group = get_proc_group();
+        if (group < 0) {
+          if (abort_on_error) {
+            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
+          }
+          return -1;
+        }
+        // Transform the bit vector into a GROUP_AFFINITY struct
+        // and make the system call to set affinity.
+        ga.Group = group;
+        ga.Mask = mask[group];
+        ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
+        KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
+        if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
+          DWORD error = GetLastError();
+          if (abort_on_error) {
+            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
+                        __kmp_msg_null);
+          }
+          return error;
+        }
+      } else {
+        if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
+          DWORD error = GetLastError();
+          if (abort_on_error) {
+            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
+                        __kmp_msg_null);
+          }
+          return error;
+        }
+      }
+      return 0;
+    }
+    int get_system_affinity(bool abort_on_error) override {
+      if (__kmp_num_proc_groups > 1) {
+        this->zero();
+        GROUP_AFFINITY ga;
+        KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
+        if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
+          DWORD error = GetLastError();
+          if (abort_on_error) {
+            __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
+                        KMP_ERR(error), __kmp_msg_null);
+          }
+          return error;
+        }
+        // ga.Group indexes mask[], so it must stay below the group count.
+        if ((ga.Group >= __kmp_num_proc_groups) || (ga.Mask == 0)) {
+          return -1;
+        }
+        mask[ga.Group] = ga.Mask;
+      } else {
+        // Read the thread mask by swapping in the process mask and back again.
+        mask_t newMask, sysMask, retval;
+        if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
+          DWORD error = GetLastError();
+          if (abort_on_error) {
+            __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
+                        KMP_ERR(error), __kmp_msg_null);
+          }
+          return error;
+        }
+        retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
+        if (!retval) {
+          DWORD error = GetLastError();
+          if (abort_on_error) {
+            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
+                        KMP_ERR(error), __kmp_msg_null);
+          }
+          return error;
+        }
+        newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
+        if (!newMask) {
+          DWORD error = GetLastError();
+          if (abort_on_error) {
+            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
+                        KMP_ERR(error), __kmp_msg_null);
+          }
+        }
+        *mask = retval; // the mask the thread had before the first swap
+      }
+      return 0;
+    }
+    int get_proc_group() const override {
+      int group = -1;
+      if (__kmp_num_proc_groups == 1)
+        return 1; // NOTE(review): 1, not index 0; confirm callers only test < 0
+      for (int i = 0; i < __kmp_num_proc_groups; i++) {
+        if (mask[i] == 0)
+          continue;
+        if (group >= 0)
+          return -1;
+        group = i;
+      }
+      return group;
+    }
+  };
+  void determine_capable(const char *env_var) override {
+    __kmp_affinity_determine_capable(env_var);
+  }
+  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
+  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
+  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
+  KMPAffinity::Mask *allocate_mask_array(int num) override {
+    return new Mask[num];
+  }
+  void deallocate_mask_array(KMPAffinity::Mask *array) override {
+    Mask *windows_array = static_cast<Mask *>(array);
+    delete[] windows_array;
+  }
+  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
+                                      int index) override {
+    Mask *windows_array = static_cast<Mask *>(array);
+    return &(windows_array[index]);
+  }
+  api_type get_api_type() const override { return NATIVE_OS; }
+};
+#endif /* KMP_OS_WINDOWS */
+#endif /* KMP_AFFINITY_SUPPORTED */
+
+class Address {
+public:
+  static const unsigned maxDepth = 32;
+  unsigned labels[maxDepth];
+  unsigned childNums[maxDepth];
+  unsigned depth;
+  unsigned leader;
+  Address(unsigned _depth) : depth(_depth), leader(FALSE) {}
+  // Takes depth plus the first 'depth' labels/childNums; leader is reset.
+  Address &operator=(const Address &b) {
+    leader = FALSE;
+    depth = b.depth;
+    for (unsigned lvl = 0; lvl != depth; ++lvl) {
+      labels[lvl] = b.labels[lvl];
+      childNums[lvl] = b.childNums[lvl];
+    }
+    return *this;
+  }
+  bool operator==(const Address &b) const {
+    if (depth != b.depth)
+      return false;
+    unsigned lvl = 0;
+    while (lvl < depth && labels[lvl] == b.labels[lvl])
+      ++lvl;
+    return lvl == depth;
+  }
+  // Like operator==, but the last 'level' labels are left out of the compare.
+  bool isClose(const Address &b, int level) const {
+    if (depth != b.depth)
+      return false;
+    if ((unsigned)level >= depth)
+      return true;
+    for (unsigned lvl = 0, shared = depth - level; lvl < shared; ++lvl)
+      if (labels[lvl] != b.labels[lvl])
+        return false;
+    return true;
+  }
+  bool operator!=(const Address &b) const { return !operator==(b); }
+  void print() const {
+    printf("Depth: %u --- ", depth);
+    for (unsigned lvl = 0; lvl < depth; ++lvl)
+      printf("%u ", labels[lvl]);
+  }
+};
+
+// An Address bundled with one unsigned value; hierarchy_info consumes arrays
+// of these (its adr2os parameters).
+class AddrUnsPair {
+public:
+  Address first;
+  unsigned second;
+  AddrUnsPair(Address _first, unsigned _second)
+      : first(_first), second(_second) {}
+  AddrUnsPair &operator=(const AddrUnsPair &b) {
+    second = b.second;
+    first = b.first; // Address::operator=, so first.leader is reset
+    return *this;
+  }
+  void print() const {
+    printf("first = ");
+    first.print();
+    printf(" --- second = %u", second);
+  }
+  // Both halves must match. Address equality looks at depth and labels only,
+  // so first.leader and first.childNums take no part in the comparison.
+  bool operator==(const AddrUnsPair &b) const {
+    return first == b.first && second == b.second;
+  }
+  bool operator!=(const AddrUnsPair &b) const { return !operator==(b); }
+};
+
+// qsort() comparator over AddrUnsPair elements. Orders them by the labels of
+// their Address, comparing labels[0] first and moving on only on a tie.
+// Returns -1, 0 or 1. Both addresses are expected to share one depth
+// (KMP_DEBUG_ASSERT); the depth of 'a' bounds the comparison.
+static int __kmp_affinity_cmp_Address_labels(const void *a, const void *b) {
+  const Address &lhs = static_cast<const AddrUnsPair *>(a)->first;
+  const Address &rhs = static_cast<const AddrUnsPair *>(b)->first;
+  KMP_DEBUG_ASSERT(lhs.depth == rhs.depth);
+  for (unsigned lvl = 0; lvl < lhs.depth; ++lvl) {
+    if (lhs.labels[lvl] != rhs.labels[lvl])
+      return lhs.labels[lvl] < rhs.labels[lvl] ? -1 : 1;
+  }
+  return 0;
+}
+
+/* A structure for holding machine-specific hierarchy info to be computed once
+   at init. This structure represents a mapping of threads to the actual machine
+   hierarchy, or to our best guess at what the hierarchy might be, for the
+   purpose of performing an efficient barrier. In the worst case, when there is
+   no machine hierarchy information, it produces a tree suitable for a barrier,
+   similar to the tree used in the hyper barrier. */
+class hierarchy_info {
+public:
+  /* Good default values for number of leaves and branching factor, given no
+     affinity information. Behaves a bit like hyper barrier. */
+  static const kmp_uint32 maxLeaves = 4;
+  static const kmp_uint32 minBranch = 4;
+  /** Number of levels in the hierarchy. Typical levels are threads/core,
+      cores/package or socket, packages/node, nodes/machine, etc. We don't want
+      to get specific with nomenclature. When the machine is oversubscribed we
+      add levels to duplicate the hierarchy, doubling the thread capacity of the
+      hierarchy each time we add a level. */
+  kmp_uint32 maxLevels;
+
+  /** This is specifically the depth of the machine configuration hierarchy, in
+      terms of the number of levels along the longest path from root to any
+      leaf. It corresponds to the number of entries in numPerLevel if we exclude
+      all but one trailing 1. */
+  kmp_uint32 depth;
+  kmp_uint32 base_num_threads;
+  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
+  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
+  // 2=initialization in progress
+  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
+
+  /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
+      the parent of a node at level i has. For example, if we have a machine
+      with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
+      {2, 4, 4, 1, 1}. All empty levels are set to 1. */
+  kmp_uint32 *numPerLevel;
+  kmp_uint32 *skipPerLevel; // second half of the numPerLevel allocation
+
+  void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
+    int hier_depth = adr2os[0].first.depth;
+    int level = 0;
+    for (int i = hier_depth - 1; i >= 0; --i) { // deepest label is level 0
+      int max = -1;
+      for (int j = 0; j < num_addrs; ++j) {
+        int next = adr2os[j].first.childNums[i];
+        if (next > max)
+          max = next;
+      }
+      numPerLevel[level] = max + 1; // childNums are zero-based
+      ++level;
+    }
+  }
+
+  hierarchy_info()
+      : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
+
+  void fini() {
+    if (!uninitialized && numPerLevel) {
+      __kmp_free(numPerLevel); // skipPerLevel lives in the same allocation
+      numPerLevel = NULL;
+      uninitialized = not_initialized;
+    }
+  }
+
+  void init(AddrUnsPair *adr2os, int num_addrs) {
+    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
+        &uninitialized, not_initialized, initializing);
+    if (bool_result == 0) { // Wait for initialization
+      while (TCR_1(uninitialized) != initialized)
+        KMP_CPU_PAUSE();
+      return;
+    }
+    KMP_DEBUG_ASSERT(bool_result == 1);
+
+    /* Added explicit initialization of the data fields here to prevent usage of
+       dirty value observed when static library is re-initialized multiple times
+       (e.g. when non-OpenMP thread repeatedly launches/joins thread that uses
+       OpenMP). */
+    depth = 1;
+    resizing = 0;
+    maxLevels = 7;
+    numPerLevel =
+        (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
+    skipPerLevel = &(numPerLevel[maxLevels]);
+    for (kmp_uint32 i = 0; i < maxLevels;
+         ++i) { // init numPerLevel[*] to 1 item per level
+      numPerLevel[i] = 1;
+      skipPerLevel[i] = 1;
+    }
+
+    // Sort table by physical ID
+    if (adr2os) {
+      qsort(adr2os, num_addrs, sizeof(*adr2os),
+            __kmp_affinity_cmp_Address_labels);
+      deriveLevels(adr2os, num_addrs);
+    } else {
+      numPerLevel[0] = maxLeaves;
+      numPerLevel[1] = num_addrs / maxLeaves;
+      if (num_addrs % maxLeaves)
+        numPerLevel[1]++;
+    }
+
+    base_num_threads = num_addrs;
+    for (int i = maxLevels - 1; i >= 0;
+         --i) // count non-empty levels to get depth
+      if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
+        depth++;
+
+    kmp_uint32 branch = minBranch;
+    if (numPerLevel[0] == 1)
+      branch = num_addrs / maxLeaves;
+    if (branch < minBranch)
+      branch = minBranch;
+    for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
+      while (numPerLevel[d] > branch ||
+             (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
+        if (numPerLevel[d] & 1)
+          numPerLevel[d]++;
+        numPerLevel[d] = numPerLevel[d] >> 1;
+        if (numPerLevel[d + 1] == 1)
+          depth++;
+        numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
+      }
+      if (numPerLevel[0] == 1) {
+        branch = branch >> 1;
+        if (branch < minBranch)
+          branch = minBranch;
+      }
+    }
+
+    for (kmp_uint32 i = 1; i < depth; ++i)
+      skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
+    // Fill in hierarchy in the case of oversubscription
+    for (kmp_uint32 i = depth; i < maxLevels; ++i)
+      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
+
+    uninitialized = initialized; // One writer
+  }
+
+  // Resize the hierarchy if nproc changes to something larger than before
+  void resize(kmp_uint32 nproc) {
+    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
+    while (bool_result == 0) { // someone else is trying to resize
+      KMP_CPU_PAUSE();
+      if (nproc <= base_num_threads) // happy with other thread's resize
+        return;
+      else // try to resize
+        bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
+    }
+    KMP_DEBUG_ASSERT(bool_result != 0);
+    if (nproc <= base_num_threads) { // happy with other thread's resize
+      resizing = 0; // we hold the flag here; drop it or later resizes spin
+      return;
+    }
+
+    // Calculate new maxLevels
+    kmp_uint32 old_sz = skipPerLevel[depth - 1];
+    kmp_uint32 incs = 0, old_maxLevels = maxLevels;
+    // First see if old maxLevels is enough to contain new size
+    for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
+      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
+      numPerLevel[i - 1] *= 2;
+      old_sz *= 2;
+      depth++;
+    }
+    if (nproc > old_sz) { // Not enough space, need to expand hierarchy
+      while (nproc > old_sz) {
+        old_sz *= 2;
+        incs++;
+        depth++;
+      }
+      maxLevels += incs;
+
+      // Resize arrays
+      kmp_uint32 *old_numPerLevel = numPerLevel;
+      kmp_uint32 *old_skipPerLevel = skipPerLevel;
+      numPerLevel =
+          (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
+      skipPerLevel = &(numPerLevel[maxLevels]);
+
+      // Copy old elements from old arrays
+      for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
+        numPerLevel[i] = old_numPerLevel[i];
+        skipPerLevel[i] = old_skipPerLevel[i];
+      }
+
+      // Init new elements in arrays to 1
+      for (kmp_uint32 i = old_maxLevels; i < maxLevels;
+           ++i) { // init numPerLevel[*] to 1 item per level
+        numPerLevel[i] = 1;
+        skipPerLevel[i] = 1;
+      }
+
+      // Free old arrays
+      __kmp_free(old_numPerLevel);
+    }
+
+    // Fill in oversubscription levels of hierarchy
+    for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
+      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
+
+    base_num_threads = nproc;
+    resizing = 0; // One writer
+  }
+};
+#endif // KMP_AFFINITY_H
diff --git a/final/runtime/src/kmp_alloc.cpp b/final/runtime/src/kmp_alloc.cpp
new file mode 100644
index 0000000..8619401
--- /dev/null
+++ b/final/runtime/src/kmp_alloc.cpp
@@ -0,0 +1,2064 @@
+/*
+ * kmp_alloc.cpp -- private/shared dynamic memory allocation and management
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+#include "kmp_io.h"
+#include "kmp_wrapper_malloc.h"
+
+// Disable bget when it is not used
+#if KMP_USE_BGET
+
+/* Thread private buffer management code */
+
+typedef int (*bget_compact_t)(size_t, int);
+typedef void *(*bget_acquire_t)(size_t);
+typedef void (*bget_release_t)(void *);
+
+/* NOTE: bufsize must be a signed datatype */
+
+#if KMP_OS_WINDOWS
+#if KMP_ARCH_X86 || KMP_ARCH_ARM
+typedef kmp_int32 bufsize;
+#else
+typedef kmp_int64 bufsize;
+#endif
+#else
+typedef ssize_t bufsize;
+#endif // KMP_OS_WINDOWS
+
+/* The three modes of operation are, fifo search, lifo search, and best-fit */
+
+typedef enum bget_mode {
+  bget_mode_fifo = 0,
+  bget_mode_lifo = 1,
+  bget_mode_best = 2
+} bget_mode_t;
+
+static void bpool(kmp_info_t *th, void *buffer, bufsize len);
+static void *bget(kmp_info_t *th, bufsize size);
+static void *bgetz(kmp_info_t *th, bufsize size);
+static void *bgetr(kmp_info_t *th, void *buffer, bufsize newsize);
+static void brel(kmp_info_t *th, void *buf);
+static void bectl(kmp_info_t *th, bget_compact_t compact,
+                  bget_acquire_t acquire, bget_release_t release,
+                  bufsize pool_incr);
+
+/* BGET CONFIGURATION */
+/* Buffer allocation size quantum: all buffers allocated are a
+   multiple of this size.  This MUST be a power of two. */
+
+/* On IA-32 architecture with Linux* OS, malloc() does not
+   ensure 16 byte alignment */
+
+#if KMP_ARCH_X86 || !KMP_HAVE_QUAD
+
+#define SizeQuant 8
+#define AlignType double
+
+#else
+
+#define SizeQuant 16
+#define AlignType _Quad
+
+#endif
+
+// Define this symbol to enable the bstats() function which calculates the
+// total free space in the buffer pool, the largest available buffer, and the
+// total space currently allocated.
+#define BufStats 1
+
+#ifdef KMP_DEBUG
+
+// Define this symbol to enable the bpoold() function which dumps the buffers
+// in a buffer pool.
+#define BufDump 1
+
+// Define this symbol to enable the bpoolv() function for validating a buffer
+// pool.
+#define BufValid 1
+
+// Define this symbol to enable the bufdump() function which allows dumping the
+// contents of an allocated or free buffer.
+#define DumpData 1
+
+#ifdef NOT_USED_NOW
+
+// Wipe free buffers to a guaranteed pattern of garbage to trip up miscreants
+// who attempt to use pointers into released buffers.
+#define FreeWipe 1
+
+// Use a best fit algorithm when searching for space for an allocation request.
+// This uses memory more efficiently, but allocation will be much slower.
+#define BestFit 1
+
+#endif /* NOT_USED_NOW */
+#endif /* KMP_DEBUG */
+
+static bufsize bget_bin_size[] = {
+    0,
+    //    1 << 6,    /* .5 Cache line */
+    1 << 7, /* 1 Cache line, new */
+    1 << 8, /* 2 Cache lines */
+    1 << 9, /* 4 Cache lines, new */
+    1 << 10, /* 8 Cache lines */
+    1 << 11, /* 16 Cache lines, new */
+    1 << 12, 1 << 13, /* new */
+    1 << 14, 1 << 15, /* new */
+    1 << 16, 1 << 17, 1 << 18, 1 << 19, 1 << 20, /*  1MB */
+    1 << 21, /*  2MB */
+    1 << 22, /*  4MB */
+    1 << 23, /*  8MB */
+    1 << 24, /* 16MB */
+    1 << 25, /* 32MB */
+};
+
+#define MAX_BGET_BINS (int)(sizeof(bget_bin_size) / sizeof(bufsize))
+
+struct bfhead;
+
+//  Declare the interface, including the requested buffer size type, bufsize.
+
+/* Queue links */
+typedef struct qlinks {
+  struct bfhead *flink; /* Forward link */
+  struct bfhead *blink; /* Backward link */
+} qlinks_t;
+
+/* Header in allocated and free buffers */
+typedef struct bhead2 {
+  kmp_info_t *bthr; /* The thread which owns the buffer pool */
+  bufsize prevfree; /* Relative link back to previous free buffer in memory or
+                       0 if previous buffer is allocated.  */
+  bufsize bsize; /* Buffer size: positive if free, negative if allocated. */
+} bhead2_t;
+
+/* Make sure the bhead structure is a multiple of SizeQuant in size. */
+typedef union bhead {
+  KMP_ALIGN(SizeQuant)
+  AlignType b_align;
+  char b_pad[sizeof(bhead2_t) + (SizeQuant - (sizeof(bhead2_t) % SizeQuant))];
+  bhead2_t bb;
+} bhead_t;
+#define BH(p) ((bhead_t *)(p))
+
+/*  Header in directly allocated buffers (by acqfcn) */
+typedef struct bdhead {
+  bufsize tsize; /* Total size, including overhead */
+  bhead_t bh; /* Common header */
+} bdhead_t;
+#define BDH(p) ((bdhead_t *)(p))
+
+/* Header in free buffers */
+typedef struct bfhead {
+  bhead_t bh; /* Common allocated/free header */
+  qlinks_t ql; /* Links on free list */
+} bfhead_t;
+#define BFH(p) ((bfhead_t *)(p))
+
+typedef struct thr_data {
+  bfhead_t freelist[MAX_BGET_BINS];
+#if BufStats
+  size_t totalloc; /* Total space currently allocated */
+  long numget, numrel; /* Number of bget() and brel() calls */
+  long numpblk; /* Number of pool blocks */
+  long numpget, numprel; /* Number of block gets and rels */
+  long numdget, numdrel; /* Number of direct gets and rels */
+#endif /* BufStats */
+
+  /* Automatic expansion block management functions */
+  bget_compact_t compfcn;
+  bget_acquire_t acqfcn;
+  bget_release_t relfcn;
+
+  bget_mode_t mode; /* what allocation mode to use? */
+
+  bufsize exp_incr; /* Expansion block size */
+  bufsize pool_len; /* 0: no bpool calls have been made
+                       -1: not all pool blocks are the same size
+                       >0: (common) block size for all bpool calls made so far
+                    */
+  bfhead_t *last_pool; /* Last pool owned by this thread (delay deallocation) */
+} thr_data_t;
+
+/*  Minimum allocation quantum: */
+#define QLSize (sizeof(qlinks_t))
+#define SizeQ ((SizeQuant > QLSize) ? SizeQuant : QLSize)
+#define MaxSize                                                                \
+  (bufsize)(                                                                   \
+      ~(((bufsize)(1) << (sizeof(bufsize) * CHAR_BIT - 1)) | (SizeQuant - 1)))
+// Maximum for the requested size.
+
+/* End sentinel: value placed in bsize field of dummy block delimiting
+   end of pool block.  The most negative number which will  fit  in  a
+   bufsize, defined in a way that the compiler will accept. */
+
+#define ESent                                                                  \
+  ((bufsize)(-(((((bufsize)1) << ((int)sizeof(bufsize) * 8 - 2)) - 1) * 2) - 2))
+
+/* Thread Data management routines */
+static int bget_get_bin(bufsize size) {
+  // Bisect bget_bin_size[]; lo only ever lands on a bin whose size is <= size,
+  // and the search stops as soon as the [lo, hi] window is at most two wide.
+  int lo = 0, hi = MAX_BGET_BINS - 1;
+
+  KMP_DEBUG_ASSERT(size > 0);
+
+  while (hi - lo > 1) {
+    const int mid = (lo + hi) / 2;
+    if (bget_bin_size[mid] > size)
+      hi = mid - 1;
+    else
+      lo = mid;
+  }
+
+  KMP_DEBUG_ASSERT(lo >= 0 && lo < MAX_BGET_BINS);
+  return lo;
+}
+
+static void set_thr_data(kmp_info_t *th) {
+  // Reuse the thread's existing bget state when present, else allocate it.
+  thr_data_t *data = (thr_data_t *)th->th.th_local.bget_data;
+  if (!data)
+    data = (thr_data_t *)__kmp_allocate(sizeof(*data));
+  memset(data, '\0', sizeof(*data));
+
+  // An empty bin is a list head whose two links point back at itself.
+  for (int bin = 0; bin < MAX_BGET_BINS; ++bin) {
+    bfhead_t *head = &data->freelist[bin];
+    head->ql.flink = head;
+    head->ql.blink = head;
+  }
+
+  th->th.th_local.bget_data = data;
+  th->th.th_local.bget_list = 0;
+#if !USE_CMP_XCHG_FOR_BGET
+  // Builds without the compare-exchange path also need the per-thread lock.
+#ifdef USE_QUEUING_LOCK_FOR_BGET
+  __kmp_init_lock(&th->th.th_local.bget_lock);
+#else
+  __kmp_init_bootstrap_lock(&th->th.th_local.bget_lock);
+#endif /* USE_QUEUING_LOCK_FOR_BGET */
+#endif /* ! USE_CMP_XCHG_FOR_BGET */
+}
+
+static thr_data_t *get_thr_data(kmp_info_t *th) {
+  // Fetch the bget bookkeeping hung off this thread; KMP_DEBUG_ASSERT checks
+  // that it is non-null.
+  thr_data_t *const data = (thr_data_t *)th->th.th_local.bget_data;
+
+  KMP_DEBUG_ASSERT(data != 0);
+
+  return data;
+}
+
+/* Walk the free list and release the enqueued buffers */
+static void __kmp_bget_dequeue(kmp_info_t *th) {
+  void *p = TCR_SYNC_PTR(th->th.th_local.bget_list);
+
+  if (p != 0) { // something is queued: detach the whole chain in one step
+#if USE_CMP_XCHG_FOR_BGET
+    {
+      volatile void *old_value = TCR_SYNC_PTR(th->th.th_local.bget_list);
+      while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list,
+                                        CCAST(void *, old_value), nullptr)) {
+        KMP_CPU_PAUSE(); // head changed under us; re-read it and retry
+        old_value = TCR_SYNC_PTR(th->th.th_local.bget_list);
+      }
+      p = CCAST(void *, old_value); // the head that was swapped for nullptr
+    }
+#else /* ! USE_CMP_XCHG_FOR_BGET */
+#ifdef USE_QUEUING_LOCK_FOR_BGET
+    __kmp_acquire_lock(&th->th.th_local.bget_lock, __kmp_gtid_from_thread(th));
+#else
+    __kmp_acquire_bootstrap_lock(&th->th.th_local.bget_lock);
+#endif /* USE_QUEUING_LOCK_FOR_BGET */
+
+    p = (void *)th->th.th_local.bget_list; // re-read the head under the lock
+    th->th.th_local.bget_list = 0;
+
+#ifdef USE_QUEUING_LOCK_FOR_BGET
+    __kmp_release_lock(&th->th.th_local.bget_lock, __kmp_gtid_from_thread(th));
+#else
+    __kmp_release_bootstrap_lock(&th->th.th_local.bget_lock);
+#endif
+#endif /* USE_CMP_XCHG_FOR_BGET */
+
+    /* p now heads a detached chain linked through ql.flink; release each */
+    while (p != 0) {
+      void *buf = p;
+      bfhead_t *b = BFH(((char *)p) - sizeof(bhead_t)); // header precedes data
+
+      KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
+      KMP_DEBUG_ASSERT(((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1) ==
+                       (kmp_uintptr_t)th); // clear possible mark
+      KMP_DEBUG_ASSERT(b->ql.blink == 0);
+
+      p = (void *)b->ql.flink; // fetch the successor before brel() runs
+
+      brel(th, buf);
+    }
+  }
+}
+
+/* Chain together the free buffers by using the thread owner field.
+
+   Pushes `buf` onto the front of th->th.th_local.bget_list, the list that
+   __kmp_bget_dequeue(th) later drains. The buffer's flink field is used as
+   the "next" pointer and its blink field is cleared. When
+   USE_QUEUING_LOCK_FOR_BGET is defined, `rel_gtid` is the gtid passed to the
+   lock acquire/release calls. */
+static void __kmp_bget_enqueue(kmp_info_t *th, void *buf
+#ifdef USE_QUEUING_LOCK_FOR_BGET
+                               ,
+                               kmp_int32 rel_gtid
+#endif
+                               ) {
+  /* The buffer header sits sizeof(bhead_t) bytes before the user pointer. */
+  bfhead_t *b = BFH(((char *)buf) - sizeof(bhead_t));
+
+  KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
+  KMP_DEBUG_ASSERT(((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1) ==
+                   (kmp_uintptr_t)th); // clear possible mark
+
+  /* __kmp_bget_dequeue asserts that queued buffers have a zero blink. */
+  b->ql.blink = 0;
+
+  KC_TRACE(10, ("__kmp_bget_enqueue: moving buffer to T#%d list\n",
+                __kmp_gtid_from_thread(th)));
+
+#if USE_CMP_XCHG_FOR_BGET
+  {
+    volatile void *old_value = TCR_PTR(th->th.th_local.bget_list);
+    /* the next pointer must be set before setting bget_list to buf to avoid
+       exposing a broken list to other threads, even for an instant. */
+    b->ql.flink = BFH(CCAST(void *, old_value));
+
+    /* Retry with a fresh head snapshot until the compare-and-store succeeds. */
+    while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list,
+                                      CCAST(void *, old_value), buf)) {
+      KMP_CPU_PAUSE();
+      old_value = TCR_PTR(th->th.th_local.bget_list);
+      /* the next pointer must be set before setting bget_list to buf to avoid
+         exposing a broken list to other threads, even for an instant. */
+      b->ql.flink = BFH(CCAST(void *, old_value));
+    }
+  }
+#else /* ! USE_CMP_XCHG_FOR_BGET */
+#ifdef USE_QUEUING_LOCK_FOR_BGET
+  __kmp_acquire_lock(&th->th.th_local.bget_lock, rel_gtid);
+#else
+  __kmp_acquire_bootstrap_lock(&th->th.th_local.bget_lock);
+#endif
+
+  /* Link in front of the current head while holding the lock. */
+  b->ql.flink = BFH(th->th.th_local.bget_list);
+  th->th.th_local.bget_list = (void *)buf;
+
+#ifdef USE_QUEUING_LOCK_FOR_BGET
+  __kmp_release_lock(&th->th.th_local.bget_lock, rel_gtid);
+#else
+  __kmp_release_bootstrap_lock(&th->th.th_local.bget_lock);
+#endif
+#endif /* USE_CMP_XCHG_FOR_BGET */
+}
+
+/* Link free buffer `b` in front of the sentinel of the freelist bin selected
+   by bget_get_bin() for its size, i.e. at the blink end of that circular,
+   doubly linked list. */
+static void __kmp_bget_insert_into_freelist(thr_data_t *thr, bfhead_t *b) {
+  int bin;
+  bfhead_t *head;
+  bfhead_t *tail;
+
+  KMP_DEBUG_ASSERT(((size_t)b) % SizeQuant == 0);
+  KMP_DEBUG_ASSERT(b->bh.bb.bsize % SizeQuant == 0);
+
+  bin = bget_get_bin(b->bh.bb.bsize);
+
+  KMP_DEBUG_ASSERT(thr->freelist[bin].ql.blink->ql.flink ==
+                   &thr->freelist[bin]);
+  KMP_DEBUG_ASSERT(thr->freelist[bin].ql.flink->ql.blink ==
+                   &thr->freelist[bin]);
+
+  head = &thr->freelist[bin];
+  tail = head->ql.blink; /* current last element (the sentinel if empty) */
+
+  /* Splice b between the old tail and the sentinel. */
+  b->ql.flink = head;
+  b->ql.blink = tail;
+  head->ql.blink = b;
+  tail->ql.flink = b;
+}
+
+/* Detach `b` from the doubly linked freelist it is on by joining its two
+   neighbours; b's own link fields are left unchanged. */
+static void __kmp_bget_remove_from_freelist(bfhead_t *b) {
+  bfhead_t *before = b->ql.blink;
+  bfhead_t *after = b->ql.flink;
+
+  KMP_DEBUG_ASSERT(b->ql.blink->ql.flink == b);
+  KMP_DEBUG_ASSERT(b->ql.flink->ql.blink == b);
+
+  before->ql.flink = after;
+  after->ql.blink = before;
+}
+
+/*  GET STATS -- check info on free list
+
+    Scans every freelist bin of thread `th` and reports:
+      *total_free - sum over all free buffers of (bsize - sizeof(bhead_t))
+      *max_free   - size of the largest free buffer, with sizeof(bhead_t)
+                    subtracted when the buffer is larger than one header
+    Both outputs are 0 when there are no free buffers.
+
+    The previous implementation remembered the SMALLEST buffer of each bin
+    and took the maximum of those, so *max_free under-reported whenever the
+    highest populated bin held more than one buffer. */
+static void bcheck(kmp_info_t *th, bufsize *max_free, bufsize *total_free) {
+  thr_data_t *thr = get_thr_data(th);
+  int bin;
+
+  *total_free = *max_free = 0;
+
+  for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
+    bfhead_t *b;
+
+    /* Each bin is a circular list; stop when we are back at the sentinel. */
+    for (b = thr->freelist[bin].ql.flink; b != &thr->freelist[bin];
+         b = b->ql.flink) {
+      *total_free += (b->bh.bb.bsize - sizeof(bhead_t));
+
+      /* Track the largest raw buffer size seen so far. */
+      if (*max_free < b->bh.bb.bsize)
+        *max_free = b->bh.bb.bsize;
+    }
+  }
+
+  /* Report usable bytes rather than the raw size including the header. */
+  if (*max_free > (bufsize)sizeof(bhead_t))
+    *max_free -= sizeof(bhead_t);
+}
+
+/*  BGET  --  Allocate a buffer.
+
+    Returns a pointer to a buffer with room for at least `requested_size`
+    bytes taken from the free lists of thread `th`, or NULL when the request
+    is negative, exceeds MaxSize once the header is added, or cannot be
+    satisfied even after consulting the compact/acquire callbacks installed
+    by bectl(). Buffers queued for this thread by other threads are released
+    first via __kmp_bget_dequeue(). */
+static void *bget(kmp_info_t *th, bufsize requested_size) {
+  thr_data_t *thr = get_thr_data(th);
+  bufsize size = requested_size;
+  bfhead_t *b;
+  void *buf;
+  int compactseq = 0;
+  int use_blink = 0;
+  /* For BestFit */
+  bfhead_t *best;
+
+  if (size < 0 || size + sizeof(bhead_t) > MaxSize) {
+    return NULL;
+  }
+
+  __kmp_bget_dequeue(th); /* Release any queued buffers */
+
+  if (size < (bufsize)SizeQ) { // Need at least room for the queue links.
+    size = SizeQ;
+  }
+  /* Round the request up to a multiple of the allocation quantum. */
+#if defined(SizeQuant) && (SizeQuant > 1)
+  size = (size + (SizeQuant - 1)) & (~(SizeQuant - 1));
+#endif
+
+  size += sizeof(bhead_t); // Add overhead in allocated buffer to size required.
+  KMP_DEBUG_ASSERT(size >= 0);
+  KMP_DEBUG_ASSERT(size % SizeQuant == 0);
+
+  /* In LIFO mode the bins are walked through their blink pointers. */
+  use_blink = (thr->mode == bget_mode_lifo);
+
+  /* If a compact function was provided in the call to bectl(), wrap
+     a loop around the allocation process  to  allow  compaction  to
+     intervene in case we don't find a suitable buffer in the chain. */
+
+  for (;;) {
+    int bin;
+
+    /* Start with the bin matching the size and fall back to larger bins. */
+    for (bin = bget_get_bin(size); bin < MAX_BGET_BINS; ++bin) {
+      /* Link to next buffer */
+      b = (use_blink ? thr->freelist[bin].ql.blink
+                     : thr->freelist[bin].ql.flink);
+
+      if (thr->mode == bget_mode_best) {
+        best = &thr->freelist[bin];
+
+        /* Scan the free list searching for the smallest buffer that is big
+           enough to hold the requested size buffer. `best` stays equal to
+           the bin sentinel when nothing in this bin fits. */
+        while (b != &thr->freelist[bin]) {
+          if (b->bh.bb.bsize >= (bufsize)size) {
+            if ((best == &thr->freelist[bin]) ||
+                (b->bh.bb.bsize < best->bh.bb.bsize)) {
+              best = b;
+            }
+          }
+
+          /* Link to next buffer */
+          b = (use_blink ? b->ql.blink : b->ql.flink);
+        }
+        b = best;
+      }
+
+      while (b != &thr->freelist[bin]) {
+        if ((bufsize)b->bh.bb.bsize >= (bufsize)size) {
+
+          // Buffer is big enough to satisfy the request. Allocate it to the
+          // caller. We must decide whether the buffer is large enough to split
+          // into the part given to the caller and a free buffer that remains
+          // on the free list, or whether the entire buffer should be removed
+          // from the free list and given to the caller in its entirety. We
+          // only split the buffer if enough room remains for a header plus the
+          // minimum quantum of allocation.
+          if ((b->bh.bb.bsize - (bufsize)size) >
+              (bufsize)(SizeQ + (sizeof(bhead_t)))) {
+            bhead_t *ba, *bn;
+
+            /* ba: header of the part handed out, carved from the END of b.
+               bn: header of the buffer following b in memory. */
+            ba = BH(((char *)b) + (b->bh.bb.bsize - (bufsize)size));
+            bn = BH(((char *)ba) + size);
+
+            KMP_DEBUG_ASSERT(bn->bb.prevfree == b->bh.bb.bsize);
+
+            /* Subtract size from length of free block. */
+            b->bh.bb.bsize -= (bufsize)size;
+
+            /* Link allocated buffer to the previous free buffer. */
+            ba->bb.prevfree = b->bh.bb.bsize;
+
+            /* Plug negative size into user buffer. */
+            ba->bb.bsize = -size;
+
+            /* Mark this buffer as owned by this thread. */
+            TCW_PTR(ba->bb.bthr,
+                    th); // not an allocated address (do not mark it)
+            /* Mark buffer after this one not preceded by free block. */
+            bn->bb.prevfree = 0;
+
+            // unlink buffer from old freelist, and reinsert into new freelist
+            // (the remainder shrank, so it may now belong to a different bin)
+            __kmp_bget_remove_from_freelist(b);
+            __kmp_bget_insert_into_freelist(thr, b);
+#if BufStats
+            thr->totalloc += (size_t)size;
+            thr->numget++; /* Increment number of bget() calls */
+#endif
+            buf = (void *)((((char *)ba) + sizeof(bhead_t)));
+            KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
+            return buf;
+          } else {
+            bhead_t *ba;
+
+            /* ba: header of the buffer that follows b in memory. */
+            ba = BH(((char *)b) + b->bh.bb.bsize);
+
+            KMP_DEBUG_ASSERT(ba->bb.prevfree == b->bh.bb.bsize);
+
+            /* The buffer isn't big enough to split.  Give  the  whole
+               shebang to the caller and remove it from the free list. */
+
+            __kmp_bget_remove_from_freelist(b);
+#if BufStats
+            thr->totalloc += (size_t)b->bh.bb.bsize;
+            thr->numget++; /* Increment number of bget() calls */
+#endif
+            /* Negate size to mark buffer allocated. */
+            b->bh.bb.bsize = -(b->bh.bb.bsize);
+
+            /* Mark this buffer as owned by this thread. */
+            /* NOTE(review): `ba` is the header of the FOLLOWING buffer here,
+               so this store updates that header's owner field rather than
+               b's own -- confirm this is intended. */
+            TCW_PTR(ba->bb.bthr, th); // not an allocated address (do not mark)
+            /* Zero the back pointer in the next buffer in memory
+               to indicate that this buffer is allocated. */
+            ba->bb.prevfree = 0;
+
+            /* Give user buffer starting at queue links. */
+            buf = (void *)&(b->ql);
+            KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
+            return buf;
+          }
+        }
+
+        /* Link to next buffer */
+        b = (use_blink ? b->ql.blink : b->ql.flink);
+      }
+    }
+
+    /* We failed to find a buffer. If there's a compact function defined,
+       notify it of the size requested. If it returns TRUE, try the allocation
+       again. */
+
+    if ((thr->compfcn == 0) || (!(*thr->compfcn)(size, ++compactseq))) {
+      break;
+    }
+  }
+
+  /* No buffer available with requested size free. */
+
+  /* Don't give up yet -- look in the reserve supply. */
+  if (thr->acqfcn != 0) {
+    if (size > (bufsize)(thr->exp_incr - sizeof(bhead_t))) {
+      /* Request is too large to fit in a single expansion block.
+         Try to satisfy it by a direct buffer acquisition. */
+      bdhead_t *bdh;
+
+      /* `size` already includes one bhead_t; swap that for the larger
+         direct-buffer header. */
+      size += sizeof(bdhead_t) - sizeof(bhead_t);
+
+      KE_TRACE(10, ("%%%%%% MALLOC( %d )\n", (int)size));
+
+      /* richryan */
+      bdh = BDH((*thr->acqfcn)((bufsize)size));
+      if (bdh != NULL) {
+
+        // Mark the buffer special by setting size field of its header to zero.
+        bdh->bh.bb.bsize = 0;
+
+        /* Mark this buffer as owned by this thread. */
+        TCW_PTR(bdh->bh.bb.bthr, th); // don't mark buffer as allocated,
+        // because direct buffer never goes to free list
+        bdh->bh.bb.prevfree = 0;
+        bdh->tsize = size;
+#if BufStats
+        thr->totalloc += (size_t)size;
+        thr->numget++; /* Increment number of bget() calls */
+        thr->numdget++; /* Direct bget() call count */
+#endif
+        buf = (void *)(bdh + 1);
+        KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
+        return buf;
+      }
+
+    } else {
+
+      /*  Try to obtain a new expansion block */
+      void *newpool;
+
+      KE_TRACE(10, ("%%%%%% MALLOCB( %d )\n", (int)thr->exp_incr));
+
+      /* richryan */
+      newpool = (*thr->acqfcn)((bufsize)thr->exp_incr);
+      KMP_DEBUG_ASSERT(((size_t)newpool) % SizeQuant == 0);
+      if (newpool != NULL) {
+        /* Add the new block to the pool and retry with the caller's
+           original (unadjusted) size. */
+        bpool(th, newpool, thr->exp_incr);
+        buf = bget(
+            th, requested_size); /* This can't, I say, can't get into a loop. */
+        return buf;
+      }
+    }
+  }
+
+  /*  Still no buffer available */
+
+  return NULL;
+}
+
+/*  BGETZ  --  Allocate a buffer through bget() and zero-fill it.  The whole
+               buffer is cleared, as recorded in its header, not only the
+               `size` bytes the caller asked for.  Returns NULL when bget()
+               fails. */
+
+static void *bgetz(kmp_info_t *th, bufsize size) {
+  char *buf = (char *)bget(th, size);
+  bufsize rsize;
+
+  if (buf == NULL)
+    return ((void *)buf);
+
+  /* Allocated buffers carry a negated size in their header. */
+  rsize = -(BH(buf - sizeof(bhead_t))->bb.bsize);
+  if (rsize != 0) {
+    /* Pool buffer: strip the header from the recorded size. */
+    rsize -= sizeof(bhead_t);
+  } else {
+    /* A zero size marks a directly acquired buffer; use its tsize field. */
+    rsize = BDH(buf - sizeof(bdhead_t))->tsize - (bufsize)sizeof(bdhead_t);
+  }
+
+  KMP_DEBUG_ASSERT(rsize >= size);
+
+  (void)memset(buf, 0, (bufsize)rsize);
+  return ((void *)buf);
+}
+
+/*  BGETR  --  Reallocate a buffer.  Minimal implementation on top of bget()
+               and brel(): obtain a new buffer, copy min(size, old size)
+               bytes into it and release the old one.  Returns NULL, leaving
+               `buf` untouched, when the new buffer cannot be obtained; with
+               buf == NULL it simply returns the new buffer.  */
+
+static void *bgetr(kmp_info_t *th, void *buf, bufsize size) {
+  bufsize osize; /* Old size of buffer */
+  bufsize ncopy; /* Number of bytes carried over */
+  void *nbuf = bget(th, size); /* Acquire new buffer */
+
+  /* Either the allocation failed (nbuf is NULL) or there is nothing to
+     copy from; in both cases nbuf is the answer. */
+  if (nbuf == NULL || buf == NULL) {
+    return nbuf;
+  }
+
+  /* Allocated buffers carry a negated size in their header. */
+  osize = -(BH(((char *)buf) - sizeof(bhead_t))->bb.bsize);
+  if (osize != 0) {
+    osize -= sizeof(bhead_t);
+  } else {
+    /*  Buffer acquired directly through acqfcn. */
+    osize = BDH(((char *)buf) - sizeof(bdhead_t))->tsize -
+            (bufsize)sizeof(bdhead_t);
+  }
+
+  KMP_DEBUG_ASSERT(osize > 0);
+
+  ncopy = (size < osize) ? size : osize;
+  (void)KMP_MEMCPY((char *)nbuf, (char *)buf, /* Copy the data */
+                   (size_t)ncopy);
+  brel(th, buf);
+
+  return nbuf;
+}
+
+/*  BREL  --  Release a buffer.
+
+    Three cases are handled in order:
+      1. header size == 0: the buffer was acquired directly; it is handed to
+         the release callback (thr->relfcn).
+      2. the buffer's owner (bthr with the low mark bit cleared) is not `th`:
+         the buffer is queued on the owner's list via __kmp_bget_enqueue().
+      3. otherwise the buffer is returned to this thread's free lists and
+         merged with a free predecessor and/or successor in memory. If the
+         resulting free buffer spans a whole pool block, the block may be
+         handed back through thr->relfcn.  */
+static void brel(kmp_info_t *th, void *buf) {
+  thr_data_t *thr = get_thr_data(th);
+  bfhead_t *b, *bn;
+  kmp_info_t *bth;
+
+  KMP_DEBUG_ASSERT(buf != NULL);
+  KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
+
+  /* The buffer header sits sizeof(bhead_t) bytes before the user pointer. */
+  b = BFH(((char *)buf) - sizeof(bhead_t));
+
+  if (b->bh.bb.bsize == 0) { /* Directly-acquired buffer? */
+    bdhead_t *bdh;
+
+    bdh = BDH(((char *)buf) - sizeof(bdhead_t));
+    KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
+#if BufStats
+    thr->totalloc -= (size_t)bdh->tsize;
+    thr->numdrel++; /* Number of direct releases */
+    thr->numrel++; /* Increment number of brel() calls */
+#endif /* BufStats */
+#ifdef FreeWipe
+    (void)memset((char *)buf, 0x55, (size_t)(bdh->tsize - sizeof(bdhead_t)));
+#endif /* FreeWipe */
+
+    KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)bdh));
+
+    KMP_DEBUG_ASSERT(thr->relfcn != 0);
+    (*thr->relfcn)((void *)bdh); /* Release it directly. */
+    return;
+  }
+
+  bth = (kmp_info_t *)((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) &
+                       ~1); // clear possible mark before comparison
+  if (bth != th) {
+    /* Add this buffer to be released by the owning thread later */
+    __kmp_bget_enqueue(bth, buf
+#ifdef USE_QUEUING_LOCK_FOR_BGET
+                       ,
+                       __kmp_gtid_from_thread(th)
+#endif
+                           );
+    return;
+  }
+
+  /* Buffer size must be negative, indicating that the buffer is allocated. */
+  /* NOTE(review): the only effect of this branch is to assign bn, which is
+     reassigned unconditionally further down; the real check is the debug
+     assertion that follows. */
+  if (b->bh.bb.bsize >= 0) {
+    bn = NULL;
+  }
+  KMP_DEBUG_ASSERT(b->bh.bb.bsize < 0);
+
+  /*  Back pointer in next buffer must be zero, indicating the same thing: */
+
+  KMP_DEBUG_ASSERT(BH((char *)b - b->bh.bb.bsize)->bb.prevfree == 0);
+
+#if BufStats
+  thr->numrel++; /* Increment number of brel() calls */
+  thr->totalloc += (size_t)b->bh.bb.bsize; /* bsize is negative here */
+#endif
+
+  /* If the back link is nonzero, the previous buffer is free.  */
+
+  if (b->bh.bb.prevfree != 0) {
+    /* The previous buffer is free. Consolidate this buffer with it by adding
+       the length of this buffer to the previous free buffer. Note that we
+       subtract the size in the buffer being released, since it's negative to
+       indicate that the buffer is allocated. */
+    bufsize size = b->bh.bb.bsize;
+
+    /* Make the previous buffer the one we're working on. */
+    KMP_DEBUG_ASSERT(BH((char *)b - b->bh.bb.prevfree)->bb.bsize ==
+                     b->bh.bb.prevfree);
+    b = BFH(((char *)b) - b->bh.bb.prevfree);
+    b->bh.bb.bsize -= size;
+
+    /* unlink the buffer from the old freelist */
+    __kmp_bget_remove_from_freelist(b);
+  } else {
+    /* The previous buffer is allocated (prevfree == 0). Mark this buffer size
+       as positive (i.e. free) and fall through to place the buffer on the
+       free list as an isolated free block. */
+    b->bh.bb.bsize = -b->bh.bb.bsize;
+  }
+
+  /* insert buffer back onto a new freelist */
+  __kmp_bget_insert_into_freelist(thr, b);
+
+  /* Now we look at the next buffer in memory, located by advancing from
+     the  start  of  this  buffer  by its size, to see if that buffer is
+     free.  If it is, we combine  this  buffer  with  the  next  one  in
+     memory, dechaining the second buffer from the free list. */
+  bn = BFH(((char *)b) + b->bh.bb.bsize);
+  if (bn->bh.bb.bsize > 0) {
+
+    /* The buffer is free.  Remove it from the free list and add
+       its size to that of our buffer. */
+    KMP_DEBUG_ASSERT(BH((char *)bn + bn->bh.bb.bsize)->bb.prevfree ==
+                     bn->bh.bb.bsize);
+
+    __kmp_bget_remove_from_freelist(bn);
+
+    b->bh.bb.bsize += bn->bh.bb.bsize;
+
+    /* unlink the buffer from the old freelist, and reinsert it into the new
+     * freelist */
+    __kmp_bget_remove_from_freelist(b);
+    __kmp_bget_insert_into_freelist(thr, b);
+
+    /* Finally,  advance  to   the  buffer  that   follows  the  newly
+       consolidated free block.  We must set its  backpointer  to  the
+       head  of  the  consolidated free block.  We know the next block
+       must be an allocated block because the process of recombination
+       guarantees  that  two  free  blocks will never be contiguous in
+       memory.  */
+    bn = BFH(((char *)b) + b->bh.bb.bsize);
+  }
+#ifdef FreeWipe
+  (void)memset(((char *)b) + sizeof(bfhead_t), 0x55,
+               (size_t)(b->bh.bb.bsize - sizeof(bfhead_t)));
+#endif
+  KMP_DEBUG_ASSERT(bn->bh.bb.bsize < 0);
+
+  /* The next buffer is allocated.  Set the backpointer in it  to  point
+     to this buffer; the previous free buffer in memory. */
+
+  bn->bh.bb.prevfree = b->bh.bb.bsize;
+
+  /*  If  a  block-release function is defined, and this free buffer
+      constitutes the entire block, release it.  Note that  pool_len
+      is  defined  in  such a way that the test will fail unless all
+      pool blocks are the same size.  */
+  if (thr->relfcn != 0 &&
+      b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t))) {
+    /* With BufStats the release is wrapped in an if/else that keeps the last
+       remaining block; without BufStats the block is always released. */
+#if BufStats
+    if (thr->numpblk !=
+        1) { /* Do not release the last buffer until finalization time */
+#endif
+
+      KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
+      KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.bsize == ESent);
+      KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.prevfree ==
+                       b->bh.bb.bsize);
+
+      /*  Unlink the buffer from the free list  */
+      __kmp_bget_remove_from_freelist(b);
+
+      KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)b));
+
+      (*thr->relfcn)(b);
+#if BufStats
+      thr->numprel++; /* Nr of expansion block releases */
+      thr->numpblk--; /* Total number of blocks */
+      KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
+
+      // avoid leaving stale last_pool pointer around if it is being dealloced
+      if (thr->last_pool == b)
+        thr->last_pool = 0;
+    } else {
+      /* Remember the surviving block so that __kmp_finalize_bget() can
+         release it. */
+      thr->last_pool = b;
+    }
+#endif /* BufStats */
+  }
+}
+
+/*  BECTL  --  Establish automatic pool expansion control.
+    Records the compaction, acquisition and release callbacks together with
+    the pool expansion increment in the bget record of thread `th`.  */
+static void bectl(kmp_info_t *th, bget_compact_t compact,
+                  bget_acquire_t acquire, bget_release_t release,
+                  bufsize pool_incr) {
+  thr_data_t *ctl = get_thr_data(th);
+
+  ctl->exp_incr = pool_incr;
+  ctl->relfcn = release;
+  ctl->acqfcn = acquire;
+  ctl->compfcn = compact;
+}
+
+/*  BPOOL  --  Add a region of memory to the buffer pool.
+
+    Turns the `len` bytes at `buf` into one free buffer owned by thread `th`,
+    followed by a sentinel header (bsize == ESent) that occupies the last
+    sizeof(bhead_t) bytes. `len` is rounded down to a multiple of SizeQuant
+    first. Buffers queued for this thread are released before the pool is
+    added.  */
+static void bpool(kmp_info_t *th, void *buf, bufsize len) {
+  /*    int bin = 0; */
+  thr_data_t *thr = get_thr_data(th);
+  bfhead_t *b = BFH(buf);
+  bhead_t *bn;
+
+  __kmp_bget_dequeue(th); /* Release any queued buffers */
+
+#ifdef SizeQuant
+  len &= ~(SizeQuant - 1);
+#endif
+  /* pool_len remembers the common block length; it becomes -1 as soon as
+     blocks of different lengths have been added. */
+  if (thr->pool_len == 0) {
+    thr->pool_len = len;
+  } else if (len != thr->pool_len) {
+    thr->pool_len = -1;
+  }
+#if BufStats
+  thr->numpget++; /* Number of block acquisitions */
+  thr->numpblk++; /* Number of blocks total */
+  KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
+#endif /* BufStats */
+
+  /* Since the block is initially occupied by a single free  buffer,
+     it  had  better  not  be  (much) larger than the largest buffer
+     whose size we can store in bhead.bb.bsize. */
+  KMP_DEBUG_ASSERT(len - sizeof(bhead_t) <= -((bufsize)ESent + 1));
+
+  /* Clear  the  backpointer at  the start of the block to indicate that
+     there  is  no  free  block  prior  to  this   one.    That   blocks
+     recombination when the first block in memory is released. */
+  b->bh.bb.prevfree = 0;
+
+  /* Create a dummy allocated buffer at the end of the pool.  This dummy
+     buffer is seen when a buffer at the end of the pool is released and
+     blocks  recombination  of  the last buffer with the dummy buffer at
+     the end.  The length in the dummy buffer  is  set  to  the  largest
+     negative  number  to  denote  the  end  of  the pool for diagnostic
+     routines (this specific value is  not  counted  on  by  the  actual
+     allocation and release functions). */
+  /* Reserve room for the dummy header; from here on `len` is the size of
+     the single free buffer. */
+  len -= sizeof(bhead_t);
+  b->bh.bb.bsize = (bufsize)len;
+  /* Set the owner of this buffer */
+  TCW_PTR(b->bh.bb.bthr,
+          (kmp_info_t *)((kmp_uintptr_t)th |
+                         1)); // mark the buffer as allocated address
+
+  /* Chain the new block to the free list. */
+  __kmp_bget_insert_into_freelist(thr, b);
+
+#ifdef FreeWipe
+  (void)memset(((char *)b) + sizeof(bfhead_t), 0x55,
+               (size_t)(len - sizeof(bfhead_t)));
+#endif
+  /* bn is the dummy header right after the free buffer. */
+  bn = BH(((char *)b) + len);
+  bn->bb.prevfree = (bufsize)len;
+  /* Definition of ESent assumes two's complement! */
+  KMP_DEBUG_ASSERT((~0) == -1 && (bn != 0));
+
+  bn->bb.bsize = ESent;
+}
+
+/*  BFREED  --  Dump the free lists for this thread.
+
+    Prints (through __kmp_printf_no_lock) the allocation counters when
+    BufStats is enabled, then one line per free buffer in every bin. With
+    FreeWipe defined it also reports free buffers whose 0x55 fill pattern has
+    been overwritten. Prints a "No free blocks" line when all bins are
+    empty. */
+static void bfreed(kmp_info_t *th) {
+  int bin = 0, count = 0;
+  int gtid = __kmp_gtid_from_thread(th);
+  thr_data_t *thr = get_thr_data(th);
+
+#if BufStats
+  __kmp_printf_no_lock("__kmp_printpool: T#%d total=%" KMP_UINT64_SPEC
+                       " get=%" KMP_INT64_SPEC " rel=%" KMP_INT64_SPEC
+                       " pblk=%" KMP_INT64_SPEC " pget=%" KMP_INT64_SPEC
+                       " prel=%" KMP_INT64_SPEC " dget=%" KMP_INT64_SPEC
+                       " drel=%" KMP_INT64_SPEC "\n",
+                       gtid, (kmp_uint64)thr->totalloc, (kmp_int64)thr->numget,
+                       (kmp_int64)thr->numrel, (kmp_int64)thr->numpblk,
+                       (kmp_int64)thr->numpget, (kmp_int64)thr->numprel,
+                       (kmp_int64)thr->numdget, (kmp_int64)thr->numdrel);
+#endif
+
+  for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
+    bfhead_t *b;
+
+    /* Each bin is a circular list; stop when we are back at the sentinel. */
+    for (b = thr->freelist[bin].ql.flink; b != &thr->freelist[bin];
+         b = b->ql.flink) {
+      bufsize bs = b->bh.bb.bsize;
+
+      KMP_DEBUG_ASSERT(b->ql.blink->ql.flink == b);
+      KMP_DEBUG_ASSERT(b->ql.flink->ql.blink == b);
+      KMP_DEBUG_ASSERT(bs > 0);
+
+      count += 1;
+
+      __kmp_printf_no_lock(
+          "__kmp_printpool: T#%d Free block: 0x%p size %6ld bytes.\n", gtid, b,
+          (long)bs);
+#ifdef FreeWipe
+      {
+        /* The payload after the free-buffer header should still be all 0x55:
+           check the first byte, then compare the region against itself
+           shifted by one byte. */
+        char *lerr = ((char *)b) + sizeof(bfhead_t);
+        if ((bs > sizeof(bfhead_t)) &&
+            ((*lerr != 0x55) ||
+             (memcmp(lerr, lerr + 1, (size_t)(bs - (sizeof(bfhead_t) + 1))) !=
+              0))) {
+          __kmp_printf_no_lock("__kmp_printpool: T#%d     (Contents of above "
+                               "free block have been overstored.)\n",
+                               gtid);
+        }
+      }
+#endif
+    }
+  }
+
+  if (count == 0)
+    __kmp_printf_no_lock("__kmp_printpool: T#%d No free blocks\n", gtid);
+}
+
+/* Set up the bget state of thread `th`: create/reset its bookkeeping record
+   and install malloc/free as the pool acquire/release callbacks, with
+   __kmp_malloc_pool_incr as the expansion increment and no compaction
+   callback. */
+void __kmp_initialize_bget(kmp_info_t *th) {
+  bget_acquire_t acquire = (bget_acquire_t)malloc;
+  bget_release_t release = (bget_release_t)free;
+
+  KMP_DEBUG_ASSERT(SizeQuant >= sizeof(void *) && (th != 0));
+
+  set_thr_data(th);
+
+  bectl(th, (bget_compact_t)0, acquire, release,
+        (bufsize)__kmp_malloc_pool_incr);
+}
+
+/* Tear down the bget state of thread `th`. With BufStats enabled, the pool
+   block remembered in last_pool is handed to the release callback when it is
+   entirely free; afterwards the bookkeeping record itself is freed. */
+void __kmp_finalize_bget(kmp_info_t *th) {
+  KMP_DEBUG_ASSERT(th != 0);
+
+#if BufStats
+  {
+    thr_data_t *thr = (thr_data_t *)th->th.th_local.bget_data;
+    bfhead_t *b;
+
+    KMP_DEBUG_ASSERT(thr != NULL);
+    b = thr->last_pool;
+
+    /*  A block is released only if a release function is defined and the
+        free buffer spans the entire block. pool_len is maintained so that
+        this comparison fails unless all pool blocks have the same size.  */
+
+    // brel() keeps the last pool block alive, so it is deallocated here
+    if (thr->relfcn != 0 && b != 0 && thr->numpblk != 0 &&
+        b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t))) {
+      KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
+      KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.bsize == ESent);
+      KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.prevfree ==
+                       b->bh.bb.bsize);
+
+      /*  Take the buffer off its free list before giving it away  */
+      __kmp_bget_remove_from_freelist(b);
+
+      KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)b));
+
+      (*thr->relfcn)(b);
+      thr->numprel++; /* Nr of expansion block releases */
+      thr->numpblk--; /* Total number of blocks */
+      KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
+    }
+  }
+#endif /* BufStats */
+
+  /* Release the bookkeeping record and forget the pointer */
+  if (th->th.th_local.bget_data != NULL) {
+    __kmp_free(th->th.th_local.bget_data);
+    th->th.th_local.bget_data = NULL;
+  }
+}
+
+/* Set the pool expansion increment of the calling thread to `size`; the
+   acquire/release callbacks are (re)installed as malloc/free. */
+void kmpc_set_poolsize(size_t size) {
+  kmp_info_t *th = __kmp_get_thread();
+
+  bectl(th, (bget_compact_t)0, (bget_acquire_t)malloc, (bget_release_t)free,
+        (bufsize)size);
+}
+
+/* Return the pool expansion increment stored for the calling thread. */
+size_t kmpc_get_poolsize(void) {
+  return get_thr_data(__kmp_get_thread())->exp_incr;
+}
+
+/* Select the free-list search mode of the calling thread. Values other than
+   bget_mode_fifo, bget_mode_lifo and bget_mode_best are ignored. */
+void kmpc_set_poolmode(int mode) {
+  if (mode != bget_mode_fifo && mode != bget_mode_lifo &&
+      mode != bget_mode_best) {
+    return;
+  }
+  get_thr_data(__kmp_get_thread())->mode = (bget_mode_t)mode;
+}
+
+/* Return the free-list search mode stored for the calling thread. */
+int kmpc_get_poolmode(void) {
+  return get_thr_data(__kmp_get_thread())->mode;
+}
+
+/* Report free-list statistics of the calling thread as computed by bcheck():
+   *maxmem receives its max_free result and *allmem its total_free result.
+   Buffers queued for this thread are released first. */
+void kmpc_get_poolstat(size_t *maxmem, size_t *allmem) {
+  bufsize max_free, total_free;
+  kmp_info_t *th = __kmp_get_thread();
+
+  __kmp_bget_dequeue(th); /* Release any queued buffers */
+  bcheck(th, &max_free, &total_free);
+
+  *maxmem = max_free;
+  *allmem = total_free;
+}
+
+/* Print the free lists of the calling thread via bfreed(), after releasing
+   any buffers queued for it. */
+void kmpc_poolprint(void) {
+  kmp_info_t *self = __kmp_get_thread();
+
+  /* Drain the deferred-release queue so the dump reflects those buffers. */
+  __kmp_bget_dequeue(self);
+  bfreed(self);
+}
+
+#endif // #if KMP_USE_BGET
+
+/* Allocate `size` bytes from the calling thread's bget pool.
+
+   One extra pointer-sized slot is allocated in front of the user block and
+   filled with the address returned by bget(), so that kmpc_free() and
+   kmpc_realloc() can recover it. Returns NULL when the allocation fails or
+   when `size` is so large that adding the slot would wrap around. */
+void *kmpc_malloc(size_t size) {
+  void *ptr;
+
+  // size + sizeof(ptr) must not wrap: a wrapped sum would request a tiny
+  // buffer while the caller believes it owns `size` bytes.
+  if (size > ~(size_t)0 - sizeof(ptr)) {
+    return NULL;
+  }
+  ptr = bget(__kmp_entry_thread(), (bufsize)(size + sizeof(ptr)));
+  if (ptr != NULL) {
+    // save allocated pointer just before one returned to user
+    *(void **)ptr = ptr;
+    ptr = (void **)ptr + 1;
+  }
+  return ptr;
+}
+
+#define IS_POWER_OF_TWO(n) (((n) & ((n)-1)) == 0)
+
+/* Allocate `size` bytes aligned to `alignment` from the calling thread's
+   bget pool.
+
+   `alignment` must be a non-zero power of two; otherwise errno is set to
+   EINVAL and NULL is returned. The address returned by bget() is stored in
+   the pointer-sized slot immediately before the returned block so that
+   kmpc_free() can recover it. Returns NULL when the allocation fails or when
+   the padded size would wrap around. */
+void *kmpc_aligned_malloc(size_t size, size_t alignment) {
+  void *ptr;
+  void *ptr_allocated;
+  KMP_DEBUG_ASSERT(alignment < 32 * 1024); // Alignment should not be too big
+  // Zero satisfies IS_POWER_OF_TWO but would turn the mask below into 0,
+  // yielding a NULL result and a header write in front of address 0.
+  if (alignment == 0 || !IS_POWER_OF_TWO(alignment)) {
+    // AC: do we need to issue a warning here?
+    errno = EINVAL;
+    return NULL;
+  }
+  // Room for the header slot plus worst-case alignment padding must not
+  // wrap around.
+  if (size > ~(size_t)0 - sizeof(void *) - alignment) {
+    return NULL;
+  }
+  size = size + sizeof(void *) + alignment;
+  ptr_allocated = bget(__kmp_entry_thread(), (bufsize)size);
+  if (ptr_allocated != NULL) {
+    // Round up to the next aligned address that leaves at least one
+    // pointer-sized slot in front of it.
+    ptr = (void *)(((kmp_uintptr_t)ptr_allocated + sizeof(void *) + alignment) &
+                   ~(alignment - 1));
+    // save allocated pointer just before one returned to user
+    *((void **)ptr - 1) = ptr_allocated;
+  } else {
+    ptr = NULL;
+  }
+  return ptr;
+}
+
+/* Allocate a zero-filled array of `nelem` elements of `elsize` bytes each
+   from the calling thread's bget pool.
+
+   One extra pointer-sized slot is allocated in front of the user block and
+   filled with the address returned by bgetz(), so that kmpc_free() and
+   kmpc_realloc() can recover it. Returns NULL when the allocation fails or
+   when nelem * elsize plus the slot does not fit in size_t. */
+void *kmpc_calloc(size_t nelem, size_t elsize) {
+  void *ptr;
+
+  // Reject requests whose byte count (including the header slot) would
+  // overflow; the product would otherwise wrap to a small value.
+  if (elsize != 0 && nelem > (~(size_t)0 - sizeof(ptr)) / elsize) {
+    return NULL;
+  }
+  ptr = bgetz(__kmp_entry_thread(), (bufsize)(nelem * elsize + sizeof(ptr)));
+  if (ptr != NULL) {
+    // save allocated pointer just before one returned to user
+    *(void **)ptr = ptr;
+    ptr = (void **)ptr + 1;
+  }
+  return ptr;
+}
+
+/* Resize a block obtained from kmpc_malloc()/kmpc_calloc()/kmpc_realloc().
+
+   - ptr == NULL: behaves like kmpc_malloc(size).
+   - size == 0:   releases the block and returns NULL.
+   - otherwise:   allocates a new block through bgetr(), which copies the old
+                  contents and releases the old block.
+   Returns NULL on failure, including when `size` is so large that adding the
+   hidden pointer slot would wrap around; in that case the original block is
+   left untouched. */
+void *kmpc_realloc(void *ptr, size_t size) {
+  void *result = NULL;
+  // Largest user size for which size + sizeof(ptr) does not wrap.
+  const size_t max_size = ~(size_t)0 - sizeof(ptr);
+
+  if (ptr == NULL) {
+    // If pointer is NULL, realloc behaves like malloc.
+    if (size <= max_size) {
+      result = bget(__kmp_entry_thread(), (bufsize)(size + sizeof(ptr)));
+    }
+    // save allocated pointer just before one returned to user
+    if (result != NULL) {
+      *(void **)result = result;
+      result = (void **)result + 1;
+    }
+  } else if (size == 0) {
+    // If size is 0, realloc behaves like free.
+    // The thread must be registered by the call to kmpc_malloc() or
+    // kmpc_calloc() before.
+    // So it should be safe to call __kmp_get_thread(), not
+    // __kmp_entry_thread().
+    KMP_ASSERT(*((void **)ptr - 1));
+    brel(__kmp_get_thread(), *((void **)ptr - 1));
+  } else if (size <= max_size) {
+    // The pointer saved in front of the user block is what bget() returned.
+    result = bgetr(__kmp_entry_thread(), *((void **)ptr - 1),
+                   (bufsize)(size + sizeof(ptr)));
+    if (result != NULL) {
+      *(void **)result = result;
+      result = (void **)result + 1;
+    }
+  }
+  // else: request too large, keep the old block and report failure
+  return result;
+}
+
+// Release a block obtained from the kmpc_* allocation routines.
+// NOTE: the library must have already been initialized by a previous
+// allocate; the call does nothing otherwise, and also for a NULL pointer.
+void kmpc_free(void *ptr) {
+  if (!__kmp_init_serial || ptr == NULL) {
+    return;
+  }
+
+  kmp_info_t *th = __kmp_get_thread();
+  __kmp_bget_dequeue(th); /* Release any queued buffers */
+
+  // the pointer saved just before the user block is the one to free
+  KMP_ASSERT(*((void **)ptr - 1));
+  brel(th, *((void **)ptr - 1));
+}
+
+/* Allocate `size` bytes from the bget pool of thread `th`, tracing entry and
+   exit. Returns whatever bget() returns (NULL on failure). */
+void *___kmp_thread_malloc(kmp_info_t *th, size_t size KMP_SRC_LOC_DECL) {
+  KE_TRACE(30, ("-> __kmp_thread_malloc( %p, %d ) called from %s:%d\n", th,
+                (int)size KMP_SRC_LOC_PARM));
+
+  void *ptr = bget(th, (bufsize)size);
+
+  KE_TRACE(30, ("<- __kmp_thread_malloc() returns %p\n", ptr));
+  return ptr;
+}
+
+/* Allocate a zero-filled array of `nelem` elements of `elsize` bytes each
+   from the bget pool of thread `th`, tracing entry and exit. Returns NULL
+   when the allocation fails or when nelem * elsize does not fit in size_t
+   (the product previously wrapped silently to a smaller request). */
+void *___kmp_thread_calloc(kmp_info_t *th, size_t nelem,
+                           size_t elsize KMP_SRC_LOC_DECL) {
+  void *ptr;
+  KE_TRACE(30, ("-> __kmp_thread_calloc( %p, %d, %d ) called from %s:%d\n", th,
+                (int)nelem, (int)elsize KMP_SRC_LOC_PARM));
+  if (elsize != 0 && nelem > ~(size_t)0 / elsize) {
+    ptr = NULL; // byte count overflows size_t
+  } else {
+    ptr = bgetz(th, (bufsize)(nelem * elsize));
+  }
+  KE_TRACE(30, ("<- __kmp_thread_calloc() returns %p\n", ptr));
+  return ptr;
+}
+
+/* Resize buffer `ptr` of thread `th` to `size` bytes through bgetr(),
+   tracing entry and exit. Returns the new buffer, or NULL on failure. */
+void *___kmp_thread_realloc(kmp_info_t *th, void *ptr,
+                            size_t size KMP_SRC_LOC_DECL) {
+  void *resized;
+
+  KE_TRACE(30, ("-> __kmp_thread_realloc( %p, %p, %d ) called from %s:%d\n", th,
+                ptr, (int)size KMP_SRC_LOC_PARM));
+  resized = bgetr(th, ptr, (bufsize)size);
+  KE_TRACE(30, ("<- __kmp_thread_realloc() returns %p\n", resized));
+  return resized;
+}
+
+/* Release buffer `ptr` on behalf of thread `th`, tracing entry and exit.
+   A NULL pointer is ignored. */
+void ___kmp_thread_free(kmp_info_t *th, void *ptr KMP_SRC_LOC_DECL) {
+  KE_TRACE(30, ("-> __kmp_thread_free( %p, %p ) called from %s:%d\n", th,
+                ptr KMP_SRC_LOC_PARM));
+
+  if (ptr != NULL) {
+    /* First release the buffers other threads queued for th, then ptr. */
+    __kmp_bget_dequeue(th);
+    brel(th, ptr);
+  }
+
+  KE_TRACE(30, ("<- __kmp_thread_free()\n"));
+}
+
+/* OMP 5.0 Memory Management support */
+// Name of the memkind shared library and the handle obtained for it; both
+// are assigned in __kmp_init_memkind().
+static const char *kmp_mk_lib_name;
+static void *h_memkind;
+/* memkind experimental API: */
+// memkind_malloc
+static void *(*kmp_mk_alloc)(void *k, size_t sz);
+// memkind_free
+static void (*kmp_mk_free)(void *kind, void *ptr);
+// memkind_check_available
+static int (*kmp_mk_check)(void *kind);
+// kinds we are going to use
+// (each is the address of the corresponding MEMKIND_* symbol, or NULL)
+static void **mk_default;
+static void **mk_interleave;
+static void **mk_hbw;
+static void **mk_hbw_interleave;
+static void **mk_hbw_preferred;
+static void **mk_hugetlb;
+static void **mk_hbw_hugetlb;
+static void **mk_hbw_preferred_hugetlb;
+
+#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
+// Reset *pkind to NULL when its symbol was found but kmp_mk_check() returns
+// non-zero for the kind (kind not available or error).
+static inline void chk_kind(void ***pkind) {
+  KMP_DEBUG_ASSERT(pkind);
+  // *pkind == NULL means the symbol was not found; nothing to check then
+  if (*pkind != NULL && kmp_mk_check(**pkind) != 0) {
+    *pkind = NULL;
+  }
+}
+#endif
+
+// Try to load the memkind library at run time and resolve the entry points and
+// kinds used by the OpenMP allocators. When the mandatory symbols are present
+// and MEMKIND_DEFAULT is reported as available, __kmp_memkind_available is set
+// to 1 and the optional kinds are probed. In every other case (library or
+// symbol missing, default kind unavailable, or memkind support compiled out)
+// the library handle and all memkind pointers are reset to NULL.
+void __kmp_init_memkind() {
+// as of 2018-07-31 memkind does not support Windows*, exclude it for now
+#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
+  // use of statically linked memkind is problematic, as it depends on libnuma
+  kmp_mk_lib_name = "libmemkind.so";
+  h_memkind = dlopen(kmp_mk_lib_name, RTLD_LAZY);
+  if (h_memkind != NULL) {
+    // Mandatory pieces: the three API functions and the default kind.
+    kmp_mk_check = (int (*)(void *))dlsym(h_memkind, "memkind_check_available");
+    kmp_mk_alloc =
+        (void *(*)(void *, size_t))dlsym(h_memkind, "memkind_malloc");
+    kmp_mk_free = (void (*)(void *, void *))dlsym(h_memkind, "memkind_free");
+    mk_default = (void **)dlsym(h_memkind, "MEMKIND_DEFAULT");
+
+    const bool usable = kmp_mk_check && kmp_mk_alloc && kmp_mk_free &&
+                        mk_default && !kmp_mk_check(*mk_default);
+    if (usable) {
+      __kmp_memkind_available = 1;
+
+      // Optional kinds, probed in a fixed order. Each one is resolved and then
+      // immediately validated; chk_kind() clears it again if it is unusable.
+      const struct {
+        void ***slot;
+        const char *symbol;
+      } optional_kinds[] = {
+          {&mk_interleave, "MEMKIND_INTERLEAVE"},
+          {&mk_hbw, "MEMKIND_HBW"},
+          {&mk_hbw_interleave, "MEMKIND_HBW_INTERLEAVE"},
+          {&mk_hbw_preferred, "MEMKIND_HBW_PREFERRED"},
+          {&mk_hugetlb, "MEMKIND_HUGETLB"},
+          {&mk_hbw_hugetlb, "MEMKIND_HBW_HUGETLB"},
+          {&mk_hbw_preferred_hugetlb, "MEMKIND_HBW_PREFERRED_HUGETLB"},
+      };
+      const size_t n_optional =
+          sizeof(optional_kinds) / sizeof(optional_kinds[0]);
+      for (size_t k = 0; k < n_optional; ++k) {
+        *optional_kinds[k].slot =
+            (void **)dlsym(h_memkind, optional_kinds[k].symbol);
+        chk_kind(optional_kinds[k].slot);
+      }
+
+      KE_TRACE(25, ("__kmp_init_memkind: memkind library initialized\n"));
+      return; // success
+    }
+    dlclose(h_memkind); // failure
+    h_memkind = NULL;
+  }
+#else
+  kmp_mk_lib_name = "";
+  h_memkind = NULL;
+#endif
+  // Shared tail for "memkind not usable": forget everything that may have been
+  // resolved so far.
+  kmp_mk_check = NULL;
+  kmp_mk_alloc = NULL;
+  kmp_mk_free = NULL;
+  mk_default = NULL;
+  mk_interleave = NULL;
+  mk_hbw = NULL;
+  mk_hbw_interleave = NULL;
+  mk_hbw_preferred = NULL;
+  mk_hugetlb = NULL;
+  mk_hbw_hugetlb = NULL;
+  mk_hbw_preferred_hugetlb = NULL;
+}
+
+// Counterpart of __kmp_init_memkind(): closes the memkind library handle if
+// one is open and clears every resolved entry point and kind pointer. Does
+// nothing on builds where memkind support is compiled out.
+void __kmp_fini_memkind() {
+#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
+  if (__kmp_memkind_available) {
+    KE_TRACE(25, ("__kmp_fini_memkind: finalize memkind library\n"));
+  }
+
+  if (h_memkind != NULL) {
+    dlclose(h_memkind);
+    h_memkind = NULL;
+  }
+
+  // API entry points
+  kmp_mk_check = NULL;
+  kmp_mk_alloc = NULL;
+  kmp_mk_free = NULL;
+
+  // kinds
+  mk_default = NULL;
+  mk_interleave = NULL;
+  mk_hbw = NULL;
+  mk_hbw_interleave = NULL;
+  mk_hbw_preferred = NULL;
+  mk_hugetlb = NULL;
+  mk_hbw_hugetlb = NULL;
+  mk_hbw_preferred_hugetlb = NULL;
+#endif
+}
+
+// Create a custom allocator for memory space `ms`, configured by the first
+// `ntraits` entries of `traits`.
+//
+// The allocator descriptor is obtained from __kmp_allocate() (zero-filled), the
+// traits are copied into it, the fallback settings are normalized and, when the
+// memkind library is in use, the memkind kind is selected.
+//
+// Returns the new allocator handle, or omp_null_allocator when the
+// high-bandwidth memory space is requested but cannot be served (no memkind
+// library, or no suitable HBW kind); the descriptor is freed in that case.
+omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms,
+                                             int ntraits,
+                                             omp_alloctrait_t traits[]) {
+  // OpenMP 5.0 only allows predefined memspaces
+  KMP_DEBUG_ASSERT(ms == omp_default_mem_space || ms == omp_low_lat_mem_space ||
+                   ms == omp_large_cap_mem_space || ms == omp_const_mem_space ||
+                   ms == omp_high_bw_mem_space);
+
+  kmp_allocator_t *al =
+      (kmp_allocator_t *)__kmp_allocate(sizeof(kmp_allocator_t)); // zeroed
+  al->memspace = ms; // not used currently
+
+  // Copy the user supplied traits into the descriptor.
+  for (int t = 0; t < ntraits; ++t) {
+    omp_alloctrait_t *trait = &traits[t];
+    switch (trait->key) {
+    case OMP_ATK_THREADMODEL:
+    case OMP_ATK_ACCESS:
+    case OMP_ATK_PINNED:
+      // accepted, but nothing is recorded for these traits
+      break;
+    case OMP_ATK_ALIGNMENT:
+      al->alignment = trait->value;
+      KMP_ASSERT(IS_POWER_OF_TWO(al->alignment));
+      break;
+    case OMP_ATK_POOL_SIZE:
+      al->pool_size = trait->value;
+      break;
+    case OMP_ATK_FALLBACK:
+      al->fb = (omp_alloctrait_value_t)trait->value;
+      KMP_DEBUG_ASSERT(
+          al->fb == OMP_ATV_DEFAULT_MEM_FB || al->fb == OMP_ATV_NULL_FB ||
+          al->fb == OMP_ATV_ABORT_FB || al->fb == OMP_ATV_ALLOCATOR_FB);
+      break;
+    case OMP_ATK_FB_DATA:
+      al->fb_data = RCAST(kmp_allocator_t *, trait->value);
+      break;
+    case OMP_ATK_PARTITION:
+      // the raw trait value is parked here and translated to a kind below
+      al->memkind = RCAST(void **, trait->value);
+      break;
+    default:
+      KMP_ASSERT2(0, "Unexpected allocator trait");
+    }
+  }
+
+  // Normalize the fallback: an unset fallback means "default memory".
+  if (al->fb == 0) {
+    al->fb = OMP_ATV_DEFAULT_MEM_FB;
+  }
+  if (al->fb == OMP_ATV_DEFAULT_MEM_FB) {
+    al->fb_data = (kmp_allocator_t *)omp_default_mem_alloc;
+  } else if (al->fb == OMP_ATV_ALLOCATOR_FB) {
+    KMP_ASSERT(al->fb_data != NULL);
+  }
+
+  if (!__kmp_memkind_available) {
+    if (ms == omp_high_bw_mem_space) {
+      // cannot detect HBW memory presence without memkind library
+      __kmp_free(al);
+      return omp_null_allocator;
+    }
+    return (omp_allocator_handle_t)al;
+  }
+
+  // Let's use memkind library if available
+  const bool interleaved = (al->memkind == (void *)OMP_ATV_INTERLEAVED);
+
+  if (ms != omp_high_bw_mem_space) {
+    al->memkind = (interleaved && mk_interleave) ? mk_interleave : mk_default;
+    return (omp_allocator_handle_t)al;
+  }
+
+  if (interleaved && mk_hbw_interleave) {
+    al->memkind = mk_hbw_interleave;
+  } else if (mk_hbw_preferred) {
+    // AC: do not try to use MEMKIND_HBW for now, because memkind library
+    // cannot reliably detect exhaustion of HBW memory.
+    // It could be possible using hbw_verify_memory_region() but memkind
+    // manual says: "Using this function in production code may result in
+    // serious performance penalty".
+    al->memkind = mk_hbw_preferred;
+  } else {
+    // HBW is requested but not available --> return NULL allocator
+    __kmp_free(al);
+    return omp_null_allocator;
+  }
+  return (omp_allocator_handle_t)al;
+}
+
+// Release an allocator created by __kmpc_init_allocator(). Only handles that
+// compare greater than kmp_max_mem_alloc are freed; any other handle is left
+// alone.
+void __kmpc_destroy_allocator(int gtid, omp_allocator_handle_t allocator) {
+  const bool is_custom = allocator > kmp_max_mem_alloc;
+  if (is_custom) {
+    __kmp_free(allocator);
+  }
+}
+
+// Set the default allocator of thread `gtid`. Passing omp_null_allocator
+// selects omp_default_mem_alloc.
+void __kmpc_set_default_allocator(int gtid, omp_allocator_handle_t allocator) {
+  omp_allocator_handle_t chosen = allocator;
+  if (chosen == omp_null_allocator) {
+    chosen = omp_default_mem_alloc;
+  }
+  __kmp_threads[gtid]->th.th_def_allocator = chosen;
+}
+
+// Return the default allocator currently recorded for thread `gtid`.
+omp_allocator_handle_t __kmpc_get_default_allocator(int gtid) {
+  omp_allocator_handle_t current = __kmp_threads[gtid]->th.th_def_allocator;
+  return current;
+}
+
+// Bookkeeping header used by __kmpc_alloc()/__kmpc_free(). __kmpc_alloc()
+// stores one of these immediately in front of the pointer it returns, and
+// __kmpc_free() reads it back to find the real allocation and its allocator.
+typedef struct kmp_mem_desc { // Memory block descriptor
+  void *ptr_alloc; // Pointer returned by allocator
+  size_t size_a; // Size of allocated memory block (initial+descriptor+align)
+  void *ptr_align; // Pointer to aligned memory, returned
+  kmp_allocator_t *allocator; // allocator the block was actually taken from
+} kmp_mem_desc_t;
+// Alignment __kmpc_alloc() uses when the allocator does not request its own.
+static int alignment = sizeof(void *); // let's align to pointer size
+
+/* Allocate `size` bytes through `allocator` on behalf of thread `gtid`.
+
+   omp_null_allocator selects the thread's default allocator. The request is
+   enlarged by the size of a kmp_mem_desc_t plus the alignment (the allocator's
+   own alignment for custom allocators that set one, pointer size otherwise);
+   the descriptor is written immediately in front of the returned pointer so
+   that __kmpc_free() can recover the original allocation.
+
+   Memory comes from the memkind library when it is available and from
+   __kmp_thread_malloc() otherwise. Custom allocators with a pool size account
+   for the enlarged request in pool_used; when the pool is exhausted (or, with
+   memkind, when the kind returns NULL) the allocator's fallback trait decides
+   what happens: default memory, abort, another allocator, or NULL.
+
+   Returns the aligned pointer, or NULL when no memory could be obtained. */
+void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) {
+  void *ptr = NULL;
+  kmp_allocator_t *al;
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+  if (allocator == omp_null_allocator)
+    allocator = __kmp_threads[gtid]->th.th_def_allocator;
+
+  KE_TRACE(25, ("__kmpc_alloc: T#%d (%d, %p)\n", gtid, (int)size, allocator));
+  al = RCAST(kmp_allocator_t *, CCAST(omp_allocator_handle_t, allocator));
+
+  int sz_desc = sizeof(kmp_mem_desc_t);
+  kmp_mem_desc_t desc;
+  kmp_uintptr_t addr; // address returned by allocator
+  kmp_uintptr_t addr_align; // address to return to caller
+  kmp_uintptr_t addr_descr; // address of memory block descriptor
+  int align = alignment; // default alignment
+  // `al` is only a real pointer for custom allocators, so check the handle
+  // before dereferencing it
+  if (allocator > kmp_max_mem_alloc && al->alignment > 0) {
+    align = al->alignment; // alignment requested by user
+  }
+  desc.size_a = size + sz_desc + align;
+
+  // NOTE(review): in the "pool has enough space" branches below pool_used
+  // stays charged when the allocation itself fails or is served by a fallback
+  // allocator - confirm whether that is intended.
+  if (__kmp_memkind_available) {
+    if (allocator < kmp_max_mem_alloc) {
+      // pre-defined allocator
+      if (allocator == omp_high_bw_mem_alloc && mk_hbw_preferred) {
+        ptr = kmp_mk_alloc(*mk_hbw_preferred, desc.size_a);
+      } else {
+        ptr = kmp_mk_alloc(*mk_default, desc.size_a);
+      }
+    } else if (al->pool_size > 0) {
+      // custom allocator with pool size requested
+      kmp_uint64 used =
+          KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, desc.size_a);
+      if (used + desc.size_a > al->pool_size) {
+        // not enough space, need to go fallback path
+        // Roll the reservation back. Negate as a signed 64-bit value:
+        // negating the unsigned size_t first would zero-extend on 32-bit
+        // targets and increase pool_used instead of decreasing it.
+        KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used,
+                            -(kmp_int64)desc.size_a);
+        if (al->fb == OMP_ATV_DEFAULT_MEM_FB) {
+          al = (kmp_allocator_t *)omp_default_mem_alloc;
+          ptr = kmp_mk_alloc(*mk_default, desc.size_a);
+        } else if (al->fb == OMP_ATV_ABORT_FB) {
+          KMP_ASSERT(0); // abort fallback requested
+        } else if (al->fb == OMP_ATV_ALLOCATOR_FB) {
+          KMP_ASSERT(al != al->fb_data);
+          al = al->fb_data;
+          return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al);
+        } // else ptr == NULL;
+      } else {
+        // pool has enough space
+        ptr = kmp_mk_alloc(*al->memkind, desc.size_a);
+        if (ptr == NULL) {
+          if (al->fb == OMP_ATV_DEFAULT_MEM_FB) {
+            al = (kmp_allocator_t *)omp_default_mem_alloc;
+            ptr = kmp_mk_alloc(*mk_default, desc.size_a);
+          } else if (al->fb == OMP_ATV_ABORT_FB) {
+            KMP_ASSERT(0); // abort fallback requested
+          } else if (al->fb == OMP_ATV_ALLOCATOR_FB) {
+            KMP_ASSERT(al != al->fb_data);
+            al = al->fb_data;
+            return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al);
+          }
+        }
+      }
+    } else {
+      // custom allocator, pool size not requested
+      ptr = kmp_mk_alloc(*al->memkind, desc.size_a);
+      if (ptr == NULL) {
+        if (al->fb == OMP_ATV_DEFAULT_MEM_FB) {
+          al = (kmp_allocator_t *)omp_default_mem_alloc;
+          ptr = kmp_mk_alloc(*mk_default, desc.size_a);
+        } else if (al->fb == OMP_ATV_ABORT_FB) {
+          KMP_ASSERT(0); // abort fallback requested
+        } else if (al->fb == OMP_ATV_ALLOCATOR_FB) {
+          KMP_ASSERT(al != al->fb_data);
+          al = al->fb_data;
+          return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al);
+        }
+      }
+    }
+  } else if (allocator < kmp_max_mem_alloc) {
+    // pre-defined allocator
+    if (allocator == omp_high_bw_mem_alloc) {
+      // ptr = NULL;
+    } else {
+      ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
+    }
+  } else if (al->pool_size > 0) {
+    // custom allocator with pool size requested
+    kmp_uint64 used =
+        KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, desc.size_a);
+    if (used + desc.size_a > al->pool_size) {
+      // not enough space, need to go fallback path
+      // Roll the reservation back (signed negation, see above).
+      KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used,
+                          -(kmp_int64)desc.size_a);
+      if (al->fb == OMP_ATV_DEFAULT_MEM_FB) {
+        al = (kmp_allocator_t *)omp_default_mem_alloc;
+        ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
+      } else if (al->fb == OMP_ATV_ABORT_FB) {
+        KMP_ASSERT(0); // abort fallback requested
+      } else if (al->fb == OMP_ATV_ALLOCATOR_FB) {
+        KMP_ASSERT(al != al->fb_data);
+        al = al->fb_data;
+        return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al);
+      } // else ptr == NULL;
+    } else {
+      // pool has enough space
+      ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
+      if (ptr == NULL && al->fb == OMP_ATV_ABORT_FB) {
+        KMP_ASSERT(0); // abort fallback requested
+      } // no sense to look for another fallback because of same internal alloc
+    }
+  } else {
+    // custom allocator, pool size not requested
+    ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
+    if (ptr == NULL && al->fb == OMP_ATV_ABORT_FB) {
+      KMP_ASSERT(0); // abort fallback requested
+    } // no sense to look for another fallback because of same internal alloc
+  }
+  // size_a is a size_t; cast it so the argument matches the %d conversion
+  KE_TRACE(10, ("__kmpc_alloc: T#%d %p=alloc(%d)\n", gtid, ptr,
+                (int)desc.size_a));
+  if (ptr == NULL)
+    return NULL;
+
+  // Round up past the descriptor to the requested alignment; the descriptor
+  // then sits directly below the aligned address.
+  addr = (kmp_uintptr_t)ptr;
+  addr_align = (addr + sz_desc + align - 1) & ~(align - 1);
+  addr_descr = addr_align - sz_desc;
+
+  desc.ptr_alloc = ptr;
+  desc.ptr_align = (void *)addr_align;
+  desc.allocator = al; // the allocator that really produced the memory
+  *((kmp_mem_desc_t *)addr_descr) = desc; // save descriptor contents
+  KMP_MB();
+
+  KE_TRACE(25, ("__kmpc_alloc returns %p, T#%d\n", desc.ptr_align, gtid));
+  return desc.ptr_align;
+}
+
+/* Release memory obtained from __kmpc_alloc().
+
+   `ptr` may be NULL, in which case nothing happens. The descriptor stored in
+   front of `ptr` supplies the original allocation and the allocator that
+   really produced it; that allocator - not the `allocator` argument - decides
+   how the memory is released. `allocator` is only used for tracing and for a
+   debug consistency check. For custom allocators with a pool size the
+   block's size is subtracted from pool_used. */
+void __kmpc_free(int gtid, void *ptr, const omp_allocator_handle_t allocator) {
+  KE_TRACE(25, ("__kmpc_free: T#%d free(%p,%p)\n", gtid, ptr, allocator));
+  if (ptr == NULL)
+    return;
+
+  kmp_allocator_t *al;
+  omp_allocator_handle_t oal;
+  al = RCAST(kmp_allocator_t *, CCAST(omp_allocator_handle_t, allocator));
+  kmp_mem_desc_t desc;
+  kmp_uintptr_t addr_align; // address to return to caller
+  kmp_uintptr_t addr_descr; // address of memory block descriptor
+
+  addr_align = (kmp_uintptr_t)ptr;
+  addr_descr = addr_align - sizeof(kmp_mem_desc_t);
+  desc = *((kmp_mem_desc_t *)addr_descr); // read descriptor
+
+  KMP_DEBUG_ASSERT(desc.ptr_align == ptr);
+  if (allocator) {
+    // The block must come from the given allocator or from its fallback.
+    // Only custom allocators are real pointers, so fb_data is inspected only
+    // for handles above kmp_max_mem_alloc.
+    KMP_DEBUG_ASSERT(desc.allocator == al ||
+                     (allocator > kmp_max_mem_alloc &&
+                      desc.allocator == al->fb_data));
+  }
+  al = desc.allocator;
+  oal = (omp_allocator_handle_t)al; // cast to void* for comparisons
+  KMP_DEBUG_ASSERT(al);
+
+  if (__kmp_memkind_available) {
+    if (oal < kmp_max_mem_alloc) {
+      // pre-defined allocator
+      if (oal == omp_high_bw_mem_alloc && mk_hbw_preferred) {
+        kmp_mk_free(*mk_hbw_preferred, desc.ptr_alloc);
+      } else {
+        kmp_mk_free(*mk_default, desc.ptr_alloc);
+      }
+    } else {
+      if (al->pool_size > 0) { // custom allocator with pool size requested
+        // Negate as a signed 64-bit value: negating the unsigned size_t first
+        // would zero-extend on 32-bit targets and increase pool_used.
+        kmp_uint64 used = KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used,
+                                              -(kmp_int64)desc.size_a);
+        (void)used; // to suppress compiler warning
+        KMP_DEBUG_ASSERT(used >= desc.size_a);
+      }
+      kmp_mk_free(*al->memkind, desc.ptr_alloc);
+    }
+  } else {
+    if (oal > kmp_max_mem_alloc && al->pool_size > 0) {
+      // signed negation, see above
+      kmp_uint64 used = KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used,
+                                            -(kmp_int64)desc.size_a);
+      (void)used; // to suppress compiler warning
+      KMP_DEBUG_ASSERT(used >= desc.size_a);
+    }
+    __kmp_thread_free(__kmp_thread_from_gtid(gtid), desc.ptr_alloc);
+  }
+  KE_TRACE(10, ("__kmpc_free: T#%d freed %p (%p)\n", gtid, desc.ptr_alloc,
+                allocator));
+}
+
+/* If LEAK_MEMORY is defined, __kmp_free() will *not* free memory. It causes
+   memory leaks, but it may be useful for debugging memory corruptions, use of
+   freed pointers, etc. */
+/* #define LEAK_MEMORY */
+// Header written immediately in front of every pointer handed out by
+// ___kmp_allocate_align() and ___kmp_fast_allocate(). The fast-memory code
+// reuses two fields for its own purposes: ptr_aligned holds the allocating
+// thread, and size_allocated holds the length of a thread's "other" free list
+// in the list's head block.
+struct kmp_mem_descr { // Memory block descriptor.
+  void *ptr_allocated; // Pointer returned by malloc(), subject for free().
+  size_t size_allocated; // Size of allocated memory block.
+  void *ptr_aligned; // Pointer to aligned memory, to be used by client code.
+  size_t size_aligned; // Size of aligned memory block.
+};
+typedef struct kmp_mem_descr kmp_mem_descr_t;
+
+/* Allocate `size` bytes on the requested `alignment` boundary and fill them
+   with 0x00. NULL is NEVER returned: KMP_FATAL(OutOfHeapMemory) is raised when
+   the underlying allocation fails. Memory allocated by this routine must be
+   released with __kmp_free! */
+static void *___kmp_allocate_align(size_t size,
+                                   size_t alignment KMP_SRC_LOC_DECL) {
+  /* A block bigger than requested is obtained so that a properly aligned
+     pointer can be carved out of it. The original pointer and the size of the
+     whole block are saved in a descriptor placed just before the aligned
+     pointer; __kmp_free() needs them because it must release the original
+     pointer, not the aligned one.
+
+          +---------+------------+-----------------------------------+---------+
+          | padding | descriptor |           aligned block           | padding |
+          +---------+------------+-----------------------------------+---------+
+          ^                      ^
+          |                      |
+          |                      +- Aligned pointer returned to caller
+          +- Pointer returned by malloc()
+
+      The aligned block is filled with zeros; in the debugging library the
+      paddings are filled with 0xEF. */
+
+  KE_TRACE(25, ("-> ___kmp_allocate_align( %d, %d ) called from %s:%d\n",
+                (int)size, (int)alignment KMP_SRC_LOC_PARM));
+
+  // Alignment should not be too big.
+  KMP_DEBUG_ASSERT(alignment < 32 * 1024);
+  // Make sure kmp_uintptr_t is enough to store addresses.
+  KMP_DEBUG_ASSERT(sizeof(void *) <= sizeof(kmp_uintptr_t));
+
+  // Request room for the payload, the descriptor and the alignment slack.
+  kmp_mem_descr_t descr;
+  descr.size_aligned = size;
+  descr.size_allocated = size + sizeof(kmp_mem_descr_t) + alignment;
+
+#if KMP_DEBUG
+  descr.ptr_allocated = _malloc_src_loc(descr.size_allocated, _file_, _line_);
+#else
+  descr.ptr_allocated = malloc_src_loc(descr.size_allocated KMP_SRC_LOC_PARM);
+#endif
+  KE_TRACE(10, ("   malloc( %d ) returned %p\n", (int)descr.size_allocated,
+                descr.ptr_allocated));
+  if (descr.ptr_allocated == NULL) {
+    KMP_FATAL(OutOfHeapMemory);
+  }
+
+  // Skip over the descriptor and round down to the alignment boundary; the
+  // descriptor then lives directly below the aligned address.
+  const kmp_uintptr_t addr_allocated = (kmp_uintptr_t)descr.ptr_allocated;
+  const kmp_uintptr_t addr_aligned =
+      (addr_allocated + sizeof(kmp_mem_descr_t) + alignment) & ~(alignment - 1);
+  const kmp_uintptr_t addr_descr = addr_aligned - sizeof(kmp_mem_descr_t);
+  descr.ptr_aligned = (void *)addr_aligned;
+
+  KE_TRACE(26, ("   ___kmp_allocate_align: "
+                "ptr_allocated=%p, size_allocated=%d, "
+                "ptr_aligned=%p, size_aligned=%d\n",
+                descr.ptr_allocated, (int)descr.size_allocated,
+                descr.ptr_aligned, (int)descr.size_aligned));
+
+  KMP_DEBUG_ASSERT(addr_allocated <= addr_descr);
+  KMP_DEBUG_ASSERT(addr_descr + sizeof(kmp_mem_descr_t) == addr_aligned);
+  KMP_DEBUG_ASSERT(addr_aligned + descr.size_aligned <=
+                   addr_allocated + descr.size_allocated);
+  KMP_DEBUG_ASSERT(addr_aligned % alignment == 0);
+
+#ifdef KMP_DEBUG
+  // Fill the whole allocated memory block with 0xEF.
+  memset(descr.ptr_allocated, 0xEF, descr.size_allocated);
+#endif
+  // Fill the aligned memory block (the part handed to the caller) with 0x00.
+  // Do not put this filling under the KMP_DEBUG condition! Many callers expect
+  // zeroed memory. (Padding bytes remain filled with 0xEF in the debugging
+  // library.)
+  memset(descr.ptr_aligned, 0x00, descr.size_aligned);
+
+  // Store the descriptor last, after the fills above.
+  *((kmp_mem_descr_t *)addr_descr) = descr;
+
+  KMP_MB();
+
+  KE_TRACE(25, ("<- ___kmp_allocate_align() returns %p\n", descr.ptr_aligned));
+  return descr.ptr_aligned;
+} // func ___kmp_allocate_align
+
+/* Allocate zero-filled memory aligned to __kmp_align_alloc by delegating to
+   ___kmp_allocate_align(), which never returns NULL.
+   Do not call this func directly! Use __kmp_allocate macro instead.
+   Must use __kmp_free when freeing memory allocated by this routine! */
+void *___kmp_allocate(size_t size KMP_SRC_LOC_DECL) {
+  KE_TRACE(25, ("-> __kmp_allocate( %d ) called from %s:%d\n",
+                (int)size KMP_SRC_LOC_PARM));
+
+  void *const block =
+      ___kmp_allocate_align(size, __kmp_align_alloc KMP_SRC_LOC_PARM);
+
+  KE_TRACE(25, ("<- __kmp_allocate() returns %p\n", block));
+  return block;
+} // func ___kmp_allocate
+
+/* Allocate zero-filled memory aligned to an 8 KB boundary (the page size
+   assumed here) by delegating to ___kmp_allocate_align(), which never returns
+   NULL.
+   Do not call this func directly! Use __kmp_page_allocate macro instead.
+   Must use __kmp_free when freeing memory allocated by this routine! */
+void *___kmp_page_allocate(size_t size KMP_SRC_LOC_DECL) {
+  const int page_size = 8 * 1024;
+
+  KE_TRACE(25, ("-> __kmp_page_allocate( %d ) called from %s:%d\n",
+                (int)size KMP_SRC_LOC_PARM));
+
+  void *const block = ___kmp_allocate_align(size, page_size KMP_SRC_LOC_PARM);
+
+  KE_TRACE(25, ("<- __kmp_page_allocate( %d ) returns %p\n", (int)size, block));
+  return block;
+} // ___kmp_page_allocate
+
+/* Free memory allocated by __kmp_allocate() and __kmp_page_allocate().
+   `ptr` must not be NULL. The descriptor stored in front of `ptr` supplies the
+   pointer that is actually released. In debug mode the whole memory block is
+   filled with 0xEF before the call to free(). When LEAK_MEMORY is defined the
+   block is not released at all. */
+void ___kmp_free(void *ptr KMP_SRC_LOC_DECL) {
+  KE_TRACE(25,
+           ("-> __kmp_free( %p ) called from %s:%d\n", ptr KMP_SRC_LOC_PARM));
+  KMP_ASSERT(ptr != NULL);
+
+  // The descriptor sits immediately below the pointer given to the caller.
+  const kmp_uintptr_t descr_pos =
+      (kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t);
+  kmp_mem_descr_t descr = *(kmp_mem_descr_t *)descr_pos;
+
+  KE_TRACE(26, ("   __kmp_free:     "
+                "ptr_allocated=%p, size_allocated=%d, "
+                "ptr_aligned=%p, size_aligned=%d\n",
+                descr.ptr_allocated, (int)descr.size_allocated,
+                descr.ptr_aligned, (int)descr.size_aligned));
+
+  // Sanity checks on the descriptor (debug builds only).
+  kmp_uintptr_t addr_allocated = (kmp_uintptr_t)descr.ptr_allocated;
+  kmp_uintptr_t addr_aligned = (kmp_uintptr_t)descr.ptr_aligned;
+  KMP_DEBUG_ASSERT(addr_aligned % CACHE_LINE == 0);
+  KMP_DEBUG_ASSERT(descr.ptr_aligned == ptr);
+  KMP_DEBUG_ASSERT(addr_allocated + sizeof(kmp_mem_descr_t) <= addr_aligned);
+  KMP_DEBUG_ASSERT(descr.size_aligned < descr.size_allocated);
+  KMP_DEBUG_ASSERT(addr_aligned + descr.size_aligned <=
+                   addr_allocated + descr.size_allocated);
+
+#ifdef KMP_DEBUG
+  // Fill memory block with 0xEF, it helps catch using freed memory.
+  memset(descr.ptr_allocated, 0xEF, descr.size_allocated);
+#endif
+
+#ifndef LEAK_MEMORY
+  KE_TRACE(10, ("   free( %p )\n", descr.ptr_allocated));
+#ifdef KMP_DEBUG
+  _free_src_loc(descr.ptr_allocated, _file_, _line_);
+#else
+  free_src_loc(descr.ptr_allocated KMP_SRC_LOC_PARM);
+#endif
+#endif
+  KMP_MB();
+  KE_TRACE(25, ("<- __kmp_free() returns\n"));
+} // func ___kmp_free
+
+#if USE_FAST_MEMORY == 3
+// Allocate fast memory by first scanning the thread's free lists
+// If a chunk the right size exists, grab it off the free list.
+// Otherwise allocate normally using kmp_thread_malloc.
+
+// Maximum number of blocks a thread keeps on its "other" free list (blocks
+// allocated by a different thread) before the list is handed back to the
+// owning thread in ___kmp_fast_free().
+// AC: How to choose the limit? Just get 16 for now...
+#define KMP_FREE_LIST_LIMIT 16
+
+// Size unit, in bytes, of the fast-memory buckets.
+// Always use 128 bytes for determining buckets for caching memory blocks
+#define DCACHE_LINE 128
+
+// Return a block of at least `size` bytes for thread `this_thr`.
+//
+// The size is rounded up to one of four buckets (2, 4, 16 or 64 units of
+// DCACHE_LINE bytes). For a bucketed size the thread's own no-sync free list
+// is tried first, then its sync free list (filled by other threads); the sync
+// list is detached atomically and its remainder becomes the no-sync list.
+// Requests above 64 units, and bucketed requests that find both lists empty,
+// are served by bget().
+//
+// Every freshly allocated block is preceded by a kmp_mem_descr_t that records
+// the bget() pointer, the allocating thread (in ptr_aligned) and the rounded
+// size. A failed bget() raises KMP_FATAL(OutOfHeapMemory).
+void *___kmp_fast_allocate(kmp_info_t *this_thr, size_t size KMP_SRC_LOC_DECL) {
+  void *ptr;
+  int num_lines;
+  int idx;
+  int index;
+  void *alloc_ptr;
+  size_t alloc_size;
+  kmp_mem_descr_t *descr;
+
+  KE_TRACE(25, ("-> __kmp_fast_allocate( T#%d, %d ) called from %s:%d\n",
+                __kmp_gtid_from_thread(this_thr), (int)size KMP_SRC_LOC_PARM));
+
+  // Pick the bucket: idx is shifted right by two bits per step, so each test
+  // covers a four times larger range than the previous one.
+  num_lines = (size + DCACHE_LINE - 1) / DCACHE_LINE;
+  idx = num_lines - 1;
+  KMP_DEBUG_ASSERT(idx >= 0);
+  if (idx < 2) {
+    index = 0; // idx is [ 0, 1 ], use first free list
+    num_lines = 2; // 1, 2 cache lines or less than cache line
+  } else if ((idx >>= 2) == 0) {
+    index = 1; // idx is [ 2, 3 ], use second free list
+    num_lines = 4; // 3, 4 cache lines
+  } else if ((idx >>= 2) == 0) {
+    index = 2; // idx is [ 4, 15 ], use third free list
+    num_lines = 16; // 5, 6, ..., 16 cache lines
+  } else if ((idx >>= 2) == 0) {
+    index = 3; // idx is [ 16, 63 ], use fourth free list
+    num_lines = 64; // 17, 18, ..., 64 cache lines
+  } else {
+    goto alloc_call; // 65 or more cache lines ( > 8KB ), don't use free lists
+  }
+
+  ptr = this_thr->th.th_free_lists[index].th_free_list_self;
+  if (ptr != NULL) {
+    // pop the head of no-sync free list
+    this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
+    KMP_DEBUG_ASSERT(
+        this_thr ==
+        ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t)))
+            ->ptr_aligned);
+    goto end;
+  }
+  ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync);
+  if (ptr != NULL) {
+    // no-sync free list is empty, use sync free list (filled in by other
+    // threads only)
+    // pop the head of the sync free list, push NULL instead
+    while (!KMP_COMPARE_AND_STORE_PTR(
+        &this_thr->th.th_free_lists[index].th_free_list_sync, ptr, nullptr)) {
+      KMP_CPU_PAUSE();
+      ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync);
+    }
+    // push the rest of chain into no-sync free list (can be NULL if there was
+    // the only block)
+    this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
+    KMP_DEBUG_ASSERT(
+        this_thr ==
+        ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t)))
+            ->ptr_aligned);
+    goto end;
+  }
+
+alloc_call:
+  // haven't found block in the free lists, thus allocate it
+  size = num_lines * DCACHE_LINE;
+
+  alloc_size = size + sizeof(kmp_mem_descr_t) + DCACHE_LINE;
+  // alloc_size is a size_t; cast it so the argument matches the %d conversion
+  KE_TRACE(25, ("__kmp_fast_allocate: T#%d Calling __kmp_thread_malloc with "
+                "alloc_size %d\n",
+                __kmp_gtid_from_thread(this_thr), (int)alloc_size));
+  alloc_ptr = bget(this_thr, (bufsize)alloc_size);
+  if (alloc_ptr == NULL) {
+    // Without this check the descriptor below would be written through an
+    // address derived from NULL; fail the same way ___kmp_allocate_align()
+    // does.
+    KMP_FATAL(OutOfHeapMemory);
+  }
+
+  // align ptr to DCACHE_LINE
+  ptr = (void *)((((kmp_uintptr_t)alloc_ptr) + sizeof(kmp_mem_descr_t) +
+                  DCACHE_LINE) &
+                 ~(DCACHE_LINE - 1));
+  descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t));
+
+  descr->ptr_allocated = alloc_ptr; // remember allocated pointer
+  // we don't need size_allocated
+  descr->ptr_aligned = (void *)this_thr; // remember allocating thread
+  // (it is already saved in bget buffer,
+  // but we may want to use another allocator in future)
+  descr->size_aligned = size;
+
+end:
+  KE_TRACE(25, ("<- __kmp_fast_allocate( T#%d ) returns %p\n",
+                __kmp_gtid_from_thread(this_thr), ptr));
+  return ptr;
+} // func __kmp_fast_allocate
+
+// Free fast memory and place it on the thread's free list if it is of
+// the correct size.
+//
+// `ptr` must not be NULL and must come from ___kmp_fast_allocate(). The
+// descriptor in front of `ptr` gives the block's rounded size and the thread
+// that allocated it:
+//  - blocks larger than 64 units of DCACHE_LINE are released with brel();
+//  - a block allocated by `this_thr` is pushed onto its no-sync free list;
+//  - a block allocated by another thread is collected on this thread's
+//    "other" list, whose head keeps the list length in size_allocated. When
+//    the list belongs to a different owner, or would grow beyond
+//    KMP_FREE_LIST_LIMIT, the whole list is first pushed onto the owner's sync
+//    free list with a compare-and-store loop and a new list is started.
+void ___kmp_fast_free(kmp_info_t *this_thr, void *ptr KMP_SRC_LOC_DECL) {
+  kmp_mem_descr_t *descr;
+  kmp_info_t *alloc_thr;
+  size_t size;
+  size_t idx;
+  int index;
+
+  KE_TRACE(25, ("-> __kmp_fast_free( T#%d, %p ) called from %s:%d\n",
+                __kmp_gtid_from_thread(this_thr), ptr KMP_SRC_LOC_PARM));
+  KMP_ASSERT(ptr != NULL);
+
+  descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t));
+
+  KE_TRACE(26, ("   __kmp_fast_free:     size_aligned=%d\n",
+                (int)descr->size_aligned));
+
+  size = descr->size_aligned; // 2, 4, 16, 64, 65, 66, ... cache lines
+
+  // Map the size back to its bucket; idx walks through the bucket sizes.
+  idx = DCACHE_LINE * 2; // 2 cache lines is minimal size of block
+  if (idx == size) {
+    index = 0; // 2 cache lines
+  } else if ((idx <<= 1) == size) {
+    index = 1; // 4 cache lines
+  } else if ((idx <<= 2) == size) {
+    index = 2; // 16 cache lines
+  } else if ((idx <<= 2) == size) {
+    index = 3; // 64 cache lines
+  } else {
+    KMP_DEBUG_ASSERT(size > DCACHE_LINE * 64);
+    goto free_call; // 65 or more cache lines ( > 8KB )
+  }
+
+  alloc_thr = (kmp_info_t *)descr->ptr_aligned; // get thread owning the block
+  if (alloc_thr == this_thr) {
+    // push block to self no-sync free list, linking previous head (LIFO)
+    *((void **)ptr) = this_thr->th.th_free_lists[index].th_free_list_self;
+    this_thr->th.th_free_lists[index].th_free_list_self = ptr;
+  } else {
+    void *head = this_thr->th.th_free_lists[index].th_free_list_other;
+    if (head == NULL) {
+      // Create new free list
+      this_thr->th.th_free_lists[index].th_free_list_other = ptr;
+      *((void **)ptr) = NULL; // mark the tail of the list
+      descr->size_allocated = (size_t)1; // head of the list keeps its length
+    } else {
+      // need to check the existing "other" list's owner thread and queue size
+      kmp_mem_descr_t *dsc =
+          (kmp_mem_descr_t *)((char *)head - sizeof(kmp_mem_descr_t));
+      // allocating thread, same for all queue nodes
+      kmp_info_t *q_th = (kmp_info_t *)(dsc->ptr_aligned);
+      size_t q_sz =
+          dsc->size_allocated + 1; // new size in case we add current task
+      if (q_th == alloc_thr && q_sz <= KMP_FREE_LIST_LIMIT) {
+        // we can add current task to "other" list, no sync needed
+        *((void **)ptr) = head;
+        descr->size_allocated = q_sz;
+        this_thr->th.th_free_lists[index].th_free_list_other = ptr;
+      } else {
+        // either queue blocks owner is changing or size limit exceeded
+        // return old queue to allocating thread (q_th) synchronously,
+        // and start new list for alloc_thr's tasks
+        void *old_ptr;
+        void *tail = head;
+        void *next = *((void **)head);
+        while (next != NULL) {
+          KMP_DEBUG_ASSERT(
+              // queue size should decrease by 1 each step through the list
+              ((kmp_mem_descr_t *)((char *)next - sizeof(kmp_mem_descr_t)))
+                      ->size_allocated +
+                  1 ==
+              ((kmp_mem_descr_t *)((char *)tail - sizeof(kmp_mem_descr_t)))
+                  ->size_allocated);
+          tail = next; // remember tail node
+          next = *((void **)next);
+        }
+        KMP_DEBUG_ASSERT(q_th != NULL);
+        // push block to owner's sync free list
+        old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync);
+        /* the next pointer must be set before setting free_list to ptr to avoid
+           exposing a broken list to other threads, even for an instant. */
+        *((void **)tail) = old_ptr;
+
+        while (!KMP_COMPARE_AND_STORE_PTR(
+            &q_th->th.th_free_lists[index].th_free_list_sync, old_ptr, head)) {
+          KMP_CPU_PAUSE();
+          old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync);
+          *((void **)tail) = old_ptr;
+        }
+
+        // start new list of not-self tasks
+        this_thr->th.th_free_lists[index].th_free_list_other = ptr;
+        *((void **)ptr) = NULL;
+        descr->size_allocated = (size_t)1; // head of queue keeps its length
+      }
+    }
+  }
+  goto end;
+
+free_call:
+  // size is a size_t; cast it so the argument matches the %d conversion
+  KE_TRACE(25, ("__kmp_fast_free: T#%d Calling __kmp_thread_free for size %d\n",
+                __kmp_gtid_from_thread(this_thr), (int)size));
+  __kmp_bget_dequeue(this_thr); /* Release any queued buffers */
+  brel(this_thr, descr->ptr_allocated);
+
+end:
+  KE_TRACE(25, ("<- __kmp_fast_free() returns\n"));
+
+} // func __kmp_fast_free
+
+// Initialize the thread free lists related to fast memory by zeroing all
+// NUM_LISTS entries of this_thr's th_free_lists.
+// Only do this when a thread is initially created.
+void __kmp_initialize_fast_memory(kmp_info_t *this_thr) {
+  const size_t lists_bytes = NUM_LISTS * sizeof(kmp_free_list_t);
+
+  KE_TRACE(10, ("__kmp_initialize_fast_memory: Called from th %p\n", this_thr));
+
+  memset(this_thr->th.th_free_lists, 0, lists_bytes);
+}
+
+// Free the memory in the thread free lists related to fast memory
+// Only do this when a thread is being reaped (destroyed).
+//
+// Works in two passes over the thread's bget data: first every free-list
+// buffer whose bthr field has its low bit set is chained onto a temporary
+// list, then each chained buffer is passed to the thread's release function
+// (thr->relfcn).
+void __kmp_free_fast_memory(kmp_info_t *th) {
+  // Suppose we use BGET underlying allocator, walk through its structures...
+  int bin;
+  thr_data_t *thr = get_thr_data(th);
+  void **lst = NULL; // head of the temporary list of buffers to release
+
+  KE_TRACE(
+      5, ("__kmp_free_fast_memory: Called T#%d\n", __kmp_gtid_from_thread(th)));
+
+  __kmp_bget_dequeue(th); // Release any queued buffers
+
+  // Dig through free lists and extract all allocated blocks
+  for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
+    // each bin is a circular list; the walk ends when it is back at the head
+    bfhead_t *b = thr->freelist[bin].ql.flink;
+    while (b != &thr->freelist[bin]) {
+      // NOTE(review): the low bit of bthr appears to tag buffers that were
+      // obtained as whole blocks from the underlying allocator - confirm
+      // against the bget code.
+      if ((kmp_uintptr_t)b->bh.bb.bthr & 1) { // the buffer is allocated address
+        *((void **)b) =
+            lst; // link the list (override bthr, but keep flink yet)
+        lst = (void **)b; // push b into lst
+      }
+      b = b->ql.flink; // get next buffer
+    }
+  }
+  // Release everything collected above; the link is read before the buffer is
+  // handed to the release function.
+  while (lst != NULL) {
+    void *next = *lst;
+    KE_TRACE(10, ("__kmp_free_fast_memory: freeing %p, next=%p th %p (%d)\n",
+                  lst, next, th, __kmp_gtid_from_thread(th)));
+    (*thr->relfcn)(lst);
+#if BufStats
+    // count blocks to prevent problems in __kmp_finalize_bget()
+    thr->numprel++; /* Nr of expansion block releases */
+    thr->numpblk--; /* Total number of blocks */
+#endif
+    lst = (void **)next;
+  }
+
+  KE_TRACE(
+      5, ("__kmp_free_fast_memory: Freed T#%d\n", __kmp_gtid_from_thread(th)));
+}
+
+#endif // USE_FAST_MEMORY
diff --git a/final/runtime/src/kmp_atomic.cpp b/final/runtime/src/kmp_atomic.cpp
new file mode 100644
index 0000000..f1ee3d2
--- /dev/null
+++ b/final/runtime/src/kmp_atomic.cpp
@@ -0,0 +1,3623 @@
+/*
+ * kmp_atomic.cpp -- ATOMIC implementation routines
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp_atomic.h"
+#include "kmp.h" // TRUE, asm routines prototypes
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+
+/*!
+@defgroup ATOMIC_OPS Atomic Operations
+These functions are used for implementing the many different varieties of atomic
+operations.
+
+The compiler is at liberty to inline atomic operations that are naturally
+supported by the target architecture. For instance on IA-32 architecture an
+atomic like this can be inlined
+@code
+static int s = 0;
+#pragma omp atomic
+    s++;
+@endcode
+using the single instruction: `lock; incl s`
+
+However the runtime does provide entrypoints for these operations to support
+compilers that choose not to inline them. (For instance,
+`__kmpc_atomic_fixed4_add` could be used to perform the increment above.)
+
+The names of the functions are encoded by using the data type name and the
+operation name, as in these tables.
+
+Data Type  | Data type encoding
+-----------|---------------
+int8_t     | `fixed1`
+uint8_t    | `fixed1u`
+int16_t    | `fixed2`
+uint16_t   | `fixed2u`
+int32_t    | `fixed4`
+uint32_t   | `fixed4u`
+int64_t    | `fixed8`
+uint64_t   | `fixed8u`
+float      | `float4`
+double     | `float8`
+float 10 (8087 eighty bit float)  | `float10`
+complex<float>   |  `cmplx4`
+complex<double>  | `cmplx8`
+complex<float10> | `cmplx10`
+<br>
+
+Operation | Operation encoding
+----------|-------------------
++ | add
+- | sub
+\* | mul
+/ | div
+& | andb
+<< | shl
+\>\> | shr
+\| | orb
+^  | xor
+&& | andl
+\|\| | orl
+maximum | max
+minimum | min
+.eqv.   | eqv
+.neqv.  | neqv
+
+<br>
+For non-commutative operations, `_rev` can also be added for the reversed
+operation. For the functions that capture the result, the suffix `_cpt` is
+added.
+
+Update Functions
+================
+The general form of an atomic function that just performs an update (without a
+`capture`)
+@code
+void __kmpc_atomic_<datatype>_<operation>( ident_t *id_ref, int gtid, TYPE *
+lhs, TYPE rhs );
+@endcode
+@param id_ref  a pointer to source location
+@param gtid  the global thread id
+@param lhs   a pointer to the left operand
+@param rhs   the right operand
+
+`capture` functions
+===================
+The capture functions perform an atomic update and return a result, which is
+either the value before the update, or that after. They take an additional
+argument to determine which result is returned.
+Their general form is therefore
+@code
+TYPE __kmpc_atomic_<datatype>_<operation>_cpt( ident_t *id_ref, int gtid, TYPE *
+lhs, TYPE rhs, int flag );
+@endcode
+@param id_ref  a pointer to source location
+@param gtid  the global thread id
+@param lhs   a pointer to the left operand
+@param rhs   the right operand
+@param flag  one if the result is to be captured *after* the operation, zero if
+captured *before*.
+
+The one set of exceptions to this is the `complex<float>` type where the value
+is not returned, rather an extra argument pointer is passed.
+
+They look like
+@code
+void __kmpc_atomic_cmplx4_<op>_cpt(  ident_t *id_ref, int gtid, kmp_cmplx32 *
+lhs, kmp_cmplx32 rhs, kmp_cmplx32 * out, int flag );
+@endcode
+
+Read and Write Operations
+=========================
+The OpenMP<sup>*</sup> standard now supports atomic operations that simply
+ensure that the value is read or written atomically, with no modification
+performed. In many cases on IA-32 architecture these operations can be inlined
+since the architecture guarantees that no tearing occurs on aligned objects
+accessed with a single memory operation of up to 64 bits in size.
+
+The general form of the read operations is
+@code
+TYPE __kmpc_atomic_<type>_rd ( ident_t *id_ref, int gtid, TYPE * loc );
+@endcode
+
+For the write operations the form is
+@code
+void __kmpc_atomic_<type>_wr ( ident_t *id_ref, int gtid, TYPE * lhs, TYPE rhs
+);
+@endcode
+
+Full list of functions
+======================
+This leads to the generation of 376 atomic functions, as follows.
+
+Functions for integers
+----------------------
+There are versions here for integers of size 1,2,4 and 8 bytes both signed and
+unsigned (where that matters).
+@code
+    __kmpc_atomic_fixed1_add
+    __kmpc_atomic_fixed1_add_cpt
+    __kmpc_atomic_fixed1_add_fp
+    __kmpc_atomic_fixed1_andb
+    __kmpc_atomic_fixed1_andb_cpt
+    __kmpc_atomic_fixed1_andl
+    __kmpc_atomic_fixed1_andl_cpt
+    __kmpc_atomic_fixed1_div
+    __kmpc_atomic_fixed1_div_cpt
+    __kmpc_atomic_fixed1_div_cpt_rev
+    __kmpc_atomic_fixed1_div_float8
+    __kmpc_atomic_fixed1_div_fp
+    __kmpc_atomic_fixed1_div_rev
+    __kmpc_atomic_fixed1_eqv
+    __kmpc_atomic_fixed1_eqv_cpt
+    __kmpc_atomic_fixed1_max
+    __kmpc_atomic_fixed1_max_cpt
+    __kmpc_atomic_fixed1_min
+    __kmpc_atomic_fixed1_min_cpt
+    __kmpc_atomic_fixed1_mul
+    __kmpc_atomic_fixed1_mul_cpt
+    __kmpc_atomic_fixed1_mul_float8
+    __kmpc_atomic_fixed1_mul_fp
+    __kmpc_atomic_fixed1_neqv
+    __kmpc_atomic_fixed1_neqv_cpt
+    __kmpc_atomic_fixed1_orb
+    __kmpc_atomic_fixed1_orb_cpt
+    __kmpc_atomic_fixed1_orl
+    __kmpc_atomic_fixed1_orl_cpt
+    __kmpc_atomic_fixed1_rd
+    __kmpc_atomic_fixed1_shl
+    __kmpc_atomic_fixed1_shl_cpt
+    __kmpc_atomic_fixed1_shl_cpt_rev
+    __kmpc_atomic_fixed1_shl_rev
+    __kmpc_atomic_fixed1_shr
+    __kmpc_atomic_fixed1_shr_cpt
+    __kmpc_atomic_fixed1_shr_cpt_rev
+    __kmpc_atomic_fixed1_shr_rev
+    __kmpc_atomic_fixed1_sub
+    __kmpc_atomic_fixed1_sub_cpt
+    __kmpc_atomic_fixed1_sub_cpt_rev
+    __kmpc_atomic_fixed1_sub_fp
+    __kmpc_atomic_fixed1_sub_rev
+    __kmpc_atomic_fixed1_swp
+    __kmpc_atomic_fixed1_wr
+    __kmpc_atomic_fixed1_xor
+    __kmpc_atomic_fixed1_xor_cpt
+    __kmpc_atomic_fixed1u_add_fp
+    __kmpc_atomic_fixed1u_sub_fp
+    __kmpc_atomic_fixed1u_mul_fp
+    __kmpc_atomic_fixed1u_div
+    __kmpc_atomic_fixed1u_div_cpt
+    __kmpc_atomic_fixed1u_div_cpt_rev
+    __kmpc_atomic_fixed1u_div_fp
+    __kmpc_atomic_fixed1u_div_rev
+    __kmpc_atomic_fixed1u_shr
+    __kmpc_atomic_fixed1u_shr_cpt
+    __kmpc_atomic_fixed1u_shr_cpt_rev
+    __kmpc_atomic_fixed1u_shr_rev
+    __kmpc_atomic_fixed2_add
+    __kmpc_atomic_fixed2_add_cpt
+    __kmpc_atomic_fixed2_add_fp
+    __kmpc_atomic_fixed2_andb
+    __kmpc_atomic_fixed2_andb_cpt
+    __kmpc_atomic_fixed2_andl
+    __kmpc_atomic_fixed2_andl_cpt
+    __kmpc_atomic_fixed2_div
+    __kmpc_atomic_fixed2_div_cpt
+    __kmpc_atomic_fixed2_div_cpt_rev
+    __kmpc_atomic_fixed2_div_float8
+    __kmpc_atomic_fixed2_div_fp
+    __kmpc_atomic_fixed2_div_rev
+    __kmpc_atomic_fixed2_eqv
+    __kmpc_atomic_fixed2_eqv_cpt
+    __kmpc_atomic_fixed2_max
+    __kmpc_atomic_fixed2_max_cpt
+    __kmpc_atomic_fixed2_min
+    __kmpc_atomic_fixed2_min_cpt
+    __kmpc_atomic_fixed2_mul
+    __kmpc_atomic_fixed2_mul_cpt
+    __kmpc_atomic_fixed2_mul_float8
+    __kmpc_atomic_fixed2_mul_fp
+    __kmpc_atomic_fixed2_neqv
+    __kmpc_atomic_fixed2_neqv_cpt
+    __kmpc_atomic_fixed2_orb
+    __kmpc_atomic_fixed2_orb_cpt
+    __kmpc_atomic_fixed2_orl
+    __kmpc_atomic_fixed2_orl_cpt
+    __kmpc_atomic_fixed2_rd
+    __kmpc_atomic_fixed2_shl
+    __kmpc_atomic_fixed2_shl_cpt
+    __kmpc_atomic_fixed2_shl_cpt_rev
+    __kmpc_atomic_fixed2_shl_rev
+    __kmpc_atomic_fixed2_shr
+    __kmpc_atomic_fixed2_shr_cpt
+    __kmpc_atomic_fixed2_shr_cpt_rev
+    __kmpc_atomic_fixed2_shr_rev
+    __kmpc_atomic_fixed2_sub
+    __kmpc_atomic_fixed2_sub_cpt
+    __kmpc_atomic_fixed2_sub_cpt_rev
+    __kmpc_atomic_fixed2_sub_fp
+    __kmpc_atomic_fixed2_sub_rev
+    __kmpc_atomic_fixed2_swp
+    __kmpc_atomic_fixed2_wr
+    __kmpc_atomic_fixed2_xor
+    __kmpc_atomic_fixed2_xor_cpt
+    __kmpc_atomic_fixed2u_add_fp
+    __kmpc_atomic_fixed2u_sub_fp
+    __kmpc_atomic_fixed2u_mul_fp
+    __kmpc_atomic_fixed2u_div
+    __kmpc_atomic_fixed2u_div_cpt
+    __kmpc_atomic_fixed2u_div_cpt_rev
+    __kmpc_atomic_fixed2u_div_fp
+    __kmpc_atomic_fixed2u_div_rev
+    __kmpc_atomic_fixed2u_shr
+    __kmpc_atomic_fixed2u_shr_cpt
+    __kmpc_atomic_fixed2u_shr_cpt_rev
+    __kmpc_atomic_fixed2u_shr_rev
+    __kmpc_atomic_fixed4_add
+    __kmpc_atomic_fixed4_add_cpt
+    __kmpc_atomic_fixed4_add_fp
+    __kmpc_atomic_fixed4_andb
+    __kmpc_atomic_fixed4_andb_cpt
+    __kmpc_atomic_fixed4_andl
+    __kmpc_atomic_fixed4_andl_cpt
+    __kmpc_atomic_fixed4_div
+    __kmpc_atomic_fixed4_div_cpt
+    __kmpc_atomic_fixed4_div_cpt_rev
+    __kmpc_atomic_fixed4_div_float8
+    __kmpc_atomic_fixed4_div_fp
+    __kmpc_atomic_fixed4_div_rev
+    __kmpc_atomic_fixed4_eqv
+    __kmpc_atomic_fixed4_eqv_cpt
+    __kmpc_atomic_fixed4_max
+    __kmpc_atomic_fixed4_max_cpt
+    __kmpc_atomic_fixed4_min
+    __kmpc_atomic_fixed4_min_cpt
+    __kmpc_atomic_fixed4_mul
+    __kmpc_atomic_fixed4_mul_cpt
+    __kmpc_atomic_fixed4_mul_float8
+    __kmpc_atomic_fixed4_mul_fp
+    __kmpc_atomic_fixed4_neqv
+    __kmpc_atomic_fixed4_neqv_cpt
+    __kmpc_atomic_fixed4_orb
+    __kmpc_atomic_fixed4_orb_cpt
+    __kmpc_atomic_fixed4_orl
+    __kmpc_atomic_fixed4_orl_cpt
+    __kmpc_atomic_fixed4_rd
+    __kmpc_atomic_fixed4_shl
+    __kmpc_atomic_fixed4_shl_cpt
+    __kmpc_atomic_fixed4_shl_cpt_rev
+    __kmpc_atomic_fixed4_shl_rev
+    __kmpc_atomic_fixed4_shr
+    __kmpc_atomic_fixed4_shr_cpt
+    __kmpc_atomic_fixed4_shr_cpt_rev
+    __kmpc_atomic_fixed4_shr_rev
+    __kmpc_atomic_fixed4_sub
+    __kmpc_atomic_fixed4_sub_cpt
+    __kmpc_atomic_fixed4_sub_cpt_rev
+    __kmpc_atomic_fixed4_sub_fp
+    __kmpc_atomic_fixed4_sub_rev
+    __kmpc_atomic_fixed4_swp
+    __kmpc_atomic_fixed4_wr
+    __kmpc_atomic_fixed4_xor
+    __kmpc_atomic_fixed4_xor_cpt
+    __kmpc_atomic_fixed4u_add_fp
+    __kmpc_atomic_fixed4u_sub_fp
+    __kmpc_atomic_fixed4u_mul_fp
+    __kmpc_atomic_fixed4u_div
+    __kmpc_atomic_fixed4u_div_cpt
+    __kmpc_atomic_fixed4u_div_cpt_rev
+    __kmpc_atomic_fixed4u_div_fp
+    __kmpc_atomic_fixed4u_div_rev
+    __kmpc_atomic_fixed4u_shr
+    __kmpc_atomic_fixed4u_shr_cpt
+    __kmpc_atomic_fixed4u_shr_cpt_rev
+    __kmpc_atomic_fixed4u_shr_rev
+    __kmpc_atomic_fixed8_add
+    __kmpc_atomic_fixed8_add_cpt
+    __kmpc_atomic_fixed8_add_fp
+    __kmpc_atomic_fixed8_andb
+    __kmpc_atomic_fixed8_andb_cpt
+    __kmpc_atomic_fixed8_andl
+    __kmpc_atomic_fixed8_andl_cpt
+    __kmpc_atomic_fixed8_div
+    __kmpc_atomic_fixed8_div_cpt
+    __kmpc_atomic_fixed8_div_cpt_rev
+    __kmpc_atomic_fixed8_div_float8
+    __kmpc_atomic_fixed8_div_fp
+    __kmpc_atomic_fixed8_div_rev
+    __kmpc_atomic_fixed8_eqv
+    __kmpc_atomic_fixed8_eqv_cpt
+    __kmpc_atomic_fixed8_max
+    __kmpc_atomic_fixed8_max_cpt
+    __kmpc_atomic_fixed8_min
+    __kmpc_atomic_fixed8_min_cpt
+    __kmpc_atomic_fixed8_mul
+    __kmpc_atomic_fixed8_mul_cpt
+    __kmpc_atomic_fixed8_mul_float8
+    __kmpc_atomic_fixed8_mul_fp
+    __kmpc_atomic_fixed8_neqv
+    __kmpc_atomic_fixed8_neqv_cpt
+    __kmpc_atomic_fixed8_orb
+    __kmpc_atomic_fixed8_orb_cpt
+    __kmpc_atomic_fixed8_orl
+    __kmpc_atomic_fixed8_orl_cpt
+    __kmpc_atomic_fixed8_rd
+    __kmpc_atomic_fixed8_shl
+    __kmpc_atomic_fixed8_shl_cpt
+    __kmpc_atomic_fixed8_shl_cpt_rev
+    __kmpc_atomic_fixed8_shl_rev
+    __kmpc_atomic_fixed8_shr
+    __kmpc_atomic_fixed8_shr_cpt
+    __kmpc_atomic_fixed8_shr_cpt_rev
+    __kmpc_atomic_fixed8_shr_rev
+    __kmpc_atomic_fixed8_sub
+    __kmpc_atomic_fixed8_sub_cpt
+    __kmpc_atomic_fixed8_sub_cpt_rev
+    __kmpc_atomic_fixed8_sub_fp
+    __kmpc_atomic_fixed8_sub_rev
+    __kmpc_atomic_fixed8_swp
+    __kmpc_atomic_fixed8_wr
+    __kmpc_atomic_fixed8_xor
+    __kmpc_atomic_fixed8_xor_cpt
+    __kmpc_atomic_fixed8u_add_fp
+    __kmpc_atomic_fixed8u_sub_fp
+    __kmpc_atomic_fixed8u_mul_fp
+    __kmpc_atomic_fixed8u_div
+    __kmpc_atomic_fixed8u_div_cpt
+    __kmpc_atomic_fixed8u_div_cpt_rev
+    __kmpc_atomic_fixed8u_div_fp
+    __kmpc_atomic_fixed8u_div_rev
+    __kmpc_atomic_fixed8u_shr
+    __kmpc_atomic_fixed8u_shr_cpt
+    __kmpc_atomic_fixed8u_shr_cpt_rev
+    __kmpc_atomic_fixed8u_shr_rev
+@endcode
+
+Functions for floating point
+----------------------------
+There are versions here for floating point numbers of size 4, 8, 10 and 16
+bytes. (Ten byte floats are used by X87, but are now rare).
+@code
+    __kmpc_atomic_float4_add
+    __kmpc_atomic_float4_add_cpt
+    __kmpc_atomic_float4_add_float8
+    __kmpc_atomic_float4_add_fp
+    __kmpc_atomic_float4_div
+    __kmpc_atomic_float4_div_cpt
+    __kmpc_atomic_float4_div_cpt_rev
+    __kmpc_atomic_float4_div_float8
+    __kmpc_atomic_float4_div_fp
+    __kmpc_atomic_float4_div_rev
+    __kmpc_atomic_float4_max
+    __kmpc_atomic_float4_max_cpt
+    __kmpc_atomic_float4_min
+    __kmpc_atomic_float4_min_cpt
+    __kmpc_atomic_float4_mul
+    __kmpc_atomic_float4_mul_cpt
+    __kmpc_atomic_float4_mul_float8
+    __kmpc_atomic_float4_mul_fp
+    __kmpc_atomic_float4_rd
+    __kmpc_atomic_float4_sub
+    __kmpc_atomic_float4_sub_cpt
+    __kmpc_atomic_float4_sub_cpt_rev
+    __kmpc_atomic_float4_sub_float8
+    __kmpc_atomic_float4_sub_fp
+    __kmpc_atomic_float4_sub_rev
+    __kmpc_atomic_float4_swp
+    __kmpc_atomic_float4_wr
+    __kmpc_atomic_float8_add
+    __kmpc_atomic_float8_add_cpt
+    __kmpc_atomic_float8_add_fp
+    __kmpc_atomic_float8_div
+    __kmpc_atomic_float8_div_cpt
+    __kmpc_atomic_float8_div_cpt_rev
+    __kmpc_atomic_float8_div_fp
+    __kmpc_atomic_float8_div_rev
+    __kmpc_atomic_float8_max
+    __kmpc_atomic_float8_max_cpt
+    __kmpc_atomic_float8_min
+    __kmpc_atomic_float8_min_cpt
+    __kmpc_atomic_float8_mul
+    __kmpc_atomic_float8_mul_cpt
+    __kmpc_atomic_float8_mul_fp
+    __kmpc_atomic_float8_rd
+    __kmpc_atomic_float8_sub
+    __kmpc_atomic_float8_sub_cpt
+    __kmpc_atomic_float8_sub_cpt_rev
+    __kmpc_atomic_float8_sub_fp
+    __kmpc_atomic_float8_sub_rev
+    __kmpc_atomic_float8_swp
+    __kmpc_atomic_float8_wr
+    __kmpc_atomic_float10_add
+    __kmpc_atomic_float10_add_cpt
+    __kmpc_atomic_float10_add_fp
+    __kmpc_atomic_float10_div
+    __kmpc_atomic_float10_div_cpt
+    __kmpc_atomic_float10_div_cpt_rev
+    __kmpc_atomic_float10_div_fp
+    __kmpc_atomic_float10_div_rev
+    __kmpc_atomic_float10_mul
+    __kmpc_atomic_float10_mul_cpt
+    __kmpc_atomic_float10_mul_fp
+    __kmpc_atomic_float10_rd
+    __kmpc_atomic_float10_sub
+    __kmpc_atomic_float10_sub_cpt
+    __kmpc_atomic_float10_sub_cpt_rev
+    __kmpc_atomic_float10_sub_fp
+    __kmpc_atomic_float10_sub_rev
+    __kmpc_atomic_float10_swp
+    __kmpc_atomic_float10_wr
+    __kmpc_atomic_float16_add
+    __kmpc_atomic_float16_add_cpt
+    __kmpc_atomic_float16_div
+    __kmpc_atomic_float16_div_cpt
+    __kmpc_atomic_float16_div_cpt_rev
+    __kmpc_atomic_float16_div_rev
+    __kmpc_atomic_float16_max
+    __kmpc_atomic_float16_max_cpt
+    __kmpc_atomic_float16_min
+    __kmpc_atomic_float16_min_cpt
+    __kmpc_atomic_float16_mul
+    __kmpc_atomic_float16_mul_cpt
+    __kmpc_atomic_float16_rd
+    __kmpc_atomic_float16_sub
+    __kmpc_atomic_float16_sub_cpt
+    __kmpc_atomic_float16_sub_cpt_rev
+    __kmpc_atomic_float16_sub_rev
+    __kmpc_atomic_float16_swp
+    __kmpc_atomic_float16_wr
+@endcode
+
+Functions for Complex types
+---------------------------
+Functions for complex types whose component floating point variables are of size
+4,8,10 or 16 bytes. The names here are based on the size of the component float,
+*not* the size of the complex type. So `__kmpc_atomic_cmplx8_add` is an operation
+on a `complex<double>` or `complex(kind=8)`, *not* `complex<float>`.
+
+@code
+    __kmpc_atomic_cmplx4_add
+    __kmpc_atomic_cmplx4_add_cmplx8
+    __kmpc_atomic_cmplx4_add_cpt
+    __kmpc_atomic_cmplx4_div
+    __kmpc_atomic_cmplx4_div_cmplx8
+    __kmpc_atomic_cmplx4_div_cpt
+    __kmpc_atomic_cmplx4_div_cpt_rev
+    __kmpc_atomic_cmplx4_div_rev
+    __kmpc_atomic_cmplx4_mul
+    __kmpc_atomic_cmplx4_mul_cmplx8
+    __kmpc_atomic_cmplx4_mul_cpt
+    __kmpc_atomic_cmplx4_rd
+    __kmpc_atomic_cmplx4_sub
+    __kmpc_atomic_cmplx4_sub_cmplx8
+    __kmpc_atomic_cmplx4_sub_cpt
+    __kmpc_atomic_cmplx4_sub_cpt_rev
+    __kmpc_atomic_cmplx4_sub_rev
+    __kmpc_atomic_cmplx4_swp
+    __kmpc_atomic_cmplx4_wr
+    __kmpc_atomic_cmplx8_add
+    __kmpc_atomic_cmplx8_add_cpt
+    __kmpc_atomic_cmplx8_div
+    __kmpc_atomic_cmplx8_div_cpt
+    __kmpc_atomic_cmplx8_div_cpt_rev
+    __kmpc_atomic_cmplx8_div_rev
+    __kmpc_atomic_cmplx8_mul
+    __kmpc_atomic_cmplx8_mul_cpt
+    __kmpc_atomic_cmplx8_rd
+    __kmpc_atomic_cmplx8_sub
+    __kmpc_atomic_cmplx8_sub_cpt
+    __kmpc_atomic_cmplx8_sub_cpt_rev
+    __kmpc_atomic_cmplx8_sub_rev
+    __kmpc_atomic_cmplx8_swp
+    __kmpc_atomic_cmplx8_wr
+    __kmpc_atomic_cmplx10_add
+    __kmpc_atomic_cmplx10_add_cpt
+    __kmpc_atomic_cmplx10_div
+    __kmpc_atomic_cmplx10_div_cpt
+    __kmpc_atomic_cmplx10_div_cpt_rev
+    __kmpc_atomic_cmplx10_div_rev
+    __kmpc_atomic_cmplx10_mul
+    __kmpc_atomic_cmplx10_mul_cpt
+    __kmpc_atomic_cmplx10_rd
+    __kmpc_atomic_cmplx10_sub
+    __kmpc_atomic_cmplx10_sub_cpt
+    __kmpc_atomic_cmplx10_sub_cpt_rev
+    __kmpc_atomic_cmplx10_sub_rev
+    __kmpc_atomic_cmplx10_swp
+    __kmpc_atomic_cmplx10_wr
+    __kmpc_atomic_cmplx16_add
+    __kmpc_atomic_cmplx16_add_cpt
+    __kmpc_atomic_cmplx16_div
+    __kmpc_atomic_cmplx16_div_cpt
+    __kmpc_atomic_cmplx16_div_cpt_rev
+    __kmpc_atomic_cmplx16_div_rev
+    __kmpc_atomic_cmplx16_mul
+    __kmpc_atomic_cmplx16_mul_cpt
+    __kmpc_atomic_cmplx16_rd
+    __kmpc_atomic_cmplx16_sub
+    __kmpc_atomic_cmplx16_sub_cpt
+    __kmpc_atomic_cmplx16_sub_cpt_rev
+    __kmpc_atomic_cmplx16_swp
+    __kmpc_atomic_cmplx16_wr
+@endcode
+*/
+
+/*!
+@ingroup ATOMIC_OPS
+@{
+*/
+
+/*
+ * Global vars
+ */
+
+// Atomic implementation mode. Initialized to 1, or to 2 when the runtime is
+// built with KMP_GOMP_COMPAT defined. OP_GOMP_CRITICAL tests for the value 2
+// before routing an update through the generic __kmp_atomic_lock.
+#ifndef KMP_GOMP_COMPAT
+int __kmp_atomic_mode = 1; // Intel perf
+#else
+int __kmp_atomic_mode = 2; // GOMP compatibility
+#endif /* KMP_GOMP_COMPAT */
+
+// Locks guarding atomic updates that go through a critical section. There is
+// one generic lock plus one per operand category; the ATOMIC_LOCK<id> macros
+// give each of them a short name.
+// NOTE(review): KMP_ALIGN(128) is followed only by comments before the first
+// declaration, so it presumably applies to __kmp_atomic_lock alone -- confirm.
+KMP_ALIGN(128)
+
+// Control access to all user coded atomics in Gnu compat mode
+kmp_atomic_lock_t __kmp_atomic_lock;
+// Control access to all user coded atomics for 1-byte fixed data types
+kmp_atomic_lock_t __kmp_atomic_lock_1i;
+// Control access to all user coded atomics for 2-byte fixed data types
+kmp_atomic_lock_t __kmp_atomic_lock_2i;
+// Control access to all user coded atomics for 4-byte fixed data types
+kmp_atomic_lock_t __kmp_atomic_lock_4i;
+// Control access to all user coded atomics for kmp_real32 data type
+kmp_atomic_lock_t __kmp_atomic_lock_4r;
+// Control access to all user coded atomics for 8-byte fixed data types
+kmp_atomic_lock_t __kmp_atomic_lock_8i;
+// Control access to all user coded atomics for kmp_real64 data type
+kmp_atomic_lock_t __kmp_atomic_lock_8r;
+// Control access to all user coded atomics for float complex data type
+kmp_atomic_lock_t __kmp_atomic_lock_8c;
+// Control access to all user coded atomics for long double data type
+kmp_atomic_lock_t __kmp_atomic_lock_10r;
+// Control access to all user coded atomics for _Quad data type
+kmp_atomic_lock_t __kmp_atomic_lock_16r;
+// Control access to all user coded atomics for double complex data type
+kmp_atomic_lock_t __kmp_atomic_lock_16c;
+// Control access to all user coded atomics for long double complex type
+kmp_atomic_lock_t __kmp_atomic_lock_20c;
+// Control access to all user coded atomics for _Quad complex data type
+kmp_atomic_lock_t __kmp_atomic_lock_32c;
+
+/* 2007-03-02:
+   Without "volatile" specifier in OP_CMPXCHG and MIN_MAX_CMPXCHG we have a bug
+   on *_32 and *_32e. This is just a temporary workaround for the problem. It
+   seems the right solution is writing OP_CMPXCHG and MIN_MAX_CMPXCHG routines
+   in assembler language. */
+// Expands to the `volatile` qualifier called for by the workaround above.
+#define KMP_ATOMIC_VOLATILE volatile
+
+// Operator overloads for the Quad_a4_t / Quad_a16_t and kmp_cmplx128_a4_t /
+// kmp_cmplx128_a16_t wrapper types. Every overload forwards to the same
+// operator on the wrapped `q` member, so code written with `+=`, `-=`, `*=`,
+// `/=`, `<` and `>` can be applied to the wrappers directly. The two complex
+// wrappers define the compound assignments only (no ordering operators).
+// Compiled only when KMP_ARCH_X86 and KMP_HAVE_QUAD are both set.
+#if (KMP_ARCH_X86) && KMP_HAVE_QUAD
+
+static inline void operator+=(Quad_a4_t &lhs, Quad_a4_t &rhs) {
+  lhs.q += rhs.q;
+}
+static inline void operator-=(Quad_a4_t &lhs, Quad_a4_t &rhs) {
+  lhs.q -= rhs.q;
+}
+static inline void operator*=(Quad_a4_t &lhs, Quad_a4_t &rhs) {
+  lhs.q *= rhs.q;
+}
+static inline void operator/=(Quad_a4_t &lhs, Quad_a4_t &rhs) {
+  lhs.q /= rhs.q;
+}
+static inline bool operator<(Quad_a4_t &lhs, Quad_a4_t &rhs) {
+  return lhs.q < rhs.q;
+}
+static inline bool operator>(Quad_a4_t &lhs, Quad_a4_t &rhs) {
+  return lhs.q > rhs.q;
+}
+
+static inline void operator+=(Quad_a16_t &lhs, Quad_a16_t &rhs) {
+  lhs.q += rhs.q;
+}
+static inline void operator-=(Quad_a16_t &lhs, Quad_a16_t &rhs) {
+  lhs.q -= rhs.q;
+}
+static inline void operator*=(Quad_a16_t &lhs, Quad_a16_t &rhs) {
+  lhs.q *= rhs.q;
+}
+static inline void operator/=(Quad_a16_t &lhs, Quad_a16_t &rhs) {
+  lhs.q /= rhs.q;
+}
+static inline bool operator<(Quad_a16_t &lhs, Quad_a16_t &rhs) {
+  return lhs.q < rhs.q;
+}
+static inline bool operator>(Quad_a16_t &lhs, Quad_a16_t &rhs) {
+  return lhs.q > rhs.q;
+}
+
+static inline void operator+=(kmp_cmplx128_a4_t &lhs, kmp_cmplx128_a4_t &rhs) {
+  lhs.q += rhs.q;
+}
+static inline void operator-=(kmp_cmplx128_a4_t &lhs, kmp_cmplx128_a4_t &rhs) {
+  lhs.q -= rhs.q;
+}
+static inline void operator*=(kmp_cmplx128_a4_t &lhs, kmp_cmplx128_a4_t &rhs) {
+  lhs.q *= rhs.q;
+}
+static inline void operator/=(kmp_cmplx128_a4_t &lhs, kmp_cmplx128_a4_t &rhs) {
+  lhs.q /= rhs.q;
+}
+
+static inline void operator+=(kmp_cmplx128_a16_t &lhs,
+                              kmp_cmplx128_a16_t &rhs) {
+  lhs.q += rhs.q;
+}
+static inline void operator-=(kmp_cmplx128_a16_t &lhs,
+                              kmp_cmplx128_a16_t &rhs) {
+  lhs.q -= rhs.q;
+}
+static inline void operator*=(kmp_cmplx128_a16_t &lhs,
+                              kmp_cmplx128_a16_t &rhs) {
+  lhs.q *= rhs.q;
+}
+static inline void operator/=(kmp_cmplx128_a16_t &lhs,
+                              kmp_cmplx128_a16_t &rhs) {
+  lhs.q /= rhs.q;
+}
+
+#endif // (KMP_ARCH_X86) && KMP_HAVE_QUAD
+
+// ATOMIC implementation routines -----------------------------------------
+// One routine for each operation and operand type.
+// All routine declarations look like
+// void __kmpc_atomic_RTYPE_OP( ident_t*, int, TYPE *lhs, TYPE rhs );
+
+// Resolves an unknown global thread id. Expects a variable named `gtid` in
+// the expanding scope; when it equals KMP_GTID_UNKNOWN it is overwritten with
+// the value returned by __kmp_entry_gtid().
+#define KMP_CHECK_GTID                                                         \
+  if (gtid == KMP_GTID_UNKNOWN) {                                              \
+    gtid = __kmp_entry_gtid();                                                 \
+  } // check and get gtid when needed
+
+// Beginning of a definition (provides name, parameters, debug trace).
+// Expands to the function header, the opening brace, an init assertion and a
+// trace call only; the macro that uses it supplies the rest of the body and
+// the closing brace.
+//     TYPE_ID  - operands type and size (fixed*, fixed*u for signed, unsigned
+//     fixed)
+//     OP_ID    - operation identifier (add, sub, mul, ...)
+//     TYPE     - operands' type
+//     RET_TYPE - return type of the generated function
+#define ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, RET_TYPE)                           \
+  RET_TYPE __kmpc_atomic_##TYPE_ID##_##OP_ID(ident_t *id_ref, int gtid,        \
+                                             TYPE *lhs, TYPE rhs) {            \
+    KMP_DEBUG_ASSERT(__kmp_init_serial);                                       \
+    KA_TRACE(100, ("__kmpc_atomic_" #TYPE_ID "_" #OP_ID ": T#%d\n", gtid));
+
+// ------------------------------------------------------------------------
+// Lock variables used for critical sections for various size operands.
+// OP_CRITICAL token-pastes its LCK_ID argument onto ATOMIC_LOCK, so the text
+// after ATOMIC_LOCK in each name below is a valid LCK_ID value.
+#define ATOMIC_LOCK0 __kmp_atomic_lock // all types, for Gnu compat
+#define ATOMIC_LOCK1i __kmp_atomic_lock_1i // char
+#define ATOMIC_LOCK2i __kmp_atomic_lock_2i // short
+#define ATOMIC_LOCK4i __kmp_atomic_lock_4i // long int
+#define ATOMIC_LOCK4r __kmp_atomic_lock_4r // float
+#define ATOMIC_LOCK8i __kmp_atomic_lock_8i // long long int
+#define ATOMIC_LOCK8r __kmp_atomic_lock_8r // double
+#define ATOMIC_LOCK8c __kmp_atomic_lock_8c // float complex
+#define ATOMIC_LOCK10r __kmp_atomic_lock_10r // long double
+#define ATOMIC_LOCK16r __kmp_atomic_lock_16r // _Quad
+#define ATOMIC_LOCK16c __kmp_atomic_lock_16c // double complex
+#define ATOMIC_LOCK20c __kmp_atomic_lock_20c // long double complex
+#define ATOMIC_LOCK32c __kmp_atomic_lock_32c // _Quad complex
+
+// ------------------------------------------------------------------------
+// Operation on *lhs, rhs bound by critical section
+//     OP     - operator (it's supposed to contain an assignment)
+//     LCK_ID - lock identifier
+// Note: don't check gtid as it should always be valid
+// 1, 2-byte - expect valid parameter, other - check before this macro
+// Expects `lhs`, `rhs` and `gtid` to be in scope where the macro expands. The
+// lock ATOMIC_LOCK<LCK_ID> is acquired, `(*lhs) OP (rhs)` is evaluated, and
+// the same lock is released.
+#define OP_CRITICAL(OP, LCK_ID)                                                \
+  __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);                       \
+                                                                               \
+  (*lhs) OP(rhs);                                                              \
+                                                                               \
+  __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);
+
+// ------------------------------------------------------------------------
+// For GNU compatibility, we may need to use a critical section,
+// even though it is not required by the ISA.
+//
+// On IA-32 architecture, all atomic operations except for fixed 4 byte add,
+// sub, and bitwise logical ops, and 1 & 2 byte logical ops use a common
+// critical section.  On Intel(R) 64, all atomic operations are done with fetch
+// and add or compare and exchange.  Therefore, the FLAG parameter to this
+// macro is either KMP_ARCH_X86 or 0 (or 1, for Intel-specific extension which
+// require a critical section, where we predict that they will be implemented
+// in the Gnu codegen by calling GOMP_atomic_start() / GOMP_atomic_end()).
+//
+// When the OP_GOMP_CRITICAL macro is used in a *CRITICAL* macro construct,
+// the FLAG parameter should always be 1.  If we know that we will be using
+// a critical section, then we want to make certain that we use the generic
+// lock __kmp_atomic_lock to protect the atomic update, and not one of the
+// locks that are specialized based upon the size or type of the data.
+//
+// If FLAG is 0, then we are relying on dead code elimination by the build
+// compiler to get rid of the useless block of code, and save a needless
+// branch at runtime.
+//
+// When the branch is taken the macro executes a bare `return;`, so it can
+// only be expanded inside a function returning void. Without KMP_GOMP_COMPAT
+// the macro expands to nothing.
+
+#ifdef KMP_GOMP_COMPAT
+#define OP_GOMP_CRITICAL(OP, FLAG)                                             \
+  if ((FLAG) && (__kmp_atomic_mode == 2)) {                                    \
+    KMP_CHECK_GTID;                                                            \
+    OP_CRITICAL(OP, 0);                                                        \
+    return;                                                                    \
+  }
+#else
+#define OP_GOMP_CRITICAL(OP, FLAG)
+#endif /* KMP_GOMP_COMPAT */
+
+// Pause issued after a failed compare-and-store attempt in the retry loops of
+// OP_CMPXCHG and OP_CMPXCHG_WORKAROUND: _mm_delay_32(1) when KMP_MIC is set,
+// KMP_CPU_PAUSE() otherwise.
+#if KMP_MIC
+#define KMP_DO_PAUSE _mm_delay_32(1)
+#else
+#define KMP_DO_PAUSE KMP_CPU_PAUSE()
+#endif /* KMP_MIC */
+
+// ------------------------------------------------------------------------
+// Operation on *lhs, rhs using "compare_and_store" routine
+//     TYPE    - operands' type
+//     BITS    - size in bits, used to distinguish low level calls
+//     OP      - operator
+// Expects `lhs` and `rhs` in scope. Reads *lhs through a volatile pointer,
+// computes `old_value OP rhs`, and tries to store it with
+// KMP_COMPARE_AND_STORE_ACQ<BITS>, passing both values reinterpreted as
+// kmp_int<BITS>. While the store fails it pauses, re-reads *lhs and
+// recomputes the new value.
+#define OP_CMPXCHG(TYPE, BITS, OP)                                             \
+  {                                                                            \
+    TYPE old_value, new_value;                                                 \
+    old_value = *(TYPE volatile *)lhs;                                         \
+    new_value = old_value OP rhs;                                              \
+    while (!KMP_COMPARE_AND_STORE_ACQ##BITS(                                   \
+        (kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) & old_value,     \
+        *VOLATILE_CAST(kmp_int##BITS *) & new_value)) {                        \
+      KMP_DO_PAUSE;                                                            \
+                                                                               \
+      old_value = *(TYPE volatile *)lhs;                                       \
+      new_value = old_value OP rhs;                                            \
+    }                                                                          \
+  }
+
+#if USE_CMPXCHG_FIX
+// 2007-06-25:
+// workaround for C78287 (complex(kind=4) data type). lin_32, lin_32e, win_32
+// and win_32e are affected (I verified the asm). Compiler ignores the volatile
+// qualifier of the temp_val in the OP_CMPXCHG macro. This is a problem of the
+// compiler. Related tracker is C76005, targeted to 11.0. I verified the asm of
+// the workaround.
+//
+// Same retry loop as OP_CMPXCHG, except that each of old_value / new_value is
+// a struct holding the TYPE value (`cmp`) together with a kmp_int<BITS>
+// pointer (`vvv`) that points back at `cmp`. The integer view needed by the
+// compare-and-store call is obtained through that stored pointer.
+#define OP_CMPXCHG_WORKAROUND(TYPE, BITS, OP)                                  \
+  {                                                                            \
+    struct _sss {                                                              \
+      TYPE cmp;                                                                \
+      kmp_int##BITS *vvv;                                                      \
+    };                                                                         \
+    struct _sss old_value, new_value;                                          \
+    old_value.vvv = (kmp_int##BITS *)&old_value.cmp;                           \
+    new_value.vvv = (kmp_int##BITS *)&new_value.cmp;                           \
+    *old_value.vvv = *(volatile kmp_int##BITS *)lhs;                           \
+    new_value.cmp = old_value.cmp OP rhs;                                      \
+    while (!KMP_COMPARE_AND_STORE_ACQ##BITS(                                   \
+        (kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) old_value.vvv,   \
+        *VOLATILE_CAST(kmp_int##BITS *) new_value.vvv)) {                      \
+      KMP_DO_PAUSE;                                                            \
+                                                                               \
+      *old_value.vvv = *(volatile kmp_int##BITS *)lhs;                         \
+      new_value.cmp = old_value.cmp OP rhs;                                    \
+    }                                                                          \
+  }
+// end of the first part of the workaround for C78287
+#endif // USE_CMPXCHG_FIX
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+// ------------------------------------------------------------------------
+// X86 or X86_64: no alignment problems ====================================
+// Defines __kmpc_atomic_<TYPE_ID>_<OP_ID> as a void function that performs
+// the update with KMP_TEST_THEN_ADD<BITS>, after the optional GOMP critical
+// path. OP is written in front of rhs, so `-` turns the add into a subtract.
+// LCK_ID and MASK are not used in this variant; the variant for other
+// architectures takes the same parameter list and does use them.
+#define ATOMIC_FIXED_ADD(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK,         \
+                         GOMP_FLAG)                                            \
+  ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void)                                     \
+  OP_GOMP_CRITICAL(OP## =, GOMP_FLAG)                                          \
+  /* OP used as a sign for subtraction: (lhs-rhs) --> (lhs+-rhs) */            \
+  KMP_TEST_THEN_ADD##BITS(lhs, OP rhs);                                        \
+  }
+// -------------------------------------------------------------------------
+// Defines __kmpc_atomic_<TYPE_ID>_<OP_ID> as a void function that performs
+// the update with the OP_CMPXCHG retry loop, after the optional GOMP critical
+// path. LCK_ID and MASK are not used in this variant.
+#define ATOMIC_CMPXCHG(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK,           \
+                       GOMP_FLAG)                                              \
+  ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void)                                     \
+  OP_GOMP_CRITICAL(OP## =, GOMP_FLAG)                                          \
+  OP_CMPXCHG(TYPE, BITS, OP)                                                   \
+  }
+#if USE_CMPXCHG_FIX
+// -------------------------------------------------------------------------
+// workaround for C78287 (complex(kind=4) data type)
+// Same shape as ATOMIC_CMPXCHG, but the generated function body uses
+// OP_CMPXCHG_WORKAROUND instead of OP_CMPXCHG. LCK_ID and MASK are not used.
+#define ATOMIC_CMPXCHG_WORKAROUND(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID,      \
+                                  MASK, GOMP_FLAG)                             \
+  ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void)                                     \
+  OP_GOMP_CRITICAL(OP## =, GOMP_FLAG)                                          \
+  OP_CMPXCHG_WORKAROUND(TYPE, BITS, OP)                                        \
+  }
+// end of the second part of the workaround for C78287
+#endif // USE_CMPXCHG_FIX
+
+#else
+// -------------------------------------------------------------------------
+// Code for other architectures that don't handle unaligned accesses.
+#define ATOMIC_FIXED_ADD(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK,         \
+                         GOMP_FLAG)                                            \
+  ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void)                                     \
+  OP_GOMP_CRITICAL(OP## =, GOMP_FLAG)                                          \
+  if (!((kmp_uintptr_t)lhs & 0x##MASK)) { /* MASK bits clear => aligned */     \
+    /* OP used as a sign for subtraction: (lhs-rhs) --> (lhs+-rhs) */          \
+    KMP_TEST_THEN_ADD##BITS(lhs, OP rhs);                                      \
+  } else {                                                                     \
+    KMP_CHECK_GTID;                                                            \
+    OP_CRITICAL(OP## =, LCK_ID) /* unaligned address - use critical */         \
+  }                                                                            \
+  } /* closes the function body presumably opened by ATOMIC_BEGIN */
+// -------------------------------------------------------------------------
+#define ATOMIC_CMPXCHG(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK,           \
+                       GOMP_FLAG)                                              \
+  ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void)                                     \
+  OP_GOMP_CRITICAL(OP## =, GOMP_FLAG)                                          \
+  if (!((kmp_uintptr_t)lhs & 0x##MASK)) { /* MASK bits clear => aligned */     \
+    OP_CMPXCHG(TYPE, BITS, OP) /* aligned address */                           \
+  } else {                                                                     \
+    KMP_CHECK_GTID;                                                            \
+    OP_CRITICAL(OP## =, LCK_ID) /* unaligned address - use critical */         \
+  }                                                                            \
+  } /* closes the function body presumably opened by ATOMIC_BEGIN */
+#if USE_CMPXCHG_FIX
+// -------------------------------------------------------------------------
+// workaround for C78287 (complex(kind=4) data type)
+#define ATOMIC_CMPXCHG_WORKAROUND(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID,      \
+                                  MASK, GOMP_FLAG)                             \
+  ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void)                                     \
+  OP_GOMP_CRITICAL(OP## =, GOMP_FLAG)                                          \
+  if (!((kmp_uintptr_t)lhs & 0x##MASK)) { /* MASK bits clear => aligned */     \
+    OP_CMPXCHG(TYPE, BITS, OP) /* aligned address */                           \
+  } else {                                                                     \
+    KMP_CHECK_GTID;                                                            \
+    OP_CRITICAL(OP## =, LCK_ID) /* unaligned address - use critical */         \
+  }                                                                            \
+  } /* NOTE(review): aligned path uses OP_CMPXCHG, not the _WORKAROUND form */
+// end of the second part of the workaround for C78287
+#endif // USE_CMPXCHG_FIX
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+// Routines for ATOMIC 4-byte operands addition and subtraction
+// __kmpc_atomic_fixed4_add
+ATOMIC_FIXED_ADD(fixed4, add, kmp_int32, 32, +, 4i, 3, 0)
+// __kmpc_atomic_fixed4_sub
+ATOMIC_FIXED_ADD(fixed4, sub, kmp_int32, 32, -, 4i, 3, 0)
+
+// __kmpc_atomic_float4_add
+ATOMIC_CMPXCHG(float4, add, kmp_real32, 32, +, 4r, 3, KMP_ARCH_X86)
+// __kmpc_atomic_float4_sub
+ATOMIC_CMPXCHG(float4, sub, kmp_real32, 32, -, 4r, 3, KMP_ARCH_X86)
+
+// Routines for ATOMIC 8-byte operands addition and subtraction
+// __kmpc_atomic_fixed8_add
+ATOMIC_FIXED_ADD(fixed8, add, kmp_int64, 64, +, 8i, 7, KMP_ARCH_X86)
+// __kmpc_atomic_fixed8_sub
+ATOMIC_FIXED_ADD(fixed8, sub, kmp_int64, 64, -, 8i, 7, KMP_ARCH_X86)
+
+// __kmpc_atomic_float8_add
+ATOMIC_CMPXCHG(float8, add, kmp_real64, 64, +, 8r, 7, KMP_ARCH_X86)
+// __kmpc_atomic_float8_sub
+ATOMIC_CMPXCHG(float8, sub, kmp_real64, 64, -, 8r, 7, KMP_ARCH_X86)
+
+// ------------------------------------------------------------------------
+// Entries definition for integer operands
+//     TYPE_ID - operands type and size (fixed4, float4)
+//     OP_ID   - operation identifier (add, sub, mul, ...)
+//     TYPE    - operand type
+//     BITS    - size in bits, used to distinguish low level calls
+//     OP      - operator (used in critical section)
+//     LCK_ID  - lock identifier, used to possibly distinguish lock variable
+//     MASK    - used for alignment check
+
+//               TYPE_ID,OP_ID,  TYPE,   BITS,OP,LCK_ID,MASK,GOMP_FLAG
+// ------------------------------------------------------------------------
+// Routines for ATOMIC integer operands, other operators
+// ------------------------------------------------------------------------
+//             TYPE_ID,OP_ID, TYPE, BITS, OP, LCK_ID, MASK, GOMP_FLAG
+// __kmpc_atomic_fixed1_add
+ATOMIC_CMPXCHG(fixed1, add, kmp_int8, 8, +, 1i, 0, KMP_ARCH_X86)
+// __kmpc_atomic_fixed1_andb
+ATOMIC_CMPXCHG(fixed1, andb, kmp_int8, 8, &, 1i, 0, 0)
+// __kmpc_atomic_fixed1_div
+ATOMIC_CMPXCHG(fixed1, div, kmp_int8, 8, /, 1i, 0, KMP_ARCH_X86)
+// __kmpc_atomic_fixed1u_div
+ATOMIC_CMPXCHG(fixed1u, div, kmp_uint8, 8, /, 1i, 0, KMP_ARCH_X86)
+// __kmpc_atomic_fixed1_mul
+ATOMIC_CMPXCHG(fixed1, mul, kmp_int8, 8, *, 1i, 0, KMP_ARCH_X86)
+// __kmpc_atomic_fixed1_orb
+ATOMIC_CMPXCHG(fixed1, orb, kmp_int8, 8, |, 1i, 0, 0)
+// __kmpc_atomic_fixed1_shl
+ATOMIC_CMPXCHG(fixed1, shl, kmp_int8, 8, <<, 1i, 0, KMP_ARCH_X86)
+// __kmpc_atomic_fixed1_shr
+ATOMIC_CMPXCHG(fixed1, shr, kmp_int8, 8, >>, 1i, 0, KMP_ARCH_X86)
+// __kmpc_atomic_fixed1u_shr
+ATOMIC_CMPXCHG(fixed1u, shr, kmp_uint8, 8, >>, 1i, 0, KMP_ARCH_X86)
+// __kmpc_atomic_fixed1_sub
+ATOMIC_CMPXCHG(fixed1, sub, kmp_int8, 8, -, 1i, 0, KMP_ARCH_X86)
+// __kmpc_atomic_fixed1_xor
+ATOMIC_CMPXCHG(fixed1, xor, kmp_int8, 8, ^, 1i, 0, 0)
+// __kmpc_atomic_fixed2_add
+ATOMIC_CMPXCHG(fixed2, add, kmp_int16, 16, +, 2i, 1, KMP_ARCH_X86)
+// __kmpc_atomic_fixed2_andb
+ATOMIC_CMPXCHG(fixed2, andb, kmp_int16, 16, &, 2i, 1, 0)
+// __kmpc_atomic_fixed2_div
+ATOMIC_CMPXCHG(fixed2, div, kmp_int16, 16, /, 2i, 1, KMP_ARCH_X86)
+// __kmpc_atomic_fixed2u_div
+ATOMIC_CMPXCHG(fixed2u, div, kmp_uint16, 16, /, 2i, 1, KMP_ARCH_X86)
+// __kmpc_atomic_fixed2_mul
+ATOMIC_CMPXCHG(fixed2, mul, kmp_int16, 16, *, 2i, 1, KMP_ARCH_X86)
+// __kmpc_atomic_fixed2_orb
+ATOMIC_CMPXCHG(fixed2, orb, kmp_int16, 16, |, 2i, 1, 0)
+// __kmpc_atomic_fixed2_shl
+ATOMIC_CMPXCHG(fixed2, shl, kmp_int16, 16, <<, 2i, 1, KMP_ARCH_X86)
+// __kmpc_atomic_fixed2_shr
+ATOMIC_CMPXCHG(fixed2, shr, kmp_int16, 16, >>, 2i, 1, KMP_ARCH_X86)
+// __kmpc_atomic_fixed2u_shr
+ATOMIC_CMPXCHG(fixed2u, shr, kmp_uint16, 16, >>, 2i, 1, KMP_ARCH_X86)
+// __kmpc_atomic_fixed2_sub
+ATOMIC_CMPXCHG(fixed2, sub, kmp_int16, 16, -, 2i, 1, KMP_ARCH_X86)
+// __kmpc_atomic_fixed2_xor
+ATOMIC_CMPXCHG(fixed2, xor, kmp_int16, 16, ^, 2i, 1, 0)
+// __kmpc_atomic_fixed4_andb
+ATOMIC_CMPXCHG(fixed4, andb, kmp_int32, 32, &, 4i, 3, 0)
+// __kmpc_atomic_fixed4_div
+ATOMIC_CMPXCHG(fixed4, div, kmp_int32, 32, /, 4i, 3, KMP_ARCH_X86)
+// __kmpc_atomic_fixed4u_div
+ATOMIC_CMPXCHG(fixed4u, div, kmp_uint32, 32, /, 4i, 3, KMP_ARCH_X86)
+// __kmpc_atomic_fixed4_mul
+ATOMIC_CMPXCHG(fixed4, mul, kmp_int32, 32, *, 4i, 3, KMP_ARCH_X86)
+// __kmpc_atomic_fixed4_orb
+ATOMIC_CMPXCHG(fixed4, orb, kmp_int32, 32, |, 4i, 3, 0)
+// __kmpc_atomic_fixed4_shl
+ATOMIC_CMPXCHG(fixed4, shl, kmp_int32, 32, <<, 4i, 3, KMP_ARCH_X86)
+// __kmpc_atomic_fixed4_shr
+ATOMIC_CMPXCHG(fixed4, shr, kmp_int32, 32, >>, 4i, 3, KMP_ARCH_X86)
+// __kmpc_atomic_fixed4u_shr
+ATOMIC_CMPXCHG(fixed4u, shr, kmp_uint32, 32, >>, 4i, 3, KMP_ARCH_X86)
+// __kmpc_atomic_fixed4_xor
+ATOMIC_CMPXCHG(fixed4, xor, kmp_int32, 32, ^, 4i, 3, 0)
+// __kmpc_atomic_fixed8_andb
+ATOMIC_CMPXCHG(fixed8, andb, kmp_int64, 64, &, 8i, 7, KMP_ARCH_X86)
+// __kmpc_atomic_fixed8_div
+ATOMIC_CMPXCHG(fixed8, div, kmp_int64, 64, /, 8i, 7, KMP_ARCH_X86)
+// __kmpc_atomic_fixed8u_div
+ATOMIC_CMPXCHG(fixed8u, div, kmp_uint64, 64, /, 8i, 7, KMP_ARCH_X86)
+// __kmpc_atomic_fixed8_mul
+ATOMIC_CMPXCHG(fixed8, mul, kmp_int64, 64, *, 8i, 7, KMP_ARCH_X86)
+// __kmpc_atomic_fixed8_orb
+ATOMIC_CMPXCHG(fixed8, orb, kmp_int64, 64, |, 8i, 7, KMP_ARCH_X86)
+// __kmpc_atomic_fixed8_shl
+ATOMIC_CMPXCHG(fixed8, shl, kmp_int64, 64, <<, 8i, 7, KMP_ARCH_X86)
+// __kmpc_atomic_fixed8_shr
+ATOMIC_CMPXCHG(fixed8, shr, kmp_int64, 64, >>, 8i, 7, KMP_ARCH_X86)
+// __kmpc_atomic_fixed8u_shr
+ATOMIC_CMPXCHG(fixed8u, shr, kmp_uint64, 64, >>, 8i, 7, KMP_ARCH_X86)
+// __kmpc_atomic_fixed8_xor
+ATOMIC_CMPXCHG(fixed8, xor, kmp_int64, 64, ^, 8i, 7, KMP_ARCH_X86)
+// __kmpc_atomic_float4_div
+ATOMIC_CMPXCHG(float4, div, kmp_real32, 32, /, 4r, 3, KMP_ARCH_X86)
+// __kmpc_atomic_float4_mul
+ATOMIC_CMPXCHG(float4, mul, kmp_real32, 32, *, 4r, 3, KMP_ARCH_X86)
+// __kmpc_atomic_float8_div
+ATOMIC_CMPXCHG(float8, div, kmp_real64, 64, /, 8r, 7, KMP_ARCH_X86)
+// __kmpc_atomic_float8_mul
+ATOMIC_CMPXCHG(float8, mul, kmp_real64, 64, *, 8r, 7, KMP_ARCH_X86)
+//             TYPE_ID,OP_ID, TYPE, BITS, OP, LCK_ID, MASK, GOMP_FLAG
+
+/* ------------------------------------------------------------------------ */
+/* Routines for C/C++ Reduction operators && and ||                         */
+
+// ------------------------------------------------------------------------
+// Need separate macros for &&, || because there is no combined assignment
+//   TODO: eliminate ATOMIC_CRIT_{L,EQV} macros as not used
+#define ATOMIC_CRIT_L(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG)             \
+  ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void)                                     \
+  OP_GOMP_CRITICAL(= *lhs OP, GOMP_FLAG)                                       \
+  OP_CRITICAL(= *lhs OP, LCK_ID) /* this form replaces the missing OP= */      \
+  } /* closes the function body presumably opened by ATOMIC_BEGIN */
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+// ------------------------------------------------------------------------
+// X86 or X86_64: no alignment problems ===================================
+#define ATOMIC_CMPX_L(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK, GOMP_FLAG) \
+  ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void)                                     \
+  OP_GOMP_CRITICAL(= *lhs OP, GOMP_FLAG)                                       \
+  OP_CMPXCHG(TYPE, BITS, OP) /* x86: no alignment test; MASK not used */       \
+  } /* closes the function body presumably opened by ATOMIC_BEGIN */
+
+#else
+// ------------------------------------------------------------------------
+// Code for other architectures that don't handle unaligned accesses.
+#define ATOMIC_CMPX_L(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK, GOMP_FLAG) \
+  ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void)                                     \
+  OP_GOMP_CRITICAL(= *lhs OP, GOMP_FLAG)                                       \
+  if (!((kmp_uintptr_t)lhs & 0x##MASK)) { /* MASK bits clear => aligned */     \
+    OP_CMPXCHG(TYPE, BITS, OP) /* aligned address */                           \
+  } else {                                                                     \
+    KMP_CHECK_GTID;                                                            \
+    OP_CRITICAL(= *lhs OP, LCK_ID) /* unaligned - use critical */              \
+  }                                                                            \
+  } /* closes the function body presumably opened by ATOMIC_BEGIN */
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+// __kmpc_atomic_fixed1_andl
+ATOMIC_CMPX_L(fixed1, andl, char, 8, &&, 1i, 0, KMP_ARCH_X86)
+// __kmpc_atomic_fixed1_orl
+ATOMIC_CMPX_L(fixed1, orl, char, 8, ||, 1i, 0, KMP_ARCH_X86)
+// __kmpc_atomic_fixed2_andl
+ATOMIC_CMPX_L(fixed2, andl, short, 16, &&, 2i, 1, KMP_ARCH_X86)
+// __kmpc_atomic_fixed2_orl
+ATOMIC_CMPX_L(fixed2, orl, short, 16, ||, 2i, 1, KMP_ARCH_X86)
+// __kmpc_atomic_fixed4_andl
+ATOMIC_CMPX_L(fixed4, andl, kmp_int32, 32, &&, 4i, 3, 0)
+// __kmpc_atomic_fixed4_orl
+ATOMIC_CMPX_L(fixed4, orl, kmp_int32, 32, ||, 4i, 3, 0)
+// __kmpc_atomic_fixed8_andl
+ATOMIC_CMPX_L(fixed8, andl, kmp_int64, 64, &&, 8i, 7, KMP_ARCH_X86)
+// __kmpc_atomic_fixed8_orl
+ATOMIC_CMPX_L(fixed8, orl, kmp_int64, 64, ||, 8i, 7, KMP_ARCH_X86)
+
+/* ------------------------------------------------------------------------- */
+/* Routines for Fortran operators that have no counterpart in C:             */
+/* MAX, MIN, .EQV., .NEQV.                                                   */
+/* Operators .AND., .OR. are covered by __kmpc_atomic_*_{andl,orl}           */
+/* Intrinsics IAND, IOR, IEOR are covered by __kmpc_atomic_*_{andb,orb,xor}  */
+
+// -------------------------------------------------------------------------
+// MIN and MAX need separate macros
+// OP - comparison; "*lhs OP rhs" being true means *lhs must be replaced by rhs
+#define MIN_MAX_CRITSECT(OP, LCK_ID)                                           \
+  __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);                       \
+                                                                               \
+  if (*lhs OP rhs) { /* checked while holding the lock */                      \
+    *lhs = rhs;                                                                \
+  }                                                                            \
+  __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);
+
+// -------------------------------------------------------------------------
+#ifdef KMP_GOMP_COMPAT
+#define GOMP_MIN_MAX_CRITSECT(OP, FLAG)                                        \
+  if ((FLAG) && (__kmp_atomic_mode == 2)) {                                    \
+    KMP_CHECK_GTID;                                                            \
+    MIN_MAX_CRITSECT(OP, 0); /* LCK_ID 0: uses ATOMIC_LOCK0 */                 \
+    return; /* rest of the caller is skipped */                                \
+  }
+#else
+#define GOMP_MIN_MAX_CRITSECT(OP, FLAG) /* expands to nothing */
+#endif /* KMP_GOMP_COMPAT */
+
+// -------------------------------------------------------------------------
+#define MIN_MAX_CMPXCHG(TYPE, BITS, OP)                                        \
+  {                                                                            \
+    TYPE KMP_ATOMIC_VOLATILE temp_val;                                         \
+    TYPE old_value;                                                            \
+    temp_val = *lhs; /* single read of *lhs via temp_val */                    \
+    old_value = temp_val;                                                      \
+    while (old_value OP rhs && /* still need actions? */                       \
+           !KMP_COMPARE_AND_STORE_ACQ##BITS(                                   \
+               (kmp_int##BITS *)lhs,                                           \
+               *VOLATILE_CAST(kmp_int##BITS *) & old_value,                    \
+               *VOLATILE_CAST(kmp_int##BITS *) & rhs)) {                       \
+      KMP_CPU_PAUSE(); /* store failed: reload, retry */                       \
+      temp_val = *lhs;                                                         \
+      old_value = temp_val;                                                    \
+    }                                                                          \
+  }
+
+// -------------------------------------------------------------------------
+// Critical-section variant (instantiated below only for the float16 types)
+#define MIN_MAX_CRITICAL(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG)          \
+  ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void)                                     \
+  if (*lhs OP rhs) { /* unlocked pre-check; repeated under lock */             \
+    GOMP_MIN_MAX_CRITSECT(OP, GOMP_FLAG)                                       \
+    MIN_MAX_CRITSECT(OP, LCK_ID)                                               \
+  }                                                                            \
+  } /* closes the function body presumably opened by ATOMIC_BEGIN */
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+// -------------------------------------------------------------------------
+// X86 or X86_64: no alignment problems ====================================
+#define MIN_MAX_COMPXCHG(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK,         \
+                         GOMP_FLAG)                                            \
+  ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void)                                     \
+  if (*lhs OP rhs) { /* nothing to do unless this holds */                     \
+    GOMP_MIN_MAX_CRITSECT(OP, GOMP_FLAG)                                       \
+    MIN_MAX_CMPXCHG(TYPE, BITS, OP) /* x86: MASK, LCK_ID not used */           \
+  }                                                                            \
+  } /* closes the function body presumably opened by ATOMIC_BEGIN */
+
+#else
+// -------------------------------------------------------------------------
+// Code for other architectures that don't handle unaligned accesses.
+#define MIN_MAX_COMPXCHG(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK,         \
+                         GOMP_FLAG)                                            \
+  ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void)                                     \
+  if (*lhs OP rhs) { /* nothing to do unless this holds */                     \
+    GOMP_MIN_MAX_CRITSECT(OP, GOMP_FLAG)                                       \
+    if (!((kmp_uintptr_t)lhs & 0x##MASK)) { /* MASK bits clear => aligned */   \
+      MIN_MAX_CMPXCHG(TYPE, BITS, OP) /* aligned address */                    \
+    } else {                                                                   \
+      KMP_CHECK_GTID;                                                          \
+      MIN_MAX_CRITSECT(OP, LCK_ID) /* unaligned address */                     \
+    }                                                                          \
+  }                                                                            \
+  } /* closes the function body presumably opened by ATOMIC_BEGIN */
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+// __kmpc_atomic_fixed1_max
+MIN_MAX_COMPXCHG(fixed1, max, char, 8, <, 1i, 0, KMP_ARCH_X86)
+// __kmpc_atomic_fixed1_min
+MIN_MAX_COMPXCHG(fixed1, min, char, 8, >, 1i, 0, KMP_ARCH_X86)
+// __kmpc_atomic_fixed2_max
+MIN_MAX_COMPXCHG(fixed2, max, short, 16, <, 2i, 1, KMP_ARCH_X86)
+// __kmpc_atomic_fixed2_min
+MIN_MAX_COMPXCHG(fixed2, min, short, 16, >, 2i, 1, KMP_ARCH_X86)
+// __kmpc_atomic_fixed4_max
+MIN_MAX_COMPXCHG(fixed4, max, kmp_int32, 32, <, 4i, 3, 0)
+// __kmpc_atomic_fixed4_min
+MIN_MAX_COMPXCHG(fixed4, min, kmp_int32, 32, >, 4i, 3, 0)
+// __kmpc_atomic_fixed8_max
+MIN_MAX_COMPXCHG(fixed8, max, kmp_int64, 64, <, 8i, 7, KMP_ARCH_X86)
+// __kmpc_atomic_fixed8_min
+MIN_MAX_COMPXCHG(fixed8, min, kmp_int64, 64, >, 8i, 7, KMP_ARCH_X86)
+// __kmpc_atomic_float4_max
+MIN_MAX_COMPXCHG(float4, max, kmp_real32, 32, <, 4r, 3, KMP_ARCH_X86)
+// __kmpc_atomic_float4_min
+MIN_MAX_COMPXCHG(float4, min, kmp_real32, 32, >, 4r, 3, KMP_ARCH_X86)
+// __kmpc_atomic_float8_max
+MIN_MAX_COMPXCHG(float8, max, kmp_real64, 64, <, 8r, 7, KMP_ARCH_X86)
+// __kmpc_atomic_float8_min
+MIN_MAX_COMPXCHG(float8, min, kmp_real64, 64, >, 8r, 7, KMP_ARCH_X86)
+#if KMP_HAVE_QUAD
+// __kmpc_atomic_float16_max
+MIN_MAX_CRITICAL(float16, max, QUAD_LEGACY, <, 16r, 1)
+// __kmpc_atomic_float16_min
+MIN_MAX_CRITICAL(float16, min, QUAD_LEGACY, >, 16r, 1)
+#if (KMP_ARCH_X86)
+// __kmpc_atomic_float16_max_a16
+MIN_MAX_CRITICAL(float16, max_a16, Quad_a16_t, <, 16r, 1)
+// __kmpc_atomic_float16_min_a16
+MIN_MAX_CRITICAL(float16, min_a16, Quad_a16_t, >, 16r, 1)
+#endif // (KMP_ARCH_X86)
+#endif // KMP_HAVE_QUAD
+// ------------------------------------------------------------------------
+// Need separate macros for .EQV. because of the need of complement (~)
+// OP ignored for critical sections, ^=~ used instead
+#define ATOMIC_CRIT_EQV(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG)           \
+  ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void)                                     \
+  OP_GOMP_CRITICAL(^= ~, GOMP_FLAG) /* OP param unused; passes ^= ~ */         \
+  OP_CRITICAL(^= ~, LCK_ID) /* send assignment and complement */               \
+  } /* closes the function body presumably opened by ATOMIC_BEGIN */
+
+// ------------------------------------------------------------------------
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+// ------------------------------------------------------------------------
+// X86 or X86_64: no alignment problems ===================================
+#define ATOMIC_CMPX_EQV(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK,          \
+                        GOMP_FLAG)                                             \
+  ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void)                                     \
+  OP_GOMP_CRITICAL(^= ~, GOMP_FLAG) /* send assignment */                      \
+  OP_CMPXCHG(TYPE, BITS, OP) /* OP is ^~ at the call sites below */            \
+  } /* closes the function body presumably opened by ATOMIC_BEGIN */
+// ------------------------------------------------------------------------
+#else
+// ------------------------------------------------------------------------
+// Code for other architectures that don't handle unaligned accesses.
+#define ATOMIC_CMPX_EQV(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, MASK,          \
+                        GOMP_FLAG)                                             \
+  ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void)                                     \
+  OP_GOMP_CRITICAL(^= ~, GOMP_FLAG)                                            \
+  if (!((kmp_uintptr_t)lhs & 0x##MASK)) { /* MASK bits clear => aligned */     \
+    OP_CMPXCHG(TYPE, BITS, OP) /* aligned address */                           \
+  } else {                                                                     \
+    KMP_CHECK_GTID;                                                            \
+    OP_CRITICAL(^= ~, LCK_ID) /* unaligned address - use critical */           \
+  }                                                                            \
+  } /* closes the function body presumably opened by ATOMIC_BEGIN */
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+// __kmpc_atomic_fixed1_neqv
+ATOMIC_CMPXCHG(fixed1, neqv, kmp_int8, 8, ^, 1i, 0, KMP_ARCH_X86)
+// __kmpc_atomic_fixed2_neqv
+ATOMIC_CMPXCHG(fixed2, neqv, kmp_int16, 16, ^, 2i, 1, KMP_ARCH_X86)
+// __kmpc_atomic_fixed4_neqv
+ATOMIC_CMPXCHG(fixed4, neqv, kmp_int32, 32, ^, 4i, 3, KMP_ARCH_X86)
+// __kmpc_atomic_fixed8_neqv
+ATOMIC_CMPXCHG(fixed8, neqv, kmp_int64, 64, ^, 8i, 7, KMP_ARCH_X86)
+// __kmpc_atomic_fixed1_eqv
+ATOMIC_CMPX_EQV(fixed1, eqv, kmp_int8, 8, ^~, 1i, 0, KMP_ARCH_X86)
+// __kmpc_atomic_fixed2_eqv
+ATOMIC_CMPX_EQV(fixed2, eqv, kmp_int16, 16, ^~, 2i, 1, KMP_ARCH_X86)
+// __kmpc_atomic_fixed4_eqv
+ATOMIC_CMPX_EQV(fixed4, eqv, kmp_int32, 32, ^~, 4i, 3, KMP_ARCH_X86)
+// __kmpc_atomic_fixed8_eqv
+ATOMIC_CMPX_EQV(fixed8, eqv, kmp_int64, 64, ^~, 8i, 7, KMP_ARCH_X86)
+
+// ------------------------------------------------------------------------
+// Routines for Extended types: long double, _Quad, complex flavours (use
+// critical section)
+//     TYPE_ID, OP_ID, TYPE - detailed above
+//     OP      - operator
+//     LCK_ID  - lock identifier, used to possibly distinguish lock variable
+#define ATOMIC_CRITICAL(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG)           \
+  ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void)                                     \
+  OP_GOMP_CRITICAL(OP## =, GOMP_FLAG) /* OP## = yields e.g. += */              \
+  OP_CRITICAL(OP## =, LCK_ID) /* send assignment */                            \
+  } /* closes the function body presumably opened by ATOMIC_BEGIN */
+
+/* ------------------------------------------------------------------------- */
+// routines for long double type
+// __kmpc_atomic_float10_add
+ATOMIC_CRITICAL(float10, add, long double, +, 10r, 1)
+// __kmpc_atomic_float10_sub
+ATOMIC_CRITICAL(float10, sub, long double, -, 10r, 1)
+// __kmpc_atomic_float10_mul
+ATOMIC_CRITICAL(float10, mul, long double, *, 10r, 1)
+// __kmpc_atomic_float10_div
+ATOMIC_CRITICAL(float10, div, long double, /, 10r, 1)
+#if KMP_HAVE_QUAD
+// routines for _Quad type
+// __kmpc_atomic_float16_add
+ATOMIC_CRITICAL(float16, add, QUAD_LEGACY, +, 16r, 1)
+// __kmpc_atomic_float16_sub
+ATOMIC_CRITICAL(float16, sub, QUAD_LEGACY, -, 16r, 1)
+// __kmpc_atomic_float16_mul
+ATOMIC_CRITICAL(float16, mul, QUAD_LEGACY, *, 16r, 1)
+// __kmpc_atomic_float16_div
+ATOMIC_CRITICAL(float16, div, QUAD_LEGACY, /, 16r, 1)
+#if (KMP_ARCH_X86)
+// __kmpc_atomic_float16_add_a16
+ATOMIC_CRITICAL(float16, add_a16, Quad_a16_t, +, 16r, 1)
+// __kmpc_atomic_float16_sub_a16
+ATOMIC_CRITICAL(float16, sub_a16, Quad_a16_t, -, 16r, 1)
+// __kmpc_atomic_float16_mul_a16
+ATOMIC_CRITICAL(float16, mul_a16, Quad_a16_t, *, 16r, 1)
+// __kmpc_atomic_float16_div_a16
+ATOMIC_CRITICAL(float16, div_a16, Quad_a16_t, /, 16r, 1)
+#endif // (KMP_ARCH_X86)
+#endif // KMP_HAVE_QUAD
+// routines for complex types
+
+#if USE_CMPXCHG_FIX
+// workaround for C78287 (complex(kind=4) data type)
+// __kmpc_atomic_cmplx4_add
+ATOMIC_CMPXCHG_WORKAROUND(cmplx4, add, kmp_cmplx32, 64, +, 8c, 7, 1)
+// __kmpc_atomic_cmplx4_sub
+ATOMIC_CMPXCHG_WORKAROUND(cmplx4, sub, kmp_cmplx32, 64, -, 8c, 7, 1)
+// __kmpc_atomic_cmplx4_mul
+ATOMIC_CMPXCHG_WORKAROUND(cmplx4, mul, kmp_cmplx32, 64, *, 8c, 7, 1)
+// __kmpc_atomic_cmplx4_div
+ATOMIC_CMPXCHG_WORKAROUND(cmplx4, div, kmp_cmplx32, 64, /, 8c, 7, 1)
+// end of the workaround for C78287
+#else
+ATOMIC_CRITICAL(cmplx4, add, kmp_cmplx32, +, 8c, 1) // __kmpc_atomic_cmplx4_add
+ATOMIC_CRITICAL(cmplx4, sub, kmp_cmplx32, -, 8c, 1) // __kmpc_atomic_cmplx4_sub
+ATOMIC_CRITICAL(cmplx4, mul, kmp_cmplx32, *, 8c, 1) // __kmpc_atomic_cmplx4_mul
+ATOMIC_CRITICAL(cmplx4, div, kmp_cmplx32, /, 8c, 1) // __kmpc_atomic_cmplx4_div
+#endif // USE_CMPXCHG_FIX
+
+ATOMIC_CRITICAL(cmplx8, add, kmp_cmplx64, +, 16c, 1) // __kmpc_atomic_cmplx8_add
+ATOMIC_CRITICAL(cmplx8, sub, kmp_cmplx64, -, 16c, 1) // __kmpc_atomic_cmplx8_sub
+ATOMIC_CRITICAL(cmplx8, mul, kmp_cmplx64, *, 16c, 1) // __kmpc_atomic_cmplx8_mul
+ATOMIC_CRITICAL(cmplx8, div, kmp_cmplx64, /, 16c, 1) // __kmpc_atomic_cmplx8_div
+// __kmpc_atomic_cmplx10_add
+ATOMIC_CRITICAL(cmplx10, add, kmp_cmplx80, +, 20c, 1)
+// __kmpc_atomic_cmplx10_sub
+ATOMIC_CRITICAL(cmplx10, sub, kmp_cmplx80, -, 20c, 1)
+// __kmpc_atomic_cmplx10_mul
+ATOMIC_CRITICAL(cmplx10, mul, kmp_cmplx80, *, 20c, 1)
+// __kmpc_atomic_cmplx10_div
+ATOMIC_CRITICAL(cmplx10, div, kmp_cmplx80, /, 20c, 1)
+#if KMP_HAVE_QUAD
+// __kmpc_atomic_cmplx16_add
+ATOMIC_CRITICAL(cmplx16, add, CPLX128_LEG, +, 32c, 1)
+// __kmpc_atomic_cmplx16_sub
+ATOMIC_CRITICAL(cmplx16, sub, CPLX128_LEG, -, 32c, 1)
+// __kmpc_atomic_cmplx16_mul
+ATOMIC_CRITICAL(cmplx16, mul, CPLX128_LEG, *, 32c, 1)
+// __kmpc_atomic_cmplx16_div
+ATOMIC_CRITICAL(cmplx16, div, CPLX128_LEG, /, 32c, 1)
+#if (KMP_ARCH_X86)
+// __kmpc_atomic_cmplx16_add_a16
+ATOMIC_CRITICAL(cmplx16, add_a16, kmp_cmplx128_a16_t, +, 32c, 1)
+// __kmpc_atomic_cmplx16_sub_a16
+ATOMIC_CRITICAL(cmplx16, sub_a16, kmp_cmplx128_a16_t, -, 32c, 1)
+// __kmpc_atomic_cmplx16_mul_a16
+ATOMIC_CRITICAL(cmplx16, mul_a16, kmp_cmplx128_a16_t, *, 32c, 1)
+// __kmpc_atomic_cmplx16_div_a16
+ATOMIC_CRITICAL(cmplx16, div_a16, kmp_cmplx128_a16_t, /, 32c, 1)
+#endif // (KMP_ARCH_X86)
+#endif // KMP_HAVE_QUAD
+
+// OpenMP 4.0: x = expr binop x for non-commutative operations.
+// Supported only on IA-32 architecture and Intel(R) 64
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+// ------------------------------------------------------------------------
+// Operation on *lhs, rhs bound by critical section
+//     OP     - binary operator without assignment; computes rhs OP *lhs
+//     LCK_ID - lock identifier
+// Note: don't check gtid as it should always be valid
+// 1, 2-byte - expect valid parameter, other - check before this macro
+#define OP_CRITICAL_REV(OP, LCK_ID)                                            \
+  __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);                       \
+                                                                               \
+  (*lhs) = (rhs)OP(*lhs); /* reversed operands: rhs OP *lhs */                 \
+                                                                               \
+  __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);
+
+#ifdef KMP_GOMP_COMPAT
+#define OP_GOMP_CRITICAL_REV(OP, FLAG)                                         \
+  if ((FLAG) && (__kmp_atomic_mode == 2)) {                                    \
+    KMP_CHECK_GTID;                                                            \
+    OP_CRITICAL_REV(OP, 0); /* LCK_ID 0: uses ATOMIC_LOCK0 */                  \
+    return; /* rest of the caller is skipped */                                \
+  }
+#else
+#define OP_GOMP_CRITICAL_REV(OP, FLAG) /* expands to nothing */
+#endif /* KMP_GOMP_COMPAT */
+
+// Beginning of a definition (provides name, parameters, debug trace); the
+// function body is left open and is closed by the macro using this one.
+//     TYPE_ID - operands type and size (fixed* signed, fixed*u unsigned)
+//     OP_ID   - operation identifier (add, sub, mul, ...)
+//     TYPE    - operands' type; RET_TYPE - return type of the function
+#define ATOMIC_BEGIN_REV(TYPE_ID, OP_ID, TYPE, RET_TYPE)                       \
+  RET_TYPE __kmpc_atomic_##TYPE_ID##_##OP_ID##_rev(ident_t *id_ref, int gtid,  \
+                                                   TYPE *lhs, TYPE rhs) {      \
+    KMP_DEBUG_ASSERT(__kmp_init_serial);                                       \
+    KA_TRACE(100, ("__kmpc_atomic_" #TYPE_ID "_" #OP_ID "_rev: T#%d\n", gtid));
+
+// ------------------------------------------------------------------------
+// Operation on *lhs, rhs using "compare_and_store" routine
+//     TYPE    - operands' type
+//     BITS    - size in bits, used to distinguish low level calls
+//     OP      - operator
+// Note: temp_val introduced in order to force the compiler to read
+//       *lhs only once (w/o it the compiler reads *lhs twice)
+#define OP_CMPXCHG_REV(TYPE, BITS, OP)                                         \
+  {                                                                            \
+    TYPE KMP_ATOMIC_VOLATILE temp_val;                                         \
+    TYPE old_value, new_value;                                                 \
+    temp_val = *lhs; /* single read of *lhs via temp_val */                    \
+    old_value = temp_val;                                                      \
+    new_value = rhs OP old_value; /* reversed: rhs is the left operand */      \
+    while (!KMP_COMPARE_AND_STORE_ACQ##BITS(                                   \
+        (kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) & old_value,     \
+        *VOLATILE_CAST(kmp_int##BITS *) & new_value)) {                        \
+      KMP_DO_PAUSE; /* store failed: reload, retry */                          \
+                                                                               \
+      temp_val = *lhs;                                                         \
+      old_value = temp_val;                                                    \
+      new_value = rhs OP old_value;                                            \
+    }                                                                          \
+  }
+
+// Builds a void *_rev routine: optional GOMP-compat critical path, then CAS.
+#define ATOMIC_CMPXCHG_REV(TYPE_ID, OP_ID, TYPE, BITS, OP, LCK_ID, GOMP_FLAG)  \
+  ATOMIC_BEGIN_REV(TYPE_ID, OP_ID, TYPE, void)                                 \
+  OP_GOMP_CRITICAL_REV(OP, GOMP_FLAG)                                          \
+  OP_CMPXCHG_REV(TYPE, BITS, OP)                                               \
+  }
+
+// ------------------------------------------------------------------------
+// Entries definition for integer and floating-point operands
+//     TYPE_ID   - operands type and size (fixed4, float4)
+//     OP_ID     - operation identifier (sub, div, shl, shr)
+//     TYPE      - operand type
+//     BITS      - size in bits, used to distinguish low level calls
+//     OP        - operator (used in critical section)
+//     LCK_ID    - lock identifier, used to possibly distinguish lock variable
+//     GOMP_FLAG - passed to OP_GOMP_CRITICAL_REV (GOMP compatibility path)
+
+// ------------------------------------------------------------------------
+// Routines for ATOMIC integer and real operands, non-commutative operators
+// ------------------------------------------------------------------------
+//                  TYPE_ID,OP_ID, TYPE,    BITS, OP, LCK_ID, GOMP_FLAG
+ATOMIC_CMPXCHG_REV(fixed1, div, kmp_int8, 8, /, 1i,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed1_div_rev
+ATOMIC_CMPXCHG_REV(fixed1u, div, kmp_uint8, 8, /, 1i,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed1u_div_rev
+ATOMIC_CMPXCHG_REV(fixed1, shl, kmp_int8, 8, <<, 1i,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed1_shl_rev
+ATOMIC_CMPXCHG_REV(fixed1, shr, kmp_int8, 8, >>, 1i,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed1_shr_rev
+ATOMIC_CMPXCHG_REV(fixed1u, shr, kmp_uint8, 8, >>, 1i,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed1u_shr_rev
+ATOMIC_CMPXCHG_REV(fixed1, sub, kmp_int8, 8, -, 1i,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed1_sub_rev
+
+ATOMIC_CMPXCHG_REV(fixed2, div, kmp_int16, 16, /, 2i,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed2_div_rev
+ATOMIC_CMPXCHG_REV(fixed2u, div, kmp_uint16, 16, /, 2i,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed2u_div_rev
+ATOMIC_CMPXCHG_REV(fixed2, shl, kmp_int16, 16, <<, 2i,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed2_shl_rev
+ATOMIC_CMPXCHG_REV(fixed2, shr, kmp_int16, 16, >>, 2i,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed2_shr_rev
+ATOMIC_CMPXCHG_REV(fixed2u, shr, kmp_uint16, 16, >>, 2i,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed2u_shr_rev
+ATOMIC_CMPXCHG_REV(fixed2, sub, kmp_int16, 16, -, 2i,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed2_sub_rev
+
+ATOMIC_CMPXCHG_REV(fixed4, div, kmp_int32, 32, /, 4i,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed4_div_rev
+ATOMIC_CMPXCHG_REV(fixed4u, div, kmp_uint32, 32, /, 4i,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed4u_div_rev
+ATOMIC_CMPXCHG_REV(fixed4, shl, kmp_int32, 32, <<, 4i,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed4_shl_rev
+ATOMIC_CMPXCHG_REV(fixed4, shr, kmp_int32, 32, >>, 4i,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed4_shr_rev
+ATOMIC_CMPXCHG_REV(fixed4u, shr, kmp_uint32, 32, >>, 4i,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed4u_shr_rev
+ATOMIC_CMPXCHG_REV(fixed4, sub, kmp_int32, 32, -, 4i,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed4_sub_rev
+
+ATOMIC_CMPXCHG_REV(fixed8, div, kmp_int64, 64, /, 8i,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8_div_rev
+ATOMIC_CMPXCHG_REV(fixed8u, div, kmp_uint64, 64, /, 8i,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8u_div_rev
+ATOMIC_CMPXCHG_REV(fixed8, shl, kmp_int64, 64, <<, 8i,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8_shl_rev
+ATOMIC_CMPXCHG_REV(fixed8, shr, kmp_int64, 64, >>, 8i,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8_shr_rev
+ATOMIC_CMPXCHG_REV(fixed8u, shr, kmp_uint64, 64, >>, 8i,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8u_shr_rev
+ATOMIC_CMPXCHG_REV(fixed8, sub, kmp_int64, 64, -, 8i,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8_sub_rev
+
+ATOMIC_CMPXCHG_REV(float4, div, kmp_real32, 32, /, 4r,
+                   KMP_ARCH_X86) // __kmpc_atomic_float4_div_rev
+ATOMIC_CMPXCHG_REV(float4, sub, kmp_real32, 32, -, 4r,
+                   KMP_ARCH_X86) // __kmpc_atomic_float4_sub_rev
+
+ATOMIC_CMPXCHG_REV(float8, div, kmp_real64, 64, /, 8r,
+                   KMP_ARCH_X86) // __kmpc_atomic_float8_div_rev
+ATOMIC_CMPXCHG_REV(float8, sub, kmp_real64, 64, -, 8r,
+                   KMP_ARCH_X86) // __kmpc_atomic_float8_sub_rev
+//                  TYPE_ID,OP_ID, TYPE,     BITS,OP,LCK_ID, GOMP_FLAG
+
+// ------------------------------------------------------------------------
+// Reversed routines for Extended types: long double, _Quad, complex flavours
+// (the update itself is done by OP_CRITICAL_REV, i.e. in a critical section)
+//     TYPE_ID, OP_ID, TYPE - detailed above
+//     OP        - operator
+//     LCK_ID    - lock identifier; GOMP_FLAG - flag for OP_GOMP_CRITICAL_REV
+#define ATOMIC_CRITICAL_REV(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG)       \
+  ATOMIC_BEGIN_REV(TYPE_ID, OP_ID, TYPE, void)                                 \
+  OP_GOMP_CRITICAL_REV(OP, GOMP_FLAG)                                          \
+  OP_CRITICAL_REV(OP, LCK_ID)                                                  \
+  }
+
+/* ------------------------------------------------------------------------- */
+// long double (float10) routines: reversed sub and div, LCK_ID 10r
+ATOMIC_CRITICAL_REV(float10, sub, long double, -, 10r,
+                    1) // __kmpc_atomic_float10_sub_rev
+ATOMIC_CRITICAL_REV(float10, div, long double, /, 10r,
+                    1) // __kmpc_atomic_float10_div_rev
+#if KMP_HAVE_QUAD
+// _Quad (float16) routines; *_a16 variants are built only when KMP_ARCH_X86
+ATOMIC_CRITICAL_REV(float16, sub, QUAD_LEGACY, -, 16r,
+                    1) // __kmpc_atomic_float16_sub_rev
+ATOMIC_CRITICAL_REV(float16, div, QUAD_LEGACY, /, 16r,
+                    1) // __kmpc_atomic_float16_div_rev
+#if (KMP_ARCH_X86)
+ATOMIC_CRITICAL_REV(float16, sub_a16, Quad_a16_t, -, 16r,
+                    1) // __kmpc_atomic_float16_sub_a16_rev
+ATOMIC_CRITICAL_REV(float16, div_a16, Quad_a16_t, /, 16r,
+                    1) // __kmpc_atomic_float16_div_a16_rev
+#endif // KMP_ARCH_X86
+#endif // KMP_HAVE_QUAD
+
+// complex routines: cmplx4, cmplx8, cmplx10, and cmplx16 when KMP_HAVE_QUAD
+ATOMIC_CRITICAL_REV(cmplx4, sub, kmp_cmplx32, -, 8c,
+                    1) // __kmpc_atomic_cmplx4_sub_rev
+ATOMIC_CRITICAL_REV(cmplx4, div, kmp_cmplx32, /, 8c,
+                    1) // __kmpc_atomic_cmplx4_div_rev
+ATOMIC_CRITICAL_REV(cmplx8, sub, kmp_cmplx64, -, 16c,
+                    1) // __kmpc_atomic_cmplx8_sub_rev
+ATOMIC_CRITICAL_REV(cmplx8, div, kmp_cmplx64, /, 16c,
+                    1) // __kmpc_atomic_cmplx8_div_rev
+ATOMIC_CRITICAL_REV(cmplx10, sub, kmp_cmplx80, -, 20c,
+                    1) // __kmpc_atomic_cmplx10_sub_rev
+ATOMIC_CRITICAL_REV(cmplx10, div, kmp_cmplx80, /, 20c,
+                    1) // __kmpc_atomic_cmplx10_div_rev
+#if KMP_HAVE_QUAD
+ATOMIC_CRITICAL_REV(cmplx16, sub, CPLX128_LEG, -, 32c,
+                    1) // __kmpc_atomic_cmplx16_sub_rev
+ATOMIC_CRITICAL_REV(cmplx16, div, CPLX128_LEG, /, 32c,
+                    1) // __kmpc_atomic_cmplx16_div_rev
+#if (KMP_ARCH_X86)
+ATOMIC_CRITICAL_REV(cmplx16, sub_a16, kmp_cmplx128_a16_t, -, 32c,
+                    1) // __kmpc_atomic_cmplx16_sub_a16_rev
+ATOMIC_CRITICAL_REV(cmplx16, div_a16, kmp_cmplx128_a16_t, /, 32c,
+                    1) // __kmpc_atomic_cmplx16_div_a16_rev
+#endif // KMP_ARCH_X86
+#endif // KMP_HAVE_QUAD
+
+#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
+// End of OpenMP 4.0: x = expr binop x for non-commutative operations.
+
+/* ------------------------------------------------------------------------ */
+/* Routines for mixed types of LHS and RHS, when RHS is "larger"            */
+/* Note: in order to reduce the total number of types combinations          */
+/*       it is supposed that compiler converts RHS to longest floating type,*/
+/*       that is _Quad, before call to any of these routines                */
+/* Conversion to _Quad will be done by the compiler during calculation,     */
+/*    conversion back to TYPE - before the assignment, like:                */
+/*    *lhs = (TYPE)( (_Quad)(*lhs) OP rhs )                                 */
+/* Performance penalty expected because of SW emulation use                 */
+/* ------------------------------------------------------------------------ */
+// Opens __kmpc_atomic_<TYPE_ID>_<OP_ID>_<RTYPE_ID>; invoking macro adds '}'.
+#define ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE)                \
+  void __kmpc_atomic_##TYPE_ID##_##OP_ID##_##RTYPE_ID(                         \
+      ident_t *id_ref, int gtid, TYPE *lhs, RTYPE rhs) {                       \
+    KMP_DEBUG_ASSERT(__kmp_init_serial);                                       \
+    KA_TRACE(100,                                                              \
+             ("__kmpc_atomic_" #TYPE_ID "_" #OP_ID "_" #RTYPE_ID ": T#%d\n",   \
+              gtid));
+
+// Mixed-type routine that applies "OP=" through OP_CRITICAL using LCK_ID.
+#define ATOMIC_CRITICAL_FP(TYPE_ID, TYPE, OP_ID, OP, RTYPE_ID, RTYPE, LCK_ID,  \
+                           GOMP_FLAG)                                          \
+  ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE)                      \
+  OP_GOMP_CRITICAL(OP## =, GOMP_FLAG) /* send assignment */                    \
+  OP_CRITICAL(OP## =, LCK_ID) /* send assignment */                            \
+  }
+
+// ATOMIC_CMPXCHG_MIX: mixed-type update of *lhs performed with OP_CMPXCHG.
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+// LCK_ID and MASK are not used by this variant.
+// X86 or X86_64: no alignment problems ====================================
+#define ATOMIC_CMPXCHG_MIX(TYPE_ID, TYPE, OP_ID, BITS, OP, RTYPE_ID, RTYPE,    \
+                           LCK_ID, MASK, GOMP_FLAG)                            \
+  ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE)                      \
+  OP_GOMP_CRITICAL(OP## =, GOMP_FLAG)                                          \
+  OP_CMPXCHG(TYPE, BITS, OP)                                                   \
+  }
+// -------------------------------------------------------------------------
+#else
+// MASK is pasted after "0x"; if lhs has any of those bits set -> OP_CRITICAL.
+// Code for other architectures that don't handle unaligned accesses.
+#define ATOMIC_CMPXCHG_MIX(TYPE_ID, TYPE, OP_ID, BITS, OP, RTYPE_ID, RTYPE,    \
+                           LCK_ID, MASK, GOMP_FLAG)                            \
+  ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE)                      \
+  OP_GOMP_CRITICAL(OP## =, GOMP_FLAG)                                          \
+  if (!((kmp_uintptr_t)lhs & 0x##MASK)) {                                      \
+    OP_CMPXCHG(TYPE, BITS, OP) /* aligned address */                           \
+  } else {                                                                     \
+    KMP_CHECK_GTID;                                                            \
+    OP_CRITICAL(OP## =, LCK_ID) /* unaligned address - use critical */         \
+  }                                                                            \
+  }
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+// Reversed (*lhs = rhs OP *lhs) mixed-type builders, X86 and X86_64 only.
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+// ATOMIC_CMPXCHG_REV_MIX -> OP_CMPXCHG_REV; ATOMIC_CRITICAL_REV_FP -> lock path
+#define ATOMIC_CMPXCHG_REV_MIX(TYPE_ID, TYPE, OP_ID, BITS, OP, RTYPE_ID,       \
+                               RTYPE, LCK_ID, MASK, GOMP_FLAG)                 \
+  ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE)                      \
+  OP_GOMP_CRITICAL_REV(OP, GOMP_FLAG)                                          \
+  OP_CMPXCHG_REV(TYPE, BITS, OP)                                               \
+  }
+#define ATOMIC_CRITICAL_REV_FP(TYPE_ID, TYPE, OP_ID, OP, RTYPE_ID, RTYPE,      \
+                               LCK_ID, GOMP_FLAG)                              \
+  ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE)                      \
+  OP_GOMP_CRITICAL_REV(OP, GOMP_FLAG)                                          \
+  OP_CRITICAL_REV(OP, LCK_ID)                                                  \
+  }
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+// RHS=float8: LHS is fixed1/2/4/8 (mul, div) or float4 (add, sub, mul, div)
+ATOMIC_CMPXCHG_MIX(fixed1, char, mul, 8, *, float8, kmp_real64, 1i, 0,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed1_mul_float8
+ATOMIC_CMPXCHG_MIX(fixed1, char, div, 8, /, float8, kmp_real64, 1i, 0,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed1_div_float8
+ATOMIC_CMPXCHG_MIX(fixed2, short, mul, 16, *, float8, kmp_real64, 2i, 1,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed2_mul_float8
+ATOMIC_CMPXCHG_MIX(fixed2, short, div, 16, /, float8, kmp_real64, 2i, 1,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed2_div_float8
+ATOMIC_CMPXCHG_MIX(fixed4, kmp_int32, mul, 32, *, float8, kmp_real64, 4i, 3,
+                   0) // __kmpc_atomic_fixed4_mul_float8
+ATOMIC_CMPXCHG_MIX(fixed4, kmp_int32, div, 32, /, float8, kmp_real64, 4i, 3,
+                   0) // __kmpc_atomic_fixed4_div_float8
+ATOMIC_CMPXCHG_MIX(fixed8, kmp_int64, mul, 64, *, float8, kmp_real64, 8i, 7,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8_mul_float8
+ATOMIC_CMPXCHG_MIX(fixed8, kmp_int64, div, 64, /, float8, kmp_real64, 8i, 7,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8_div_float8
+ATOMIC_CMPXCHG_MIX(float4, kmp_real32, add, 32, +, float8, kmp_real64, 4r, 3,
+                   KMP_ARCH_X86) // __kmpc_atomic_float4_add_float8
+ATOMIC_CMPXCHG_MIX(float4, kmp_real32, sub, 32, -, float8, kmp_real64, 4r, 3,
+                   KMP_ARCH_X86) // __kmpc_atomic_float4_sub_float8
+ATOMIC_CMPXCHG_MIX(float4, kmp_real32, mul, 32, *, float8, kmp_real64, 4r, 3,
+                   KMP_ARCH_X86) // __kmpc_atomic_float4_mul_float8
+ATOMIC_CMPXCHG_MIX(float4, kmp_real32, div, 32, /, float8, kmp_real64, 4r, 3,
+                   KMP_ARCH_X86) // __kmpc_atomic_float4_div_float8
+
+// RHS=float16: _Quad right-hand side, routine suffix "_fp" (deprecated, to be
+// removed when we are sure the compiler does not use them)
+#if KMP_HAVE_QUAD
+ATOMIC_CMPXCHG_MIX(fixed1, char, add, 8, +, fp, _Quad, 1i, 0,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed1_add_fp
+ATOMIC_CMPXCHG_MIX(fixed1u, uchar, add, 8, +, fp, _Quad, 1i, 0,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed1u_add_fp
+ATOMIC_CMPXCHG_MIX(fixed1, char, sub, 8, -, fp, _Quad, 1i, 0,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed1_sub_fp
+ATOMIC_CMPXCHG_MIX(fixed1u, uchar, sub, 8, -, fp, _Quad, 1i, 0,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed1u_sub_fp
+ATOMIC_CMPXCHG_MIX(fixed1, char, mul, 8, *, fp, _Quad, 1i, 0,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed1_mul_fp
+ATOMIC_CMPXCHG_MIX(fixed1u, uchar, mul, 8, *, fp, _Quad, 1i, 0,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed1u_mul_fp
+ATOMIC_CMPXCHG_MIX(fixed1, char, div, 8, /, fp, _Quad, 1i, 0,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed1_div_fp
+ATOMIC_CMPXCHG_MIX(fixed1u, uchar, div, 8, /, fp, _Quad, 1i, 0,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed1u_div_fp
+
+ATOMIC_CMPXCHG_MIX(fixed2, short, add, 16, +, fp, _Quad, 2i, 1,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed2_add_fp
+ATOMIC_CMPXCHG_MIX(fixed2u, ushort, add, 16, +, fp, _Quad, 2i, 1,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed2u_add_fp
+ATOMIC_CMPXCHG_MIX(fixed2, short, sub, 16, -, fp, _Quad, 2i, 1,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed2_sub_fp
+ATOMIC_CMPXCHG_MIX(fixed2u, ushort, sub, 16, -, fp, _Quad, 2i, 1,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed2u_sub_fp
+ATOMIC_CMPXCHG_MIX(fixed2, short, mul, 16, *, fp, _Quad, 2i, 1,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed2_mul_fp
+ATOMIC_CMPXCHG_MIX(fixed2u, ushort, mul, 16, *, fp, _Quad, 2i, 1,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed2u_mul_fp
+ATOMIC_CMPXCHG_MIX(fixed2, short, div, 16, /, fp, _Quad, 2i, 1,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed2_div_fp
+ATOMIC_CMPXCHG_MIX(fixed2u, ushort, div, 16, /, fp, _Quad, 2i, 1,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed2u_div_fp
+
+ATOMIC_CMPXCHG_MIX(fixed4, kmp_int32, add, 32, +, fp, _Quad, 4i, 3,
+                   0) // __kmpc_atomic_fixed4_add_fp
+ATOMIC_CMPXCHG_MIX(fixed4u, kmp_uint32, add, 32, +, fp, _Quad, 4i, 3,
+                   0) // __kmpc_atomic_fixed4u_add_fp
+ATOMIC_CMPXCHG_MIX(fixed4, kmp_int32, sub, 32, -, fp, _Quad, 4i, 3,
+                   0) // __kmpc_atomic_fixed4_sub_fp
+ATOMIC_CMPXCHG_MIX(fixed4u, kmp_uint32, sub, 32, -, fp, _Quad, 4i, 3,
+                   0) // __kmpc_atomic_fixed4u_sub_fp
+ATOMIC_CMPXCHG_MIX(fixed4, kmp_int32, mul, 32, *, fp, _Quad, 4i, 3,
+                   0) // __kmpc_atomic_fixed4_mul_fp
+ATOMIC_CMPXCHG_MIX(fixed4u, kmp_uint32, mul, 32, *, fp, _Quad, 4i, 3,
+                   0) // __kmpc_atomic_fixed4u_mul_fp
+ATOMIC_CMPXCHG_MIX(fixed4, kmp_int32, div, 32, /, fp, _Quad, 4i, 3,
+                   0) // __kmpc_atomic_fixed4_div_fp
+ATOMIC_CMPXCHG_MIX(fixed4u, kmp_uint32, div, 32, /, fp, _Quad, 4i, 3,
+                   0) // __kmpc_atomic_fixed4u_div_fp
+
+ATOMIC_CMPXCHG_MIX(fixed8, kmp_int64, add, 64, +, fp, _Quad, 8i, 7,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8_add_fp
+ATOMIC_CMPXCHG_MIX(fixed8u, kmp_uint64, add, 64, +, fp, _Quad, 8i, 7,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8u_add_fp
+ATOMIC_CMPXCHG_MIX(fixed8, kmp_int64, sub, 64, -, fp, _Quad, 8i, 7,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8_sub_fp
+ATOMIC_CMPXCHG_MIX(fixed8u, kmp_uint64, sub, 64, -, fp, _Quad, 8i, 7,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8u_sub_fp
+ATOMIC_CMPXCHG_MIX(fixed8, kmp_int64, mul, 64, *, fp, _Quad, 8i, 7,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8_mul_fp
+ATOMIC_CMPXCHG_MIX(fixed8u, kmp_uint64, mul, 64, *, fp, _Quad, 8i, 7,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8u_mul_fp
+ATOMIC_CMPXCHG_MIX(fixed8, kmp_int64, div, 64, /, fp, _Quad, 8i, 7,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8_div_fp
+ATOMIC_CMPXCHG_MIX(fixed8u, kmp_uint64, div, 64, /, fp, _Quad, 8i, 7,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8u_div_fp
+
+ATOMIC_CMPXCHG_MIX(float4, kmp_real32, add, 32, +, fp, _Quad, 4r, 3,
+                   KMP_ARCH_X86) // __kmpc_atomic_float4_add_fp
+ATOMIC_CMPXCHG_MIX(float4, kmp_real32, sub, 32, -, fp, _Quad, 4r, 3,
+                   KMP_ARCH_X86) // __kmpc_atomic_float4_sub_fp
+ATOMIC_CMPXCHG_MIX(float4, kmp_real32, mul, 32, *, fp, _Quad, 4r, 3,
+                   KMP_ARCH_X86) // __kmpc_atomic_float4_mul_fp
+ATOMIC_CMPXCHG_MIX(float4, kmp_real32, div, 32, /, fp, _Quad, 4r, 3,
+                   KMP_ARCH_X86) // __kmpc_atomic_float4_div_fp
+
+ATOMIC_CMPXCHG_MIX(float8, kmp_real64, add, 64, +, fp, _Quad, 8r, 7,
+                   KMP_ARCH_X86) // __kmpc_atomic_float8_add_fp
+ATOMIC_CMPXCHG_MIX(float8, kmp_real64, sub, 64, -, fp, _Quad, 8r, 7,
+                   KMP_ARCH_X86) // __kmpc_atomic_float8_sub_fp
+ATOMIC_CMPXCHG_MIX(float8, kmp_real64, mul, 64, *, fp, _Quad, 8r, 7,
+                   KMP_ARCH_X86) // __kmpc_atomic_float8_mul_fp
+ATOMIC_CMPXCHG_MIX(float8, kmp_real64, div, 64, /, fp, _Quad, 8r, 7,
+                   KMP_ARCH_X86) // __kmpc_atomic_float8_div_fp
+
+ATOMIC_CRITICAL_FP(float10, long double, add, +, fp, _Quad, 10r,
+                   1) // __kmpc_atomic_float10_add_fp
+ATOMIC_CRITICAL_FP(float10, long double, sub, -, fp, _Quad, 10r,
+                   1) // __kmpc_atomic_float10_sub_fp
+ATOMIC_CRITICAL_FP(float10, long double, mul, *, fp, _Quad, 10r,
+                   1) // __kmpc_atomic_float10_mul_fp
+ATOMIC_CRITICAL_FP(float10, long double, div, /, fp, _Quad, 10r,
+                   1) // __kmpc_atomic_float10_div_fp
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+// Reverse operations (*lhs = rhs OP *lhs): sub_rev and div_rev, _Quad RHS
+ATOMIC_CMPXCHG_REV_MIX(fixed1, char, sub_rev, 8, -, fp, _Quad, 1i, 0,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed1_sub_rev_fp
+ATOMIC_CMPXCHG_REV_MIX(fixed1u, uchar, sub_rev, 8, -, fp, _Quad, 1i, 0,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed1u_sub_rev_fp
+ATOMIC_CMPXCHG_REV_MIX(fixed1, char, div_rev, 8, /, fp, _Quad, 1i, 0,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed1_div_rev_fp
+ATOMIC_CMPXCHG_REV_MIX(fixed1u, uchar, div_rev, 8, /, fp, _Quad, 1i, 0,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed1u_div_rev_fp
+
+ATOMIC_CMPXCHG_REV_MIX(fixed2, short, sub_rev, 16, -, fp, _Quad, 2i, 1,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed2_sub_rev_fp
+ATOMIC_CMPXCHG_REV_MIX(fixed2u, ushort, sub_rev, 16, -, fp, _Quad, 2i, 1,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed2u_sub_rev_fp
+ATOMIC_CMPXCHG_REV_MIX(fixed2, short, div_rev, 16, /, fp, _Quad, 2i, 1,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed2_div_rev_fp
+ATOMIC_CMPXCHG_REV_MIX(fixed2u, ushort, div_rev, 16, /, fp, _Quad, 2i, 1,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed2u_div_rev_fp
+
+ATOMIC_CMPXCHG_REV_MIX(fixed4, kmp_int32, sub_rev, 32, -, fp, _Quad, 4i, 3,
+                       0) // __kmpc_atomic_fixed4_sub_rev_fp
+ATOMIC_CMPXCHG_REV_MIX(fixed4u, kmp_uint32, sub_rev, 32, -, fp, _Quad, 4i, 3,
+                       0) // __kmpc_atomic_fixed4u_sub_rev_fp
+ATOMIC_CMPXCHG_REV_MIX(fixed4, kmp_int32, div_rev, 32, /, fp, _Quad, 4i, 3,
+                       0) // __kmpc_atomic_fixed4_div_rev_fp
+ATOMIC_CMPXCHG_REV_MIX(fixed4u, kmp_uint32, div_rev, 32, /, fp, _Quad, 4i, 3,
+                       0) // __kmpc_atomic_fixed4u_div_rev_fp
+
+ATOMIC_CMPXCHG_REV_MIX(fixed8, kmp_int64, sub_rev, 64, -, fp, _Quad, 8i, 7,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed8_sub_rev_fp
+ATOMIC_CMPXCHG_REV_MIX(fixed8u, kmp_uint64, sub_rev, 64, -, fp, _Quad, 8i, 7,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed8u_sub_rev_fp
+ATOMIC_CMPXCHG_REV_MIX(fixed8, kmp_int64, div_rev, 64, /, fp, _Quad, 8i, 7,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed8_div_rev_fp
+ATOMIC_CMPXCHG_REV_MIX(fixed8u, kmp_uint64, div_rev, 64, /, fp, _Quad, 8i, 7,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed8u_div_rev_fp
+
+ATOMIC_CMPXCHG_REV_MIX(float4, kmp_real32, sub_rev, 32, -, fp, _Quad, 4r, 3,
+                       KMP_ARCH_X86) // __kmpc_atomic_float4_sub_rev_fp
+ATOMIC_CMPXCHG_REV_MIX(float4, kmp_real32, div_rev, 32, /, fp, _Quad, 4r, 3,
+                       KMP_ARCH_X86) // __kmpc_atomic_float4_div_rev_fp
+
+ATOMIC_CMPXCHG_REV_MIX(float8, kmp_real64, sub_rev, 64, -, fp, _Quad, 8r, 7,
+                       KMP_ARCH_X86) // __kmpc_atomic_float8_sub_rev_fp
+ATOMIC_CMPXCHG_REV_MIX(float8, kmp_real64, div_rev, 64, /, fp, _Quad, 8r, 7,
+                       KMP_ARCH_X86) // __kmpc_atomic_float8_div_rev_fp
+
+ATOMIC_CRITICAL_REV_FP(float10, long double, sub_rev, -, fp, _Quad, 10r,
+                       1) // __kmpc_atomic_float10_sub_rev_fp
+ATOMIC_CRITICAL_REV_FP(float10, long double, div_rev, /, fp, _Quad, 10r,
+                       1) // __kmpc_atomic_float10_div_rev_fp
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+#endif // KMP_HAVE_QUAD
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+// ATOMIC_CMPXCHG_CMPLX: mixed-type complex update; used below for cmplx4 LHS.
+// X86 or X86_64: no alignment problems ====================================
+#if USE_CMPXCHG_FIX
+// workaround for C78287 (complex(kind=4) data type)
+#define ATOMIC_CMPXCHG_CMPLX(TYPE_ID, TYPE, OP_ID, BITS, OP, RTYPE_ID, RTYPE,  \
+                             LCK_ID, MASK, GOMP_FLAG)                          \
+  ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE)                      \
+  OP_GOMP_CRITICAL(OP## =, GOMP_FLAG)                                          \
+  OP_CMPXCHG_WORKAROUND(TYPE, BITS, OP)                                        \
+  }
+// end of the second part of the workaround for C78287
+#else
+#define ATOMIC_CMPXCHG_CMPLX(TYPE_ID, TYPE, OP_ID, BITS, OP, RTYPE_ID, RTYPE,  \
+                             LCK_ID, MASK, GOMP_FLAG)                          \
+  ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE)                      \
+  OP_GOMP_CRITICAL(OP## =, GOMP_FLAG)                                          \
+  OP_CMPXCHG(TYPE, BITS, OP)                                                   \
+  }
+#endif // USE_CMPXCHG_FIX
+#else
+// MASK is pasted after "0x"; if lhs has any of those bits set -> OP_CRITICAL.
+// Code for other architectures that don't handle unaligned accesses.
+#define ATOMIC_CMPXCHG_CMPLX(TYPE_ID, TYPE, OP_ID, BITS, OP, RTYPE_ID, RTYPE,  \
+                             LCK_ID, MASK, GOMP_FLAG)                          \
+  ATOMIC_BEGIN_MIX(TYPE_ID, TYPE, OP_ID, RTYPE_ID, RTYPE)                      \
+  OP_GOMP_CRITICAL(OP## =, GOMP_FLAG)                                          \
+  if (!((kmp_uintptr_t)lhs & 0x##MASK)) {                                      \
+    OP_CMPXCHG(TYPE, BITS, OP) /* aligned address */                           \
+  } else {                                                                     \
+    KMP_CHECK_GTID;                                                            \
+    OP_CRITICAL(OP## =, LCK_ID) /* unaligned address - use critical */         \
+  }                                                                            \
+  }
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+ATOMIC_CMPXCHG_CMPLX(cmplx4, kmp_cmplx32, add, 64, +, cmplx8, kmp_cmplx64, 8c,
+                     7, KMP_ARCH_X86) // __kmpc_atomic_cmplx4_add_cmplx8
+ATOMIC_CMPXCHG_CMPLX(cmplx4, kmp_cmplx32, sub, 64, -, cmplx8, kmp_cmplx64, 8c,
+                     7, KMP_ARCH_X86) // __kmpc_atomic_cmplx4_sub_cmplx8
+ATOMIC_CMPXCHG_CMPLX(cmplx4, kmp_cmplx32, mul, 64, *, cmplx8, kmp_cmplx64, 8c,
+                     7, KMP_ARCH_X86) // __kmpc_atomic_cmplx4_mul_cmplx8
+ATOMIC_CMPXCHG_CMPLX(cmplx4, kmp_cmplx32, div, 64, /, cmplx8, kmp_cmplx64, 8c,
+                     7, KMP_ARCH_X86) // __kmpc_atomic_cmplx4_div_cmplx8
+
+// READ, WRITE, CAPTURE are supported only on IA-32 architecture and Intel(R) 64
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+// ------------------------------------------------------------------------
+// Atomic READ routines
+
+// ------------------------------------------------------------------------
+// Beginning of a definition (provides name, parameters, debug trace)
+//     TYPE_ID  - operands type and size (fixed*, fixed*u for signed, unsigned)
+//     OP_ID    - operation identifier, appended to the routine name
+//     TYPE     - type of the location passed as 'loc'
+//     RET_TYPE - return type of the generated routine
+#define ATOMIC_BEGIN_READ(TYPE_ID, OP_ID, TYPE, RET_TYPE)                      \
+  RET_TYPE __kmpc_atomic_##TYPE_ID##_##OP_ID(ident_t *id_ref, int gtid,        \
+                                             TYPE *loc) {                      \
+    KMP_DEBUG_ASSERT(__kmp_init_serial);                                       \
+    KA_TRACE(100, ("__kmpc_atomic_" #TYPE_ID "_" #OP_ID ": T#%d\n", gtid));
+
+// ------------------------------------------------------------------------
+// Atomic read of *loc using "compare_and_store_ret" routine; the value just
+// read is passed as both the comparand and the new value.
+//     TYPE    - operands' type
+//     BITS    - size in bits, used to distinguish low level calls
+//     OP      - operator (not used by this macro)
+// Note: temp_val forces the compiler to read *loc only once (TODO: check if
+//       it is still necessary); new_value is declared by the invoking macro.
+// Return old value regardless of the result of "compare & swap" operation
+#define OP_CMPXCHG_READ(TYPE, BITS, OP)                                        \
+  {                                                                            \
+    TYPE KMP_ATOMIC_VOLATILE temp_val;                                         \
+    union f_i_union {                                                          \
+      TYPE f_val;                                                              \
+      kmp_int##BITS i_val;                                                     \
+    };                                                                         \
+    union f_i_union old_value;                                                 \
+    temp_val = *loc;                                                           \
+    old_value.f_val = temp_val;                                                \
+    old_value.i_val = KMP_COMPARE_AND_STORE_RET##BITS(                         \
+        (kmp_int##BITS *)loc,                                                  \
+        *VOLATILE_CAST(kmp_int##BITS *) & old_value.i_val,                     \
+        *VOLATILE_CAST(kmp_int##BITS *) & old_value.i_val);                    \
+    new_value = old_value.f_val;                                               \
+    return new_value;                                                          \
+  }
+
+// -------------------------------------------------------------------------
+// Read of *loc into new_value, bound by critical section (atomic lock LCK_ID)
+//     OP     - operator (not used by this macro)
+//     LCK_ID - lock identifier
+// Note: don't check gtid as it should always be valid
+// 1, 2-byte - expect valid parameter, other - check before this macro
+#define OP_CRITICAL_READ(OP, LCK_ID)                                           \
+  __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);                       \
+                                                                               \
+  new_value = (*loc);                                                          \
+                                                                               \
+  __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);
+
+// GOMP compat: when FLAG is set and __kmp_atomic_mode == 2, read under lock 0.
+#ifdef KMP_GOMP_COMPAT
+#define OP_GOMP_CRITICAL_READ(OP, FLAG)                                        \
+  if ((FLAG) && (__kmp_atomic_mode == 2)) {                                    \
+    KMP_CHECK_GTID;                                                            \
+    OP_CRITICAL_READ(OP, 0);                                                   \
+    return new_value;                                                          \
+  }
+#else
+#define OP_GOMP_CRITICAL_READ(OP, FLAG)
+#endif /* KMP_GOMP_COMPAT */
+
+// Read of a fixed-size value done as KMP_TEST_THEN_ADD<BITS>(loc, OP 0).
+#define ATOMIC_FIXED_READ(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG)           \
+  ATOMIC_BEGIN_READ(TYPE_ID, OP_ID, TYPE, TYPE)                                \
+  TYPE new_value;                                                              \
+  OP_GOMP_CRITICAL_READ(OP## =, GOMP_FLAG)                                     \
+  new_value = KMP_TEST_THEN_ADD##BITS(loc, OP 0);                              \
+  return new_value;                                                            \
+  }
+// Read through OP_CMPXCHG_READ, which itself executes 'return new_value'.
+#define ATOMIC_CMPXCHG_READ(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG)         \
+  ATOMIC_BEGIN_READ(TYPE_ID, OP_ID, TYPE, TYPE)                                \
+  TYPE new_value;                                                              \
+  OP_GOMP_CRITICAL_READ(OP## =, GOMP_FLAG)                                     \
+  OP_CMPXCHG_READ(TYPE, BITS, OP)                                              \
+  }
+// ------------------------------------------------------------------------
+// Routines for Extended types: long double, _Quad, complex flavours (use
+// critical section)
+//     TYPE_ID, OP_ID, TYPE - detailed above
+//     OP      - operator
+//     LCK_ID  - lock identifier, used to possibly distinguish lock variable
+//     GOMP_FLAG - forwarded to OP_GOMP_CRITICAL_READ
+// The generated function copies *loc into new_value under ATOMIC_LOCK<LCK_ID>
+// (or under ATOMIC_LOCK0 when the GOMP path is taken) and returns the copy.
+#define ATOMIC_CRITICAL_READ(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG)      \
+  ATOMIC_BEGIN_READ(TYPE_ID, OP_ID, TYPE, TYPE)                                \
+  TYPE new_value;                                                              \
+  OP_GOMP_CRITICAL_READ(OP## =, GOMP_FLAG) /* send assignment */               \
+  OP_CRITICAL_READ(OP, LCK_ID) /* send assignment */                           \
+  return new_value;                                                            \
+  }
+
+// ------------------------------------------------------------------------
+// Fix for cmplx4 read (CQ220361) on Windows* OS. Regular routine with return
+// value doesn't work.
+// Let's return the read value through the additional parameter.
+#if (KMP_OS_WINDOWS)
+
+// Variant of the lock-protected read that stores the value through the out
+// pointer instead of leaving it in new_value.
+//     OP     - accepted but not used in the expansion
+//     LCK_ID - suffix pasted onto ATOMIC_LOCK to pick the lock variable
+#define OP_CRITICAL_READ_WRK(OP, LCK_ID)                                       \
+  __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);                       \
+  *out = *loc; /* copy performed while the lock is held */                     \
+  __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);
+// ------------------------------------------------------------------------
+// GOMP-compatibility path for the "_WRK" read routines, which hand the value
+// back through the out pointer and return void.
+// With KMP_GOMP_COMPAT defined: when both FLAG and (__kmp_atomic_mode == 2)
+// hold, *loc is copied to *out under ATOMIC_LOCK0 and the enclosing function
+// returns at once. The return is required: falling through would repeat the
+// copy under the type-specific lock and overwrite *out with a value that was
+// not read under ATOMIC_LOCK0. This matches OP_GOMP_CRITICAL_READ, which also
+// returns from inside the branch.
+// Without KMP_GOMP_COMPAT the macro expands to nothing.
+#ifdef KMP_GOMP_COMPAT
+#define OP_GOMP_CRITICAL_READ_WRK(OP, FLAG)                                    \
+  if ((FLAG) && (__kmp_atomic_mode == 2)) {                                    \
+    KMP_CHECK_GTID;                                                            \
+    OP_CRITICAL_READ_WRK(OP, 0);                                               \
+    return;                                                                    \
+  }
+#else
+#define OP_GOMP_CRITICAL_READ_WRK(OP, FLAG)
+#endif /* KMP_GOMP_COMPAT */
+// ------------------------------------------------------------------------
+// Opens the definition of the void function __kmpc_atomic_<TYPE_ID>_<OP_ID>
+// whose result is delivered through the leading out pointer.
+// Emits the signature, the opening brace, a debug assertion on
+// __kmp_init_serial and a KA_TRACE line. The macro that uses it must supply
+// the rest of the body and the closing brace.
+#define ATOMIC_BEGIN_READ_WRK(TYPE_ID, OP_ID, TYPE)                            \
+  void __kmpc_atomic_##TYPE_ID##_##OP_ID(TYPE *out, ident_t *id_ref, int gtid, \
+                                         TYPE *loc) {                          \
+    KMP_DEBUG_ASSERT(__kmp_init_serial);                                       \
+    KA_TRACE(100, ("__kmpc_atomic_" #TYPE_ID "_" #OP_ID ": T#%d\n", gtid));
+
+// ------------------------------------------------------------------------
+// Defines a complete "_WRK" read routine: function header from
+// ATOMIC_BEGIN_READ_WRK, the optional GOMP path, then a copy of *loc to *out
+// under ATOMIC_LOCK<LCK_ID>, and the closing brace of the function.
+#define ATOMIC_CRITICAL_READ_WRK(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG)  \
+  ATOMIC_BEGIN_READ_WRK(TYPE_ID, OP_ID, TYPE)                                  \
+  OP_GOMP_CRITICAL_READ_WRK(OP## =, GOMP_FLAG) /* send assignment */           \
+  OP_CRITICAL_READ_WRK(OP, LCK_ID) /* send assignment */                       \
+  }
+
+#endif // KMP_OS_WINDOWS
+
+// ------------------------------------------------------------------------
+// Atomic READ entry points. The trailing comment on each entry names the
+// function it defines.
+// ATOMIC_FIXED_READ / ATOMIC_CMPXCHG_READ arguments:
+//                  TYPE_ID,OP_ID, TYPE,   BITS, OP, GOMP_FLAG
+// ATOMIC_CRITICAL_READ / ATOMIC_CRITICAL_READ_WRK arguments:
+//                  TYPE_ID,OP_ID, TYPE,   OP, LCK_ID, GOMP_FLAG
+ATOMIC_FIXED_READ(fixed4, rd, kmp_int32, 32, +, 0) // __kmpc_atomic_fixed4_rd
+ATOMIC_FIXED_READ(fixed8, rd, kmp_int64, 64, +,
+                  KMP_ARCH_X86) // __kmpc_atomic_fixed8_rd
+ATOMIC_CMPXCHG_READ(float4, rd, kmp_real32, 32, +,
+                    KMP_ARCH_X86) // __kmpc_atomic_float4_rd
+ATOMIC_CMPXCHG_READ(float8, rd, kmp_real64, 64, +,
+                    KMP_ARCH_X86) // __kmpc_atomic_float8_rd
+
+// !!! TODO: Remove lock operations for "char" since it can't be non-atomic
+ATOMIC_CMPXCHG_READ(fixed1, rd, kmp_int8, 8, +,
+                    KMP_ARCH_X86) // __kmpc_atomic_fixed1_rd
+ATOMIC_CMPXCHG_READ(fixed2, rd, kmp_int16, 16, +,
+                    KMP_ARCH_X86) // __kmpc_atomic_fixed2_rd
+
+ATOMIC_CRITICAL_READ(float10, rd, long double, +, 10r,
+                     1) // __kmpc_atomic_float10_rd
+#if KMP_HAVE_QUAD
+ATOMIC_CRITICAL_READ(float16, rd, QUAD_LEGACY, +, 16r,
+                     1) // __kmpc_atomic_float16_rd
+#endif // KMP_HAVE_QUAD
+
+// Fix for CQ220361 on Windows* OS
+// On Windows* OS the cmplx4 read returns its value through an extra out
+// parameter (the _WRK form); elsewhere it returns the value directly.
+#if (KMP_OS_WINDOWS)
+ATOMIC_CRITICAL_READ_WRK(cmplx4, rd, kmp_cmplx32, +, 8c,
+                         1) // __kmpc_atomic_cmplx4_rd
+#else
+ATOMIC_CRITICAL_READ(cmplx4, rd, kmp_cmplx32, +, 8c,
+                     1) // __kmpc_atomic_cmplx4_rd
+#endif // (KMP_OS_WINDOWS)
+ATOMIC_CRITICAL_READ(cmplx8, rd, kmp_cmplx64, +, 16c,
+                     1) // __kmpc_atomic_cmplx8_rd
+ATOMIC_CRITICAL_READ(cmplx10, rd, kmp_cmplx80, +, 20c,
+                     1) // __kmpc_atomic_cmplx10_rd
+#if KMP_HAVE_QUAD
+ATOMIC_CRITICAL_READ(cmplx16, rd, CPLX128_LEG, +, 32c,
+                     1) // __kmpc_atomic_cmplx16_rd
+#if (KMP_ARCH_X86)
+ATOMIC_CRITICAL_READ(float16, a16_rd, Quad_a16_t, +, 16r,
+                     1) // __kmpc_atomic_float16_a16_rd
+ATOMIC_CRITICAL_READ(cmplx16, a16_rd, kmp_cmplx128_a16_t, +, 32c,
+                     1) // __kmpc_atomic_cmplx16_a16_rd
+#endif // (KMP_ARCH_X86)
+#endif // KMP_HAVE_QUAD
+
+// ------------------------------------------------------------------------
+// Atomic WRITE routines
+
+// Defines an atomic write entry point (void result) whose main path is a
+// single KMP_XCHG_FIXED##BITS(lhs, rhs) call; the value that call may yield
+// is discarded.
+//     BITS      - selects the KMP_XCHG_FIXED<BITS> primitive
+//     OP, GOMP_FLAG - forwarded to OP_GOMP_CRITICAL
+// NOTE(review): ATOMIC_BEGIN and OP_GOMP_CRITICAL are defined outside this
+// section; ATOMIC_BEGIN is assumed to open the function and declare lhs/rhs.
+#define ATOMIC_XCHG_WR(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG)              \
+  ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void)                                     \
+  OP_GOMP_CRITICAL(OP, GOMP_FLAG)                                              \
+  KMP_XCHG_FIXED##BITS(lhs, rhs);                                              \
+  }
+// ------------------------------------------------------------------------
+// Same shape as ATOMIC_XCHG_WR, but the main path calls
+// KMP_XCHG_REAL##BITS(lhs, rhs) and is instantiated for the float4/float8
+// write routines.
+#define ATOMIC_XCHG_FLOAT_WR(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG)        \
+  ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void)                                     \
+  OP_GOMP_CRITICAL(OP, GOMP_FLAG)                                              \
+  KMP_XCHG_REAL##BITS(lhs, rhs);                                               \
+  }
+
+// ------------------------------------------------------------------------
+// Store rhs into *lhs by retrying a "compare_and_store" until it succeeds
+//     TYPE    - operands' type
+//     BITS    - size in bits, used to select the low level call
+//     OP      - accepted but not used in the expansion
+// Note: every attempt loads *lhs exactly once, through the
+//       KMP_ATOMIC_VOLATILE temporary temp_val, and derives the comparand
+//       from that single load.
+#define OP_CMPXCHG_WR(TYPE, BITS, OP)                                          \
+  {                                                                            \
+    TYPE KMP_ATOMIC_VOLATILE temp_val;                                         \
+    TYPE old_value, new_value;                                                 \
+    for (;;) {                                                                 \
+      temp_val = *lhs;                                                         \
+      old_value = temp_val;                                                    \
+      new_value = rhs;                                                         \
+      if (KMP_COMPARE_AND_STORE_ACQ##BITS(                                     \
+              (kmp_int##BITS *)lhs,                                            \
+              *VOLATILE_CAST(kmp_int##BITS *) & old_value,                     \
+              *VOLATILE_CAST(kmp_int##BITS *) & new_value))                    \
+        break;                                                                 \
+      KMP_CPU_PAUSE();                                                         \
+    }                                                                          \
+  }
+
+// -------------------------------------------------------------------------
+// Defines an atomic write entry point (void result) whose main path is the
+// OP_CMPXCHG_WR compare-and-store loop, preceded by the optional GOMP path.
+#define ATOMIC_CMPXCHG_WR(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG)           \
+  ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void)                                     \
+  OP_GOMP_CRITICAL(OP, GOMP_FLAG)                                              \
+  OP_CMPXCHG_WR(TYPE, BITS, OP)                                                \
+  }
+
+// ------------------------------------------------------------------------
+// Routines for Extended types: long double, _Quad, complex flavours (use
+// critical section)
+//     TYPE_ID, OP_ID, TYPE - detailed above
+//     OP      - operator
+//     LCK_ID  - lock identifier, used to possibly distinguish lock variable
+//     GOMP_FLAG - forwarded to OP_GOMP_CRITICAL
+// NOTE(review): OP_CRITICAL is defined outside this section; presumably it
+// applies "(*lhs) OP rhs" while holding ATOMIC_LOCK<LCK_ID> - confirm.
+#define ATOMIC_CRITICAL_WR(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG)        \
+  ATOMIC_BEGIN(TYPE_ID, OP_ID, TYPE, void)                                     \
+  OP_GOMP_CRITICAL(OP, GOMP_FLAG) /* send assignment */                        \
+  OP_CRITICAL(OP, LCK_ID) /* send assignment */                                \
+  }
+// -------------------------------------------------------------------------
+
+// Atomic WRITE entry points. The trailing comment on each entry names the
+// function it defines.
+// ATOMIC_XCHG_WR / ATOMIC_XCHG_FLOAT_WR / ATOMIC_CMPXCHG_WR arguments:
+//     TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG
+// ATOMIC_CRITICAL_WR arguments:
+//     TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG
+// When KMP_ARCH_X86 is set, the 8-byte fixed and float writes use the
+// compare-and-store loop instead of the exchange primitive.
+ATOMIC_XCHG_WR(fixed1, wr, kmp_int8, 8, =,
+               KMP_ARCH_X86) // __kmpc_atomic_fixed1_wr
+ATOMIC_XCHG_WR(fixed2, wr, kmp_int16, 16, =,
+               KMP_ARCH_X86) // __kmpc_atomic_fixed2_wr
+ATOMIC_XCHG_WR(fixed4, wr, kmp_int32, 32, =,
+               KMP_ARCH_X86) // __kmpc_atomic_fixed4_wr
+#if (KMP_ARCH_X86)
+ATOMIC_CMPXCHG_WR(fixed8, wr, kmp_int64, 64, =,
+                  KMP_ARCH_X86) // __kmpc_atomic_fixed8_wr
+#else
+ATOMIC_XCHG_WR(fixed8, wr, kmp_int64, 64, =,
+               KMP_ARCH_X86) // __kmpc_atomic_fixed8_wr
+#endif // (KMP_ARCH_X86)
+
+ATOMIC_XCHG_FLOAT_WR(float4, wr, kmp_real32, 32, =,
+                     KMP_ARCH_X86) // __kmpc_atomic_float4_wr
+#if (KMP_ARCH_X86)
+ATOMIC_CMPXCHG_WR(float8, wr, kmp_real64, 64, =,
+                  KMP_ARCH_X86) // __kmpc_atomic_float8_wr
+#else
+ATOMIC_XCHG_FLOAT_WR(float8, wr, kmp_real64, 64, =,
+                     KMP_ARCH_X86) // __kmpc_atomic_float8_wr
+#endif // (KMP_ARCH_X86)
+
+ATOMIC_CRITICAL_WR(float10, wr, long double, =, 10r,
+                   1) // __kmpc_atomic_float10_wr
+#if KMP_HAVE_QUAD
+ATOMIC_CRITICAL_WR(float16, wr, QUAD_LEGACY, =, 16r,
+                   1) // __kmpc_atomic_float16_wr
+#endif // KMP_HAVE_QUAD
+ATOMIC_CRITICAL_WR(cmplx4, wr, kmp_cmplx32, =, 8c, 1) // __kmpc_atomic_cmplx4_wr
+ATOMIC_CRITICAL_WR(cmplx8, wr, kmp_cmplx64, =, 16c,
+                   1) // __kmpc_atomic_cmplx8_wr
+ATOMIC_CRITICAL_WR(cmplx10, wr, kmp_cmplx80, =, 20c,
+                   1) // __kmpc_atomic_cmplx10_wr
+#if KMP_HAVE_QUAD
+ATOMIC_CRITICAL_WR(cmplx16, wr, CPLX128_LEG, =, 32c,
+                   1) // __kmpc_atomic_cmplx16_wr
+#if (KMP_ARCH_X86)
+ATOMIC_CRITICAL_WR(float16, a16_wr, Quad_a16_t, =, 16r,
+                   1) // __kmpc_atomic_float16_a16_wr
+ATOMIC_CRITICAL_WR(cmplx16, a16_wr, kmp_cmplx128_a16_t, =, 32c,
+                   1) // __kmpc_atomic_cmplx16_a16_wr
+#endif // (KMP_ARCH_X86)
+#endif // KMP_HAVE_QUAD
+
+// ------------------------------------------------------------------------
+// Atomic CAPTURE routines
+
+// Beginning of a definition (provides name, parameters, debug trace)
+//     TYPE_ID - operands type and size (fixed*, fixed*u for signed, unsigned
+//     fixed)
+//     OP_ID   - operation identifier (add, sub, mul, ...)
+//     TYPE    - operands' type
+//     RET_TYPE - return type of the generated function
+// The generated function takes an extra int flag parameter, which the
+// capture macros use to choose between returning the updated value
+// (flag != 0) and the value before the update (flag == 0).
+// Only the signature, opening brace, assertion and trace are emitted; the
+// using macro supplies the body and the closing brace.
+#define ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, RET_TYPE)                       \
+  RET_TYPE __kmpc_atomic_##TYPE_ID##_##OP_ID(ident_t *id_ref, int gtid,        \
+                                             TYPE *lhs, TYPE rhs, int flag) {  \
+    KMP_DEBUG_ASSERT(__kmp_init_serial);                                       \
+    KA_TRACE(100, ("__kmpc_atomic_" #TYPE_ID "_" #OP_ID ": T#%d\n", gtid));
+
+// -------------------------------------------------------------------------
+// Update-and-capture on *lhs, rhs inside a critical section
+//     OP     - operator (must already contain the assignment, e.g. +=)
+//     LCK_ID - lock identifier
+// flag != 0 captures *lhs after the update, flag == 0 captures the value it
+// held before. The expansion ends by returning new_value.
+// Note: gtid is not checked here as it should always be valid
+// 1, 2-byte - expect valid parameter, other - check before this macro
+#define OP_CRITICAL_CPT(OP, LCK_ID)                                            \
+  __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);                       \
+  if (!flag) /* capture-before: sample *lhs ahead of the update */             \
+    new_value = (*lhs);                                                        \
+  (*lhs) OP rhs;                                                               \
+  if (flag) /* capture-after: sample the freshly updated *lhs */               \
+    new_value = (*lhs);                                                        \
+  __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);                       \
+  return new_value;
+
+// ------------------------------------------------------------------------
+// GOMP-compatibility path for the capture routines.
+// With KMP_GOMP_COMPAT defined: when both FLAG and (__kmp_atomic_mode == 2)
+// hold, the update-and-capture runs under ATOMIC_LOCK0 via OP_CRITICAL_CPT,
+// which itself returns new_value from the enclosing function.
+// OP is passed as the bare operator; "=" is pasted onto it here to form the
+// compound assignment expected by OP_CRITICAL_CPT.
+// Without KMP_GOMP_COMPAT the macro expands to nothing.
+#ifdef KMP_GOMP_COMPAT
+#define OP_GOMP_CRITICAL_CPT(OP, FLAG)                                         \
+  if ((FLAG) && (__kmp_atomic_mode == 2)) {                                    \
+    KMP_CHECK_GTID;                                                            \
+    OP_CRITICAL_CPT(OP## =, 0);                                                \
+  }
+#else
+#define OP_GOMP_CRITICAL_CPT(OP, FLAG)
+#endif /* KMP_GOMP_COMPAT */
+
+// ------------------------------------------------------------------------
+// Update-and-capture on *lhs, rhs by retrying a "compare_and_store"
+//     TYPE    - operands' type
+//     BITS    - size in bits, used to select the low level call
+//     OP      - binary operator applied as "old_value OP rhs"
+// Returns new_value when flag != 0 and old_value otherwise.
+// Note: every attempt loads *lhs exactly once, through the
+//       KMP_ATOMIC_VOLATILE temporary temp_val, and derives both the
+//       comparand and the new value from that single load.
+#define OP_CMPXCHG_CPT(TYPE, BITS, OP)                                         \
+  {                                                                            \
+    TYPE KMP_ATOMIC_VOLATILE temp_val;                                         \
+    TYPE old_value, new_value;                                                 \
+    for (;;) {                                                                 \
+      temp_val = *lhs;                                                         \
+      old_value = temp_val;                                                    \
+      new_value = old_value OP rhs;                                            \
+      if (KMP_COMPARE_AND_STORE_ACQ##BITS(                                     \
+              (kmp_int##BITS *)lhs,                                            \
+              *VOLATILE_CAST(kmp_int##BITS *) & old_value,                     \
+              *VOLATILE_CAST(kmp_int##BITS *) & new_value))                    \
+        break;                                                                 \
+      KMP_CPU_PAUSE();                                                         \
+    }                                                                          \
+    return flag ? new_value : old_value;                                       \
+  }
+
+// -------------------------------------------------------------------------
+// Defines a capture entry point returning TYPE: function header from
+// ATOMIC_BEGIN_CPT, the optional GOMP path, then the OP_CMPXCHG_CPT loop,
+// which performs the return.
+// The new_value declared here is the one assigned by the GOMP path;
+// OP_CMPXCHG_CPT declares its own inside its block.
+#define ATOMIC_CMPXCHG_CPT(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG)          \
+  ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE)                                 \
+  TYPE new_value;                                                              \
+  OP_GOMP_CRITICAL_CPT(OP, GOMP_FLAG)                                          \
+  OP_CMPXCHG_CPT(TYPE, BITS, OP)                                               \
+  }
+
+// -------------------------------------------------------------------------
+// Capture entry point built on KMP_TEST_THEN_ADD##BITS.
+// The value the primitive yields is kept as the pre-update value. OP doubles
+// as the sign applied to rhs, so subtraction is issued as (lhs + -rhs).
+// flag == 0 returns the pre-update value; flag != 0 returns the updated
+// value, recomputed as "old_value OP rhs".
+#define ATOMIC_FIXED_ADD_CPT(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG)        \
+  ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE)                                 \
+  TYPE old_value, new_value;                                                   \
+  OP_GOMP_CRITICAL_CPT(OP, GOMP_FLAG)                                          \
+  old_value = KMP_TEST_THEN_ADD##BITS(lhs, OP rhs);                            \
+  if (!flag)                                                                   \
+    return old_value;                                                          \
+  return old_value OP rhs;                                                     \
+  }
+// -------------------------------------------------------------------------
+
+// add/sub capture entry points for 4- and 8-byte integers (via
+// ATOMIC_FIXED_ADD_CPT) and for float4/float8 (via ATOMIC_CMPXCHG_CPT).
+// Arguments: TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG
+ATOMIC_FIXED_ADD_CPT(fixed4, add_cpt, kmp_int32, 32, +,
+                     0) // __kmpc_atomic_fixed4_add_cpt
+ATOMIC_FIXED_ADD_CPT(fixed4, sub_cpt, kmp_int32, 32, -,
+                     0) // __kmpc_atomic_fixed4_sub_cpt
+ATOMIC_FIXED_ADD_CPT(fixed8, add_cpt, kmp_int64, 64, +,
+                     KMP_ARCH_X86) // __kmpc_atomic_fixed8_add_cpt
+ATOMIC_FIXED_ADD_CPT(fixed8, sub_cpt, kmp_int64, 64, -,
+                     KMP_ARCH_X86) // __kmpc_atomic_fixed8_sub_cpt
+
+ATOMIC_CMPXCHG_CPT(float4, add_cpt, kmp_real32, 32, +,
+                   KMP_ARCH_X86) // __kmpc_atomic_float4_add_cpt
+ATOMIC_CMPXCHG_CPT(float4, sub_cpt, kmp_real32, 32, -,
+                   KMP_ARCH_X86) // __kmpc_atomic_float4_sub_cpt
+ATOMIC_CMPXCHG_CPT(float8, add_cpt, kmp_real64, 64, +,
+                   KMP_ARCH_X86) // __kmpc_atomic_float8_add_cpt
+ATOMIC_CMPXCHG_CPT(float8, sub_cpt, kmp_real64, 64, -,
+                   KMP_ARCH_X86) // __kmpc_atomic_float8_sub_cpt
+
+// ------------------------------------------------------------------------
+// Capture entry points generated with ATOMIC_CMPXCHG_CPT: the remaining
+// operators for the fixed1/2/4/8 operands, plus mul and div for
+// float4/float8 at the end of the list.
+//     TYPE_ID - operands type and size (fixed4, float4)
+//     OP_ID   - operation identifier (add, sub, mul, ...)
+//     TYPE    - operand type
+//     BITS    - size in bits, used to distinguish low level calls
+//     OP      - operator (used in critical section)
+//     GOMP_FLAG - forwarded to OP_GOMP_CRITICAL_CPT
+// The trailing comment on each entry names the function it defines.
+// ------------------------------------------------------------------------
+//              TYPE_ID,OP_ID, TYPE,   BITS, OP,  GOMP_FLAG
+ATOMIC_CMPXCHG_CPT(fixed1, add_cpt, kmp_int8, 8, +,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed1_add_cpt
+ATOMIC_CMPXCHG_CPT(fixed1, andb_cpt, kmp_int8, 8, &,
+                   0) // __kmpc_atomic_fixed1_andb_cpt
+ATOMIC_CMPXCHG_CPT(fixed1, div_cpt, kmp_int8, 8, /,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed1_div_cpt
+ATOMIC_CMPXCHG_CPT(fixed1u, div_cpt, kmp_uint8, 8, /,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed1u_div_cpt
+ATOMIC_CMPXCHG_CPT(fixed1, mul_cpt, kmp_int8, 8, *,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed1_mul_cpt
+ATOMIC_CMPXCHG_CPT(fixed1, orb_cpt, kmp_int8, 8, |,
+                   0) // __kmpc_atomic_fixed1_orb_cpt
+ATOMIC_CMPXCHG_CPT(fixed1, shl_cpt, kmp_int8, 8, <<,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed1_shl_cpt
+ATOMIC_CMPXCHG_CPT(fixed1, shr_cpt, kmp_int8, 8, >>,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed1_shr_cpt
+ATOMIC_CMPXCHG_CPT(fixed1u, shr_cpt, kmp_uint8, 8, >>,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed1u_shr_cpt
+ATOMIC_CMPXCHG_CPT(fixed1, sub_cpt, kmp_int8, 8, -,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed1_sub_cpt
+ATOMIC_CMPXCHG_CPT(fixed1, xor_cpt, kmp_int8, 8, ^,
+                   0) // __kmpc_atomic_fixed1_xor_cpt
+ATOMIC_CMPXCHG_CPT(fixed2, add_cpt, kmp_int16, 16, +,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed2_add_cpt
+ATOMIC_CMPXCHG_CPT(fixed2, andb_cpt, kmp_int16, 16, &,
+                   0) // __kmpc_atomic_fixed2_andb_cpt
+ATOMIC_CMPXCHG_CPT(fixed2, div_cpt, kmp_int16, 16, /,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed2_div_cpt
+ATOMIC_CMPXCHG_CPT(fixed2u, div_cpt, kmp_uint16, 16, /,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed2u_div_cpt
+ATOMIC_CMPXCHG_CPT(fixed2, mul_cpt, kmp_int16, 16, *,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed2_mul_cpt
+ATOMIC_CMPXCHG_CPT(fixed2, orb_cpt, kmp_int16, 16, |,
+                   0) // __kmpc_atomic_fixed2_orb_cpt
+ATOMIC_CMPXCHG_CPT(fixed2, shl_cpt, kmp_int16, 16, <<,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed2_shl_cpt
+ATOMIC_CMPXCHG_CPT(fixed2, shr_cpt, kmp_int16, 16, >>,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed2_shr_cpt
+ATOMIC_CMPXCHG_CPT(fixed2u, shr_cpt, kmp_uint16, 16, >>,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed2u_shr_cpt
+ATOMIC_CMPXCHG_CPT(fixed2, sub_cpt, kmp_int16, 16, -,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed2_sub_cpt
+ATOMIC_CMPXCHG_CPT(fixed2, xor_cpt, kmp_int16, 16, ^,
+                   0) // __kmpc_atomic_fixed2_xor_cpt
+ATOMIC_CMPXCHG_CPT(fixed4, andb_cpt, kmp_int32, 32, &,
+                   0) // __kmpc_atomic_fixed4_andb_cpt
+ATOMIC_CMPXCHG_CPT(fixed4, div_cpt, kmp_int32, 32, /,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed4_div_cpt
+ATOMIC_CMPXCHG_CPT(fixed4u, div_cpt, kmp_uint32, 32, /,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed4u_div_cpt
+ATOMIC_CMPXCHG_CPT(fixed4, mul_cpt, kmp_int32, 32, *,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed4_mul_cpt
+ATOMIC_CMPXCHG_CPT(fixed4, orb_cpt, kmp_int32, 32, |,
+                   0) // __kmpc_atomic_fixed4_orb_cpt
+ATOMIC_CMPXCHG_CPT(fixed4, shl_cpt, kmp_int32, 32, <<,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed4_shl_cpt
+ATOMIC_CMPXCHG_CPT(fixed4, shr_cpt, kmp_int32, 32, >>,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed4_shr_cpt
+ATOMIC_CMPXCHG_CPT(fixed4u, shr_cpt, kmp_uint32, 32, >>,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed4u_shr_cpt
+ATOMIC_CMPXCHG_CPT(fixed4, xor_cpt, kmp_int32, 32, ^,
+                   0) // __kmpc_atomic_fixed4_xor_cpt
+ATOMIC_CMPXCHG_CPT(fixed8, andb_cpt, kmp_int64, 64, &,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8_andb_cpt
+ATOMIC_CMPXCHG_CPT(fixed8, div_cpt, kmp_int64, 64, /,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8_div_cpt
+ATOMIC_CMPXCHG_CPT(fixed8u, div_cpt, kmp_uint64, 64, /,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8u_div_cpt
+ATOMIC_CMPXCHG_CPT(fixed8, mul_cpt, kmp_int64, 64, *,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8_mul_cpt
+ATOMIC_CMPXCHG_CPT(fixed8, orb_cpt, kmp_int64, 64, |,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8_orb_cpt
+ATOMIC_CMPXCHG_CPT(fixed8, shl_cpt, kmp_int64, 64, <<,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8_shl_cpt
+ATOMIC_CMPXCHG_CPT(fixed8, shr_cpt, kmp_int64, 64, >>,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8_shr_cpt
+ATOMIC_CMPXCHG_CPT(fixed8u, shr_cpt, kmp_uint64, 64, >>,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8u_shr_cpt
+ATOMIC_CMPXCHG_CPT(fixed8, xor_cpt, kmp_int64, 64, ^,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8_xor_cpt
+ATOMIC_CMPXCHG_CPT(float4, div_cpt, kmp_real32, 32, /,
+                   KMP_ARCH_X86) // __kmpc_atomic_float4_div_cpt
+ATOMIC_CMPXCHG_CPT(float4, mul_cpt, kmp_real32, 32, *,
+                   KMP_ARCH_X86) // __kmpc_atomic_float4_mul_cpt
+ATOMIC_CMPXCHG_CPT(float8, div_cpt, kmp_real64, 64, /,
+                   KMP_ARCH_X86) // __kmpc_atomic_float8_div_cpt
+ATOMIC_CMPXCHG_CPT(float8, mul_cpt, kmp_real64, 64, *,
+                   KMP_ARCH_X86) // __kmpc_atomic_float8_mul_cpt
+//              TYPE_ID,OP_ID, TYPE,   BITS, OP,  GOMP_FLAG
+
+// CAPTURE routines for mixed types RHS=float16
+#if KMP_HAVE_QUAD
+
+// Beginning of a definition (provides name, parameters, debug trace)
+//     TYPE_ID - operands type and size (fixed*, fixed*u for signed, unsigned
+//     fixed)
+//     OP_ID   - operation identifier (add, sub, mul, ...)
+//     TYPE    - type of *lhs and of the returned value
+//     RTYPE_ID - suffix appended to the function name for the rhs type
+//     RTYPE   - type of the rhs parameter
+// The generated function is named
+// __kmpc_atomic_<TYPE_ID>_<OP_ID>_<RTYPE_ID> and takes the same int flag
+// parameter as the non-mixed capture routines. Only the signature, opening
+// brace, assertion and trace are emitted here.
+#define ATOMIC_BEGIN_CPT_MIX(TYPE_ID, OP_ID, TYPE, RTYPE_ID, RTYPE)            \
+  TYPE __kmpc_atomic_##TYPE_ID##_##OP_ID##_##RTYPE_ID(                         \
+      ident_t *id_ref, int gtid, TYPE *lhs, RTYPE rhs, int flag) {             \
+    KMP_DEBUG_ASSERT(__kmp_init_serial);                                       \
+    KA_TRACE(100,                                                              \
+             ("__kmpc_atomic_" #TYPE_ID "_" #OP_ID "_" #RTYPE_ID ": T#%d\n",   \
+              gtid));
+
+// -------------------------------------------------------------------------
+// Mixed-type capture entry point built on the OP_CMPXCHG_CPT loop: *lhs has
+// type TYPE while rhs has type RTYPE.
+// LCK_ID and MASK are accepted but not used in this expansion.
+#define ATOMIC_CMPXCHG_CPT_MIX(TYPE_ID, TYPE, OP_ID, BITS, OP, RTYPE_ID,       \
+                               RTYPE, LCK_ID, MASK, GOMP_FLAG)                 \
+  ATOMIC_BEGIN_CPT_MIX(TYPE_ID, OP_ID, TYPE, RTYPE_ID, RTYPE)                  \
+  TYPE new_value;                                                              \
+  OP_GOMP_CRITICAL_CPT(OP, GOMP_FLAG)                                          \
+  OP_CMPXCHG_CPT(TYPE, BITS, OP)                                               \
+  }
+
+// -------------------------------------------------------------------------
+// Mixed-type capture entry point that performs the update under
+// ATOMIC_LOCK<LCK_ID> via OP_CRITICAL_CPT. OP is passed as the bare
+// operator; "=" is pasted onto it here to form the compound assignment.
+// OP_CRITICAL_CPT performs the return of new_value.
+#define ATOMIC_CRITICAL_CPT_MIX(TYPE_ID, TYPE, OP_ID, OP, RTYPE_ID, RTYPE,     \
+                                LCK_ID, GOMP_FLAG)                             \
+  ATOMIC_BEGIN_CPT_MIX(TYPE_ID, OP_ID, TYPE, RTYPE_ID, RTYPE)                  \
+  TYPE new_value;                                                              \
+  OP_GOMP_CRITICAL_CPT(OP, GOMP_FLAG) /* send assignment */                    \
+  OP_CRITICAL_CPT(OP## =, LCK_ID) /* send assignment */                        \
+  }
+
+// Instantiations of the mixed-type capture routines (right-hand side _Quad).
+// ATOMIC_CMPXCHG_CPT_MIX arguments:
+//     TYPE_ID, TYPE, OP_ID, BITS, OP, RTYPE_ID, RTYPE, LCK_ID, MASK, GOMP_FLAG
+// ATOMIC_CRITICAL_CPT_MIX arguments:
+//     TYPE_ID, TYPE, OP_ID, OP, RTYPE_ID, RTYPE, LCK_ID, GOMP_FLAG
+// The trailing comment on each entry names the function it defines.
+ATOMIC_CMPXCHG_CPT_MIX(fixed1, char, add_cpt, 8, +, fp, _Quad, 1i, 0,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed1_add_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed1u, uchar, add_cpt, 8, +, fp, _Quad, 1i, 0,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed1u_add_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed1, char, sub_cpt, 8, -, fp, _Quad, 1i, 0,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed1_sub_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed1u, uchar, sub_cpt, 8, -, fp, _Quad, 1i, 0,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed1u_sub_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed1, char, mul_cpt, 8, *, fp, _Quad, 1i, 0,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed1_mul_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed1u, uchar, mul_cpt, 8, *, fp, _Quad, 1i, 0,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed1u_mul_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed1, char, div_cpt, 8, /, fp, _Quad, 1i, 0,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed1_div_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed1u, uchar, div_cpt, 8, /, fp, _Quad, 1i, 0,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed1u_div_cpt_fp
+
+ATOMIC_CMPXCHG_CPT_MIX(fixed2, short, add_cpt, 16, +, fp, _Quad, 2i, 1,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed2_add_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed2u, ushort, add_cpt, 16, +, fp, _Quad, 2i, 1,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed2u_add_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed2, short, sub_cpt, 16, -, fp, _Quad, 2i, 1,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed2_sub_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed2u, ushort, sub_cpt, 16, -, fp, _Quad, 2i, 1,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed2u_sub_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed2, short, mul_cpt, 16, *, fp, _Quad, 2i, 1,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed2_mul_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed2u, ushort, mul_cpt, 16, *, fp, _Quad, 2i, 1,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed2u_mul_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed2, short, div_cpt, 16, /, fp, _Quad, 2i, 1,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed2_div_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed2u, ushort, div_cpt, 16, /, fp, _Quad, 2i, 1,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed2u_div_cpt_fp
+
+ATOMIC_CMPXCHG_CPT_MIX(fixed4, kmp_int32, add_cpt, 32, +, fp, _Quad, 4i, 3,
+                       0) // __kmpc_atomic_fixed4_add_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed4u, kmp_uint32, add_cpt, 32, +, fp, _Quad, 4i, 3,
+                       0) // __kmpc_atomic_fixed4u_add_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed4, kmp_int32, sub_cpt, 32, -, fp, _Quad, 4i, 3,
+                       0) // __kmpc_atomic_fixed4_sub_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed4u, kmp_uint32, sub_cpt, 32, -, fp, _Quad, 4i, 3,
+                       0) // __kmpc_atomic_fixed4u_sub_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed4, kmp_int32, mul_cpt, 32, *, fp, _Quad, 4i, 3,
+                       0) // __kmpc_atomic_fixed4_mul_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed4u, kmp_uint32, mul_cpt, 32, *, fp, _Quad, 4i, 3,
+                       0) // __kmpc_atomic_fixed4u_mul_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed4, kmp_int32, div_cpt, 32, /, fp, _Quad, 4i, 3,
+                       0) // __kmpc_atomic_fixed4_div_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed4u, kmp_uint32, div_cpt, 32, /, fp, _Quad, 4i, 3,
+                       0) // __kmpc_atomic_fixed4u_div_cpt_fp
+
+ATOMIC_CMPXCHG_CPT_MIX(fixed8, kmp_int64, add_cpt, 64, +, fp, _Quad, 8i, 7,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed8_add_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed8u, kmp_uint64, add_cpt, 64, +, fp, _Quad, 8i, 7,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed8u_add_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed8, kmp_int64, sub_cpt, 64, -, fp, _Quad, 8i, 7,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed8_sub_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed8u, kmp_uint64, sub_cpt, 64, -, fp, _Quad, 8i, 7,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed8u_sub_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed8, kmp_int64, mul_cpt, 64, *, fp, _Quad, 8i, 7,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed8_mul_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed8u, kmp_uint64, mul_cpt, 64, *, fp, _Quad, 8i, 7,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed8u_mul_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed8, kmp_int64, div_cpt, 64, /, fp, _Quad, 8i, 7,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed8_div_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(fixed8u, kmp_uint64, div_cpt, 64, /, fp, _Quad, 8i, 7,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed8u_div_cpt_fp
+
+ATOMIC_CMPXCHG_CPT_MIX(float4, kmp_real32, add_cpt, 32, +, fp, _Quad, 4r, 3,
+                       KMP_ARCH_X86) // __kmpc_atomic_float4_add_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(float4, kmp_real32, sub_cpt, 32, -, fp, _Quad, 4r, 3,
+                       KMP_ARCH_X86) // __kmpc_atomic_float4_sub_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(float4, kmp_real32, mul_cpt, 32, *, fp, _Quad, 4r, 3,
+                       KMP_ARCH_X86) // __kmpc_atomic_float4_mul_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(float4, kmp_real32, div_cpt, 32, /, fp, _Quad, 4r, 3,
+                       KMP_ARCH_X86) // __kmpc_atomic_float4_div_cpt_fp
+
+ATOMIC_CMPXCHG_CPT_MIX(float8, kmp_real64, add_cpt, 64, +, fp, _Quad, 8r, 7,
+                       KMP_ARCH_X86) // __kmpc_atomic_float8_add_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(float8, kmp_real64, sub_cpt, 64, -, fp, _Quad, 8r, 7,
+                       KMP_ARCH_X86) // __kmpc_atomic_float8_sub_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(float8, kmp_real64, mul_cpt, 64, *, fp, _Quad, 8r, 7,
+                       KMP_ARCH_X86) // __kmpc_atomic_float8_mul_cpt_fp
+ATOMIC_CMPXCHG_CPT_MIX(float8, kmp_real64, div_cpt, 64, /, fp, _Quad, 8r, 7,
+                       KMP_ARCH_X86) // __kmpc_atomic_float8_div_cpt_fp
+
+ATOMIC_CRITICAL_CPT_MIX(float10, long double, add_cpt, +, fp, _Quad, 10r,
+                        1) // __kmpc_atomic_float10_add_cpt_fp
+ATOMIC_CRITICAL_CPT_MIX(float10, long double, sub_cpt, -, fp, _Quad, 10r,
+                        1) // __kmpc_atomic_float10_sub_cpt_fp
+ATOMIC_CRITICAL_CPT_MIX(float10, long double, mul_cpt, *, fp, _Quad, 10r,
+                        1) // __kmpc_atomic_float10_mul_cpt_fp
+ATOMIC_CRITICAL_CPT_MIX(float10, long double, div_cpt, /, fp, _Quad, 10r,
+                        1) // __kmpc_atomic_float10_div_cpt_fp
+
+#endif // KMP_HAVE_QUAD
+
+// ------------------------------------------------------------------------
+// Routines for C/C++ Reduction operators && and ||
+
+// -------------------------------------------------------------------------
+// Logical-operator capture on *lhs, rhs bound by a critical section.
+//     OP     - assignment fragment; ATOMIC_CMPX_L_CPT passes "= *lhs &&" or
+//              "= *lhs ||", so "x OP rhs" expands to "x = *lhs && rhs"
+//     LCK_ID - lock identifier (pasted onto ATOMIC_LOCK)
+// Uses gtid, lhs, rhs, flag and new_value from the enclosing scope.
+//     flag != 0: combine, store the result to *lhs and capture it
+//     flag == 0: capture the old *lhs, then store the combined result
+// In both cases *lhs is updated; the expanding macro returns new_value.
+// Note: don't check gtid as it should always be valid
+// 1, 2-byte - expect valid parameter, other - check before this macro
+#define OP_CRITICAL_L_CPT(OP, LCK_ID)                                          \
+  __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);                       \
+                                                                               \
+  if (flag) {                                                                  \
+    new_value OP rhs;                                                          \
+    (*lhs) = new_value;                                                        \
+  } else {                                                                     \
+    new_value = (*lhs);                                                        \
+    (*lhs) OP rhs;                                                             \
+  }                                                                            \
+                                                                               \
+  __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);
+
+// ------------------------------------------------------------------------
+// GOMP-compatibility path for the logical (&&, ||) capture routines.
+// When FLAG is non-zero and __kmp_atomic_mode == 2, the operation is done
+// through OP_CRITICAL_L_CPT with lock identifier 0 and the enclosing function
+// returns new_value right away. Without KMP_GOMP_COMPAT the macro expands to
+// nothing.
+#ifdef KMP_GOMP_COMPAT
+#define OP_GOMP_CRITICAL_L_CPT(OP, FLAG)                                       \
+  if ((FLAG) && (__kmp_atomic_mode == 2)) {                                    \
+    KMP_CHECK_GTID;                                                            \
+    OP_CRITICAL_L_CPT(OP, 0);                                                  \
+    return new_value;                                                          \
+  }
+#else
+#define OP_GOMP_CRITICAL_L_CPT(OP, FLAG)
+#endif /* KMP_GOMP_COMPAT */
+
+// ------------------------------------------------------------------------
+// Need separate macros for &&, || because there is no combined assignment.
+// Defines the capture routine for TYPE_ID/OP_ID:
+//     TYPE      - operand type, BITS - its size in bits
+//     OP        - logical operator (&& or ||)
+//     GOMP_FLAG - enables the GOMP-compatibility critical-section path
+// The GOMP path receives "= *lhs OP" as its assignment fragment; otherwise
+// the work is done by OP_CMPXCHG_CPT. The final "}" presumably closes the
+// function body opened by ATOMIC_BEGIN_CPT (defined earlier in the file).
+#define ATOMIC_CMPX_L_CPT(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG)           \
+  ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE)                                 \
+  TYPE new_value;                                                              \
+  OP_GOMP_CRITICAL_L_CPT(= *lhs OP, GOMP_FLAG)                                 \
+  OP_CMPXCHG_CPT(TYPE, BITS, OP)                                               \
+  }
+
+// Logical AND / OR capture entry points for 1-, 2-, 4- and 8-byte integers.
+// Arguments: TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG.
+// The 4-byte variants pass 0 as GOMP_FLAG; all others pass KMP_ARCH_X86.
+ATOMIC_CMPX_L_CPT(fixed1, andl_cpt, char, 8, &&,
+                  KMP_ARCH_X86) // __kmpc_atomic_fixed1_andl_cpt
+ATOMIC_CMPX_L_CPT(fixed1, orl_cpt, char, 8, ||,
+                  KMP_ARCH_X86) // __kmpc_atomic_fixed1_orl_cpt
+ATOMIC_CMPX_L_CPT(fixed2, andl_cpt, short, 16, &&,
+                  KMP_ARCH_X86) // __kmpc_atomic_fixed2_andl_cpt
+ATOMIC_CMPX_L_CPT(fixed2, orl_cpt, short, 16, ||,
+                  KMP_ARCH_X86) // __kmpc_atomic_fixed2_orl_cpt
+ATOMIC_CMPX_L_CPT(fixed4, andl_cpt, kmp_int32, 32, &&,
+                  0) // __kmpc_atomic_fixed4_andl_cpt
+ATOMIC_CMPX_L_CPT(fixed4, orl_cpt, kmp_int32, 32, ||,
+                  0) // __kmpc_atomic_fixed4_orl_cpt
+ATOMIC_CMPX_L_CPT(fixed8, andl_cpt, kmp_int64, 64, &&,
+                  KMP_ARCH_X86) // __kmpc_atomic_fixed8_andl_cpt
+ATOMIC_CMPX_L_CPT(fixed8, orl_cpt, kmp_int64, 64, ||,
+                  KMP_ARCH_X86) // __kmpc_atomic_fixed8_orl_cpt
+
+// -------------------------------------------------------------------------
+// Routines for Fortran operators that have no direct counterpart in C:
+// MAX, MIN, .EQV., .NEQV.
+// Operators .AND., .OR. are covered by __kmpc_atomic_*_{andl,orl}_cpt
+// Intrinsics IAND, IOR, IEOR are covered by __kmpc_atomic_*_{andb,orb,xor}_cpt
+
+// -------------------------------------------------------------------------
+// MIN / MAX capture performed while holding a lock.
+//     OP     - comparison that is true while *lhs still has to be replaced
+//              by rhs ("<" for max, ">" for min in the instantiations below)
+//     LCK_ID - lock identifier (pasted onto ATOMIC_LOCK)
+// Uses gtid, lhs, rhs, flag, old_value and new_value from the enclosing
+// scope and always returns from the enclosing function:
+//     replaced, flag != 0 -> rhs; replaced, flag == 0 -> previous *lhs;
+//     not replaced        -> current *lhs
+#define MIN_MAX_CRITSECT_CPT(OP, LCK_ID)                                       \
+  __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);                       \
+                                                                               \
+  if (*lhs OP rhs) { /* re-check under the lock */                             \
+    old_value = *lhs;                                                          \
+    *lhs = rhs;                                                                \
+    new_value = flag ? rhs : old_value;                                        \
+  } else {                                                                     \
+    new_value = *lhs;                                                          \
+  }                                                                            \
+  __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);                       \
+  return new_value;
+
+// -------------------------------------------------------------------------
+// GOMP-compatibility path for the MIN / MAX capture routines.
+// When FLAG is non-zero and __kmp_atomic_mode == 2, the update is done by
+// MIN_MAX_CRITSECT_CPT with lock identifier 0; that macro ends in a return,
+// so nothing after this expansion runs in that case. Without KMP_GOMP_COMPAT
+// the macro expands to nothing.
+#ifdef KMP_GOMP_COMPAT
+#define GOMP_MIN_MAX_CRITSECT_CPT(OP, FLAG)                                    \
+  if ((FLAG) && (__kmp_atomic_mode == 2)) {                                    \
+    KMP_CHECK_GTID;                                                            \
+    MIN_MAX_CRITSECT_CPT(OP, 0);                                               \
+  }
+#else
+#define GOMP_MIN_MAX_CRITSECT_CPT(OP, FLAG)
+#endif /* KMP_GOMP_COMPAT */
+
+// -------------------------------------------------------------------------
+// MIN / MAX capture implemented with a compare-and-store retry loop.
+//     TYPE - operand type
+//     BITS - size in bits, selects KMP_COMPARE_AND_STORE_ACQ<BITS>
+//     OP   - comparison that is true while *lhs still has to be replaced
+// Uses lhs, rhs, flag and old_value from the enclosing scope and always
+// returns from the enclosing function.
+// The loop stops either because the store of rhs succeeded or because the
+// freshly sampled value no longer satisfies "old_value OP rhs" (another
+// thread got there first). Only in the first case is rhs the value of *lhs
+// after the operation, so rhs is returned for flag != 0 only when the
+// comparison still holds; otherwise the sampled value is returned.
+#define MIN_MAX_CMPXCHG_CPT(TYPE, BITS, OP)                                    \
+  {                                                                            \
+    TYPE KMP_ATOMIC_VOLATILE temp_val;                                         \
+    /*TYPE old_value; */                                                       \
+    temp_val = *lhs;                                                           \
+    old_value = temp_val;                                                      \
+    while (old_value OP rhs && /* still need actions? */                       \
+           !KMP_COMPARE_AND_STORE_ACQ##BITS(                                   \
+               (kmp_int##BITS *)lhs,                                           \
+               *VOLATILE_CAST(kmp_int##BITS *) & old_value,                    \
+               *VOLATILE_CAST(kmp_int##BITS *) & rhs)) {                       \
+      KMP_CPU_PAUSE();                                                         \
+      temp_val = *lhs;                                                         \
+      old_value = temp_val;                                                    \
+    }                                                                          \
+    /* rhs was stored only if the final comparison still held */               \
+    if (flag && (old_value OP rhs))                                            \
+      return rhs;                                                              \
+    else                                                                       \
+      return old_value;                                                        \
+  }
+
+// -------------------------------------------------------------------------
+// Critical-section flavour of the MIN / MAX capture routine. In this part of
+// the file it is instantiated only for the float16 types; the 1- and 2-byte
+// integers below go through MIN_MAX_COMPXCHG_CPT instead.
+//     OP        - comparison that is true when *lhs must be replaced by rhs
+//     LCK_ID    - lock identifier for the regular path
+//     GOMP_FLAG - enables the GOMP-compatibility path (lock identifier 0)
+// *lhs is tested once without the lock; MIN_MAX_CRITSECT_CPT re-tests it
+// under the lock and returns. The trailing "return *lhs" is reached only
+// when the unlocked test says no update is needed.
+#define MIN_MAX_CRITICAL_CPT(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG)      \
+  ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE)                                 \
+  TYPE new_value, old_value;                                                   \
+  if (*lhs OP rhs) { /* need actions? */                                       \
+    GOMP_MIN_MAX_CRITSECT_CPT(OP, GOMP_FLAG)                                   \
+    MIN_MAX_CRITSECT_CPT(OP, LCK_ID)                                           \
+  }                                                                            \
+  return *lhs;                                                                 \
+  }
+
+// Compare-and-store flavour of the MIN / MAX capture routine.
+//     TYPE, BITS - operand type and its size in bits
+//     OP         - comparison that is true when *lhs must be replaced by rhs
+//     GOMP_FLAG  - enables the GOMP-compatibility critical-section path
+// new_value is used only by the GOMP path; old_value is shared with
+// MIN_MAX_CMPXCHG_CPT, which returns from the function. The trailing
+// "return *lhs" is reached only when the initial test says no update is
+// needed.
+#define MIN_MAX_COMPXCHG_CPT(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG)        \
+  ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE)                                 \
+  TYPE new_value, old_value;                                                   \
+  if (*lhs OP rhs) {                                                           \
+    GOMP_MIN_MAX_CRITSECT_CPT(OP, GOMP_FLAG)                                   \
+    MIN_MAX_CMPXCHG_CPT(TYPE, BITS, OP)                                        \
+  }                                                                            \
+  return *lhs;                                                                 \
+  }
+
+// MIN / MAX capture entry points. max passes "<" and min passes ">" as OP,
+// i.e. the condition under which *lhs has to be replaced by rhs.
+// Integer and float4/float8 types use the compare-and-store macro; the
+// float16 types use the critical-section macro with lock identifier 16r.
+MIN_MAX_COMPXCHG_CPT(fixed1, max_cpt, char, 8, <,
+                     KMP_ARCH_X86) // __kmpc_atomic_fixed1_max_cpt
+MIN_MAX_COMPXCHG_CPT(fixed1, min_cpt, char, 8, >,
+                     KMP_ARCH_X86) // __kmpc_atomic_fixed1_min_cpt
+MIN_MAX_COMPXCHG_CPT(fixed2, max_cpt, short, 16, <,
+                     KMP_ARCH_X86) // __kmpc_atomic_fixed2_max_cpt
+MIN_MAX_COMPXCHG_CPT(fixed2, min_cpt, short, 16, >,
+                     KMP_ARCH_X86) // __kmpc_atomic_fixed2_min_cpt
+MIN_MAX_COMPXCHG_CPT(fixed4, max_cpt, kmp_int32, 32, <,
+                     0) // __kmpc_atomic_fixed4_max_cpt
+MIN_MAX_COMPXCHG_CPT(fixed4, min_cpt, kmp_int32, 32, >,
+                     0) // __kmpc_atomic_fixed4_min_cpt
+MIN_MAX_COMPXCHG_CPT(fixed8, max_cpt, kmp_int64, 64, <,
+                     KMP_ARCH_X86) // __kmpc_atomic_fixed8_max_cpt
+MIN_MAX_COMPXCHG_CPT(fixed8, min_cpt, kmp_int64, 64, >,
+                     KMP_ARCH_X86) // __kmpc_atomic_fixed8_min_cpt
+MIN_MAX_COMPXCHG_CPT(float4, max_cpt, kmp_real32, 32, <,
+                     KMP_ARCH_X86) // __kmpc_atomic_float4_max_cpt
+MIN_MAX_COMPXCHG_CPT(float4, min_cpt, kmp_real32, 32, >,
+                     KMP_ARCH_X86) // __kmpc_atomic_float4_min_cpt
+MIN_MAX_COMPXCHG_CPT(float8, max_cpt, kmp_real64, 64, <,
+                     KMP_ARCH_X86) // __kmpc_atomic_float8_max_cpt
+MIN_MAX_COMPXCHG_CPT(float8, min_cpt, kmp_real64, 64, >,
+                     KMP_ARCH_X86) // __kmpc_atomic_float8_min_cpt
+#if KMP_HAVE_QUAD
+MIN_MAX_CRITICAL_CPT(float16, max_cpt, QUAD_LEGACY, <, 16r,
+                     1) // __kmpc_atomic_float16_max_cpt
+MIN_MAX_CRITICAL_CPT(float16, min_cpt, QUAD_LEGACY, >, 16r,
+                     1) // __kmpc_atomic_float16_min_cpt
+#if (KMP_ARCH_X86)
+MIN_MAX_CRITICAL_CPT(float16, max_a16_cpt, Quad_a16_t, <, 16r,
+                     1) // __kmpc_atomic_float16_max_a16_cpt
+MIN_MAX_CRITICAL_CPT(float16, min_a16_cpt, Quad_a16_t, >, 16r,
+                     1) // __kmpc_atomic_float16_min_a16_cpt
+#endif // (KMP_ARCH_X86)
+#endif // KMP_HAVE_QUAD
+
+// ------------------------------------------------------------------------
+// GOMP-compatibility path for the .EQV. capture routines.
+// When FLAG is non-zero and __kmp_atomic_mode == 2, OP is forwarded as-is to
+// OP_CRITICAL_CPT with lock identifier 0, so OP must already be a complete
+// assignment fragment (ATOMIC_CMPX_EQV_CPT passes "^= ~").
+// NOTE(review): unlike the sibling *_L_CPT wrapper there is no explicit
+// return here; presumably OP_CRITICAL_CPT (defined earlier) returns - confirm.
+// Without KMP_GOMP_COMPAT the macro expands to nothing.
+#ifdef KMP_GOMP_COMPAT
+#define OP_GOMP_CRITICAL_EQV_CPT(OP, FLAG)                                     \
+  if ((FLAG) && (__kmp_atomic_mode == 2)) {                                    \
+    KMP_CHECK_GTID;                                                            \
+    OP_CRITICAL_CPT(OP, 0);                                                    \
+  }
+#else
+#define OP_GOMP_CRITICAL_EQV_CPT(OP, FLAG)
+#endif /* KMP_GOMP_COMPAT */
+// ------------------------------------------------------------------------
+// Defines the .EQV. capture routine for TYPE_ID/OP_ID.
+//     TYPE, BITS - operand type and its size in bits
+//     OP         - operator handed to OP_CMPXCHG_CPT ("^~" in the
+//                  instantiations below)
+//     GOMP_FLAG  - enables the GOMP-compatibility critical-section path
+// The GOMP path is given the fixed assignment fragment "^= ~" instead of OP.
+#define ATOMIC_CMPX_EQV_CPT(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG)         \
+  ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE)                                 \
+  TYPE new_value;                                                              \
+  OP_GOMP_CRITICAL_EQV_CPT(^= ~, GOMP_FLAG) /* send assignment */              \
+  OP_CMPXCHG_CPT(TYPE, BITS, OP)                                               \
+  }
+
+// ------------------------------------------------------------------------
+
+// .NEQV. and .EQV. capture entry points for 1-, 2-, 4- and 8-byte integers.
+// .NEQV. is instantiated through the generic ATOMIC_CMPXCHG_CPT with "^";
+// .EQV. goes through ATOMIC_CMPX_EQV_CPT with "^~".
+ATOMIC_CMPXCHG_CPT(fixed1, neqv_cpt, kmp_int8, 8, ^,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed1_neqv_cpt
+ATOMIC_CMPXCHG_CPT(fixed2, neqv_cpt, kmp_int16, 16, ^,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed2_neqv_cpt
+ATOMIC_CMPXCHG_CPT(fixed4, neqv_cpt, kmp_int32, 32, ^,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed4_neqv_cpt
+ATOMIC_CMPXCHG_CPT(fixed8, neqv_cpt, kmp_int64, 64, ^,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8_neqv_cpt
+ATOMIC_CMPX_EQV_CPT(fixed1, eqv_cpt, kmp_int8, 8, ^~,
+                    KMP_ARCH_X86) // __kmpc_atomic_fixed1_eqv_cpt
+ATOMIC_CMPX_EQV_CPT(fixed2, eqv_cpt, kmp_int16, 16, ^~,
+                    KMP_ARCH_X86) // __kmpc_atomic_fixed2_eqv_cpt
+ATOMIC_CMPX_EQV_CPT(fixed4, eqv_cpt, kmp_int32, 32, ^~,
+                    KMP_ARCH_X86) // __kmpc_atomic_fixed4_eqv_cpt
+ATOMIC_CMPX_EQV_CPT(fixed8, eqv_cpt, kmp_int64, 64, ^~,
+                    KMP_ARCH_X86) // __kmpc_atomic_fixed8_eqv_cpt
+
+// ------------------------------------------------------------------------
+// Routines for Extended types: long double, _Quad, complex flavours (use
+// critical section)
+//     TYPE_ID, OP_ID, TYPE - detailed above
+//     OP        - bare operator; "=" is pasted on to form the compound
+//                 assignment passed to OP_CRITICAL_CPT
+//     LCK_ID    - lock identifier, used to possibly distinguish lock variable
+//     GOMP_FLAG - enables the GOMP-compatibility path
+// NOTE(review): OP_GOMP_CRITICAL_CPT receives the bare operator here;
+// presumably it appends "=" itself (its definition is earlier in the file).
+#define ATOMIC_CRITICAL_CPT(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG)       \
+  ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE)                                 \
+  TYPE new_value;                                                              \
+  OP_GOMP_CRITICAL_CPT(OP, GOMP_FLAG) /* send assignment */                    \
+  OP_CRITICAL_CPT(OP## =, LCK_ID) /* send assignment */                        \
+  }
+
+// ------------------------------------------------------------------------
+// Workaround for cmplx4: regular routines with a return value don't work on
+// Win_32e, so the captured value is delivered through the additional
+// parameter "out" and the enclosing function returns void.
+//     OP     - compound assignment applied as "(*lhs) OP rhs"
+//     LCK_ID - lock identifier (pasted onto ATOMIC_LOCK)
+// flag != 0 captures the value after the update, flag == 0 the value before.
+#define OP_CRITICAL_CPT_WRK(OP, LCK_ID)                                        \
+  __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);                       \
+                                                                               \
+  if (!flag)                                                                   \
+    (*out) = (*lhs); /* value before the update */                             \
+  (*lhs) OP rhs;                                                               \
+  if (flag)                                                                    \
+    (*out) = (*lhs); /* value after the update */                              \
+                                                                               \
+  __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);                       \
+  return;
+// ------------------------------------------------------------------------
+
+// GOMP-compatibility path for the cmplx4 workaround routines.
+// When FLAG is non-zero and __kmp_atomic_mode == 2, the bare operator OP is
+// turned into a compound assignment ("OP=") and executed by
+// OP_CRITICAL_CPT_WRK with lock identifier 0; that macro ends in "return;".
+// Without KMP_GOMP_COMPAT the macro expands to nothing.
+#ifdef KMP_GOMP_COMPAT
+#define OP_GOMP_CRITICAL_CPT_WRK(OP, FLAG)                                     \
+  if ((FLAG) && (__kmp_atomic_mode == 2)) {                                    \
+    KMP_CHECK_GTID;                                                            \
+    OP_CRITICAL_CPT_WRK(OP## =, 0);                                            \
+  }
+#else
+#define OP_GOMP_CRITICAL_CPT_WRK(OP, FLAG)
+#endif /* KMP_GOMP_COMPAT */
+// ------------------------------------------------------------------------
+
+// Opens the definition of
+//   void __kmpc_atomic_<TYPE_ID>_<OP_ID>(ident_t *id_ref, int gtid,
+//                                        TYPE *lhs, TYPE rhs, TYPE *out,
+//                                        int flag)
+// and emits the KMP_DEBUG_ASSERT(__kmp_init_serial) check plus a
+// KA_TRACE(100, ...) entry message. The function body is left open; the
+// macro that uses this one must supply the closing "}".
+#define ATOMIC_BEGIN_WRK(TYPE_ID, OP_ID, TYPE)                                 \
+  void __kmpc_atomic_##TYPE_ID##_##OP_ID(ident_t *id_ref, int gtid, TYPE *lhs, \
+                                         TYPE rhs, TYPE *out, int flag) {      \
+    KMP_DEBUG_ASSERT(__kmp_init_serial);                                       \
+    KA_TRACE(100, ("__kmpc_atomic_" #TYPE_ID "_" #OP_ID ": T#%d\n", gtid));
+// ------------------------------------------------------------------------
+
+// Defines a void-returning capture routine (cmplx4 workaround).
+//     OP        - bare operator; "=" is pasted on for the regular path
+//     LCK_ID    - lock identifier for the regular path
+//     GOMP_FLAG - enables the GOMP-compatibility path (lock identifier 0)
+// The final "}" closes the function opened by ATOMIC_BEGIN_WRK.
+#define ATOMIC_CRITICAL_CPT_WRK(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG)   \
+  ATOMIC_BEGIN_WRK(TYPE_ID, OP_ID, TYPE)                                       \
+  OP_GOMP_CRITICAL_CPT_WRK(OP, GOMP_FLAG)                                      \
+  OP_CRITICAL_CPT_WRK(OP## =, LCK_ID)                                          \
+  }
+// The end of workaround for cmplx4
+
+/* ------------------------------------------------------------------------- */
+// routines for long double type
+// Arguments: TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG. Every instantiation
+// in this group uses the critical-section macro with GOMP_FLAG = 1; LCK_ID is
+// 10r for long double and 16r for the _Quad flavours.
+ATOMIC_CRITICAL_CPT(float10, add_cpt, long double, +, 10r,
+                    1) // __kmpc_atomic_float10_add_cpt
+ATOMIC_CRITICAL_CPT(float10, sub_cpt, long double, -, 10r,
+                    1) // __kmpc_atomic_float10_sub_cpt
+ATOMIC_CRITICAL_CPT(float10, mul_cpt, long double, *, 10r,
+                    1) // __kmpc_atomic_float10_mul_cpt
+ATOMIC_CRITICAL_CPT(float10, div_cpt, long double, /, 10r,
+                    1) // __kmpc_atomic_float10_div_cpt
+#if KMP_HAVE_QUAD
+// routines for _Quad type
+ATOMIC_CRITICAL_CPT(float16, add_cpt, QUAD_LEGACY, +, 16r,
+                    1) // __kmpc_atomic_float16_add_cpt
+ATOMIC_CRITICAL_CPT(float16, sub_cpt, QUAD_LEGACY, -, 16r,
+                    1) // __kmpc_atomic_float16_sub_cpt
+ATOMIC_CRITICAL_CPT(float16, mul_cpt, QUAD_LEGACY, *, 16r,
+                    1) // __kmpc_atomic_float16_mul_cpt
+ATOMIC_CRITICAL_CPT(float16, div_cpt, QUAD_LEGACY, /, 16r,
+                    1) // __kmpc_atomic_float16_div_cpt
+#if (KMP_ARCH_X86)
+ATOMIC_CRITICAL_CPT(float16, add_a16_cpt, Quad_a16_t, +, 16r,
+                    1) // __kmpc_atomic_float16_add_a16_cpt
+ATOMIC_CRITICAL_CPT(float16, sub_a16_cpt, Quad_a16_t, -, 16r,
+                    1) // __kmpc_atomic_float16_sub_a16_cpt
+ATOMIC_CRITICAL_CPT(float16, mul_a16_cpt, Quad_a16_t, *, 16r,
+                    1) // __kmpc_atomic_float16_mul_a16_cpt
+ATOMIC_CRITICAL_CPT(float16, div_a16_cpt, Quad_a16_t, /, 16r,
+                    1) // __kmpc_atomic_float16_div_a16_cpt
+#endif // (KMP_ARCH_X86)
+#endif // KMP_HAVE_QUAD
+
+// routines for complex types
+// All of them use a critical section with GOMP_FLAG = 1. The lock identifier
+// follows the operand size: 8c, 16c, 20c, 32c.
+
+// cmplx4 routines to return void
+// (they are built with the *_WRK macros and hand the captured value back
+// through the extra "out" parameter)
+ATOMIC_CRITICAL_CPT_WRK(cmplx4, add_cpt, kmp_cmplx32, +, 8c,
+                        1) // __kmpc_atomic_cmplx4_add_cpt
+ATOMIC_CRITICAL_CPT_WRK(cmplx4, sub_cpt, kmp_cmplx32, -, 8c,
+                        1) // __kmpc_atomic_cmplx4_sub_cpt
+ATOMIC_CRITICAL_CPT_WRK(cmplx4, mul_cpt, kmp_cmplx32, *, 8c,
+                        1) // __kmpc_atomic_cmplx4_mul_cpt
+ATOMIC_CRITICAL_CPT_WRK(cmplx4, div_cpt, kmp_cmplx32, /, 8c,
+                        1) // __kmpc_atomic_cmplx4_div_cpt
+
+ATOMIC_CRITICAL_CPT(cmplx8, add_cpt, kmp_cmplx64, +, 16c,
+                    1) // __kmpc_atomic_cmplx8_add_cpt
+ATOMIC_CRITICAL_CPT(cmplx8, sub_cpt, kmp_cmplx64, -, 16c,
+                    1) // __kmpc_atomic_cmplx8_sub_cpt
+ATOMIC_CRITICAL_CPT(cmplx8, mul_cpt, kmp_cmplx64, *, 16c,
+                    1) // __kmpc_atomic_cmplx8_mul_cpt
+ATOMIC_CRITICAL_CPT(cmplx8, div_cpt, kmp_cmplx64, /, 16c,
+                    1) // __kmpc_atomic_cmplx8_div_cpt
+ATOMIC_CRITICAL_CPT(cmplx10, add_cpt, kmp_cmplx80, +, 20c,
+                    1) // __kmpc_atomic_cmplx10_add_cpt
+ATOMIC_CRITICAL_CPT(cmplx10, sub_cpt, kmp_cmplx80, -, 20c,
+                    1) // __kmpc_atomic_cmplx10_sub_cpt
+ATOMIC_CRITICAL_CPT(cmplx10, mul_cpt, kmp_cmplx80, *, 20c,
+                    1) // __kmpc_atomic_cmplx10_mul_cpt
+ATOMIC_CRITICAL_CPT(cmplx10, div_cpt, kmp_cmplx80, /, 20c,
+                    1) // __kmpc_atomic_cmplx10_div_cpt
+#if KMP_HAVE_QUAD
+ATOMIC_CRITICAL_CPT(cmplx16, add_cpt, CPLX128_LEG, +, 32c,
+                    1) // __kmpc_atomic_cmplx16_add_cpt
+ATOMIC_CRITICAL_CPT(cmplx16, sub_cpt, CPLX128_LEG, -, 32c,
+                    1) // __kmpc_atomic_cmplx16_sub_cpt
+ATOMIC_CRITICAL_CPT(cmplx16, mul_cpt, CPLX128_LEG, *, 32c,
+                    1) // __kmpc_atomic_cmplx16_mul_cpt
+ATOMIC_CRITICAL_CPT(cmplx16, div_cpt, CPLX128_LEG, /, 32c,
+                    1) // __kmpc_atomic_cmplx16_div_cpt
+#if (KMP_ARCH_X86)
+ATOMIC_CRITICAL_CPT(cmplx16, add_a16_cpt, kmp_cmplx128_a16_t, +, 32c,
+                    1) // __kmpc_atomic_cmplx16_add_a16_cpt
+ATOMIC_CRITICAL_CPT(cmplx16, sub_a16_cpt, kmp_cmplx128_a16_t, -, 32c,
+                    1) // __kmpc_atomic_cmplx16_sub_a16_cpt
+ATOMIC_CRITICAL_CPT(cmplx16, mul_a16_cpt, kmp_cmplx128_a16_t, *, 32c,
+                    1) // __kmpc_atomic_cmplx16_mul_a16_cpt
+ATOMIC_CRITICAL_CPT(cmplx16, div_a16_cpt, kmp_cmplx128_a16_t, /, 32c,
+                    1) // __kmpc_atomic_cmplx16_div_a16_cpt
+#endif // (KMP_ARCH_X86)
+#endif // KMP_HAVE_QUAD
+
+// OpenMP 4.0: v = x = expr binop x; { v = x; x = expr binop x; } { x = expr
+// binop x; v = x; }  for non-commutative operations.
+// Supported only on IA-32 architecture and Intel(R) 64
+
+// -------------------------------------------------------------------------
+// Reversed-operand capture ("*lhs = rhs OP *lhs") bound by critical section
+//     OP     - bare binary operator (no assignment; the macro writes the
+//              assignment to *lhs itself)
+//     LCK_ID - lock identifier
+// flag != 0 captures the value after the update, flag == 0 the value before;
+// the captured value is returned from the enclosing function.
+// Note: don't check gtid as it should always be valid
+// 1, 2-byte - expect valid parameter, other - check before this macro
+#define OP_CRITICAL_CPT_REV(OP, LCK_ID)                                        \
+  __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);                       \
+                                                                               \
+  if (!flag)                                                                   \
+    new_value = (*lhs); /* value before the update */                          \
+  (*lhs) = (rhs)OP(*lhs);                                                      \
+  if (flag)                                                                    \
+    new_value = (*lhs); /* value after the update */                           \
+  __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);                       \
+  return new_value;
+
+// ------------------------------------------------------------------------
+// GOMP-compatibility path for the reversed-operand capture routines.
+// When FLAG is non-zero and __kmp_atomic_mode == 2, the operation is done by
+// OP_CRITICAL_CPT_REV with lock identifier 0; that macro ends in
+// "return new_value;". Without KMP_GOMP_COMPAT the macro expands to nothing.
+#ifdef KMP_GOMP_COMPAT
+#define OP_GOMP_CRITICAL_CPT_REV(OP, FLAG)                                     \
+  if ((FLAG) && (__kmp_atomic_mode == 2)) {                                    \
+    KMP_CHECK_GTID;                                                            \
+    OP_CRITICAL_CPT_REV(OP, 0);                                                \
+  }
+#else
+#define OP_GOMP_CRITICAL_CPT_REV(OP, FLAG)
+#endif /* KMP_GOMP_COMPAT */
+
+// ------------------------------------------------------------------------
+// Reversed-operand capture ("*lhs = rhs OP *lhs") using the
+// "compare_and_store" routine in a retry loop
+//     TYPE    - operands' type
+//     BITS    - size in bits, used to distinguish low level calls
+//     OP      - operator
+// Each attempt samples *lhs once through temp_val, computes the new value
+// and tries to publish it; on failure it pauses and starts over.
+// Returns the new value when flag != 0, otherwise the value it replaced.
+// Note: temp_val introduced in order to force the compiler to read
+//       *lhs only once (w/o it the compiler reads *lhs twice)
+#define OP_CMPXCHG_CPT_REV(TYPE, BITS, OP)                                     \
+  {                                                                            \
+    TYPE KMP_ATOMIC_VOLATILE temp_val;                                         \
+    TYPE old_value, new_value;                                                 \
+    for (;;) {                                                                 \
+      temp_val = *lhs;                                                         \
+      old_value = temp_val;                                                    \
+      new_value = rhs OP old_value;                                            \
+      if (KMP_COMPARE_AND_STORE_ACQ##BITS(                                     \
+              (kmp_int##BITS *)lhs,                                            \
+              *VOLATILE_CAST(kmp_int##BITS *) & old_value,                     \
+              *VOLATILE_CAST(kmp_int##BITS *) & new_value))                    \
+        break; /* store succeeded */                                           \
+      KMP_CPU_PAUSE();                                                         \
+    }                                                                          \
+    return flag ? new_value : old_value;                                       \
+  }
+
+// -------------------------------------------------------------------------
+// Defines the reversed-operand capture routine for TYPE_ID/OP_ID.
+//     TYPE, BITS - operand type and its size in bits
+//     OP         - bare binary operator
+//     GOMP_FLAG  - enables the GOMP-compatibility critical-section path
+// The new_value declared here is used by the GOMP path only;
+// OP_CMPXCHG_CPT_REV declares its own old_value/new_value in a nested block.
+#define ATOMIC_CMPXCHG_CPT_REV(TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG)      \
+  ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE)                                 \
+  TYPE new_value;                                                              \
+  OP_GOMP_CRITICAL_CPT_REV(OP, GOMP_FLAG)                                      \
+  OP_CMPXCHG_CPT_REV(TYPE, BITS, OP)                                           \
+  }
+
+// Reversed-operand capture entry points for the non-commutative operators
+// (div, sub, shl, shr) on 1-, 2-, 4-, 8-byte integers and for div, sub on
+// float4/float8.
+// Arguments: TYPE_ID, OP_ID, TYPE, BITS, OP, GOMP_FLAG.
+ATOMIC_CMPXCHG_CPT_REV(fixed1, div_cpt_rev, kmp_int8, 8, /,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed1_div_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(fixed1u, div_cpt_rev, kmp_uint8, 8, /,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed1u_div_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(fixed1, shl_cpt_rev, kmp_int8, 8, <<,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed1_shl_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(fixed1, shr_cpt_rev, kmp_int8, 8, >>,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed1_shr_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(fixed1u, shr_cpt_rev, kmp_uint8, 8, >>,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed1u_shr_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(fixed1, sub_cpt_rev, kmp_int8, 8, -,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed1_sub_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(fixed2, div_cpt_rev, kmp_int16, 16, /,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed2_div_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(fixed2u, div_cpt_rev, kmp_uint16, 16, /,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed2u_div_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(fixed2, shl_cpt_rev, kmp_int16, 16, <<,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed2_shl_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(fixed2, shr_cpt_rev, kmp_int16, 16, >>,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed2_shr_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(fixed2u, shr_cpt_rev, kmp_uint16, 16, >>,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed2u_shr_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(fixed2, sub_cpt_rev, kmp_int16, 16, -,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed2_sub_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(fixed4, div_cpt_rev, kmp_int32, 32, /,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed4_div_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(fixed4u, div_cpt_rev, kmp_uint32, 32, /,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed4u_div_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(fixed4, shl_cpt_rev, kmp_int32, 32, <<,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed4_shl_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(fixed4, shr_cpt_rev, kmp_int32, 32, >>,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed4_shr_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(fixed4u, shr_cpt_rev, kmp_uint32, 32, >>,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed4u_shr_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(fixed4, sub_cpt_rev, kmp_int32, 32, -,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed4_sub_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(fixed8, div_cpt_rev, kmp_int64, 64, /,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed8_div_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(fixed8u, div_cpt_rev, kmp_uint64, 64, /,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed8u_div_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(fixed8, shl_cpt_rev, kmp_int64, 64, <<,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed8_shl_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(fixed8, shr_cpt_rev, kmp_int64, 64, >>,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed8_shr_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(fixed8u, shr_cpt_rev, kmp_uint64, 64, >>,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed8u_shr_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(fixed8, sub_cpt_rev, kmp_int64, 64, -,
+                       KMP_ARCH_X86) // __kmpc_atomic_fixed8_sub_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(float4, div_cpt_rev, kmp_real32, 32, /,
+                       KMP_ARCH_X86) // __kmpc_atomic_float4_div_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(float4, sub_cpt_rev, kmp_real32, 32, -,
+                       KMP_ARCH_X86) // __kmpc_atomic_float4_sub_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(float8, div_cpt_rev, kmp_real64, 64, /,
+                       KMP_ARCH_X86) // __kmpc_atomic_float8_div_cpt_rev
+ATOMIC_CMPXCHG_CPT_REV(float8, sub_cpt_rev, kmp_real64, 64, -,
+                       KMP_ARCH_X86) // __kmpc_atomic_float8_sub_cpt_rev
+//              TYPE_ID,OP_ID, TYPE,          OP,  GOMP_FLAG
+
+// ------------------------------------------------------------------------
+// Reversed-operand capture routines for Extended types: long double, _Quad,
+// complex flavours (use critical section)
+//     TYPE_ID, OP_ID, TYPE - detailed above
+//     OP        - bare binary operator (no "=" is appended here)
+//     LCK_ID    - lock identifier, used to possibly distinguish lock variable
+//     GOMP_FLAG - enables the GOMP-compatibility path (lock identifier 0)
+// Both paths end in OP_CRITICAL_CPT_REV, which returns new_value.
+#define ATOMIC_CRITICAL_CPT_REV(TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG)   \
+  ATOMIC_BEGIN_CPT(TYPE_ID, OP_ID, TYPE, TYPE)                                 \
+  TYPE new_value;                                                              \
+  OP_GOMP_CRITICAL_CPT_REV(OP, GOMP_FLAG)                                      \
+  OP_CRITICAL_CPT_REV(OP, LCK_ID)                                              \
+  }
+
+/* ------------------------------------------------------------------------- */
+// routines for long double type
+// Only the non-commutative operators (sub, div) get a reversed variant.
+// Arguments: TYPE_ID, OP_ID, TYPE, OP, LCK_ID, GOMP_FLAG.
+ATOMIC_CRITICAL_CPT_REV(float10, sub_cpt_rev, long double, -, 10r,
+                        1) // __kmpc_atomic_float10_sub_cpt_rev
+ATOMIC_CRITICAL_CPT_REV(float10, div_cpt_rev, long double, /, 10r,
+                        1) // __kmpc_atomic_float10_div_cpt_rev
+#if KMP_HAVE_QUAD
+// routines for _Quad type
+ATOMIC_CRITICAL_CPT_REV(float16, sub_cpt_rev, QUAD_LEGACY, -, 16r,
+                        1) // __kmpc_atomic_float16_sub_cpt_rev
+ATOMIC_CRITICAL_CPT_REV(float16, div_cpt_rev, QUAD_LEGACY, /, 16r,
+                        1) // __kmpc_atomic_float16_div_cpt_rev
+#if (KMP_ARCH_X86)
+ATOMIC_CRITICAL_CPT_REV(float16, sub_a16_cpt_rev, Quad_a16_t, -, 16r,
+                        1) // __kmpc_atomic_float16_sub_a16_cpt_rev
+ATOMIC_CRITICAL_CPT_REV(float16, div_a16_cpt_rev, Quad_a16_t, /, 16r,
+                        1) // __kmpc_atomic_float16_div_a16_cpt_rev
+#endif // (KMP_ARCH_X86)
+#endif // KMP_HAVE_QUAD
+
+// routines for complex types
+
+// ------------------------------------------------------------------------
+// Workaround for cmplx4 (reversed operands): regular routines with a return
+// value don't work on Win_32e, so the captured value is delivered through the
+// additional parameter "out" and the enclosing function returns void.
+//     OP     - bare binary operator, applied as "*lhs = rhs OP *lhs"
+//     LCK_ID - lock identifier (pasted onto ATOMIC_LOCK)
+// flag != 0 captures the value after the update, flag == 0 the value before.
+#define OP_CRITICAL_CPT_REV_WRK(OP, LCK_ID)                                    \
+  __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);                       \
+                                                                               \
+  if (!flag)                                                                   \
+    (*out) = (*lhs); /* value before the update */                             \
+  (*lhs) = (rhs)OP(*lhs);                                                      \
+  if (flag)                                                                    \
+    (*out) = (*lhs); /* value after the update */                              \
+                                                                               \
+  __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);                       \
+  return;
+// ------------------------------------------------------------------------
+// GOMP compat: if FLAG and __kmp_atomic_mode == 2, do the update under lock 0.
+#ifdef KMP_GOMP_COMPAT
+#define OP_GOMP_CRITICAL_CPT_REV_WRK(OP, FLAG)                                 \
+  if ((FLAG) && (__kmp_atomic_mode == 2)) {                                    \
+    KMP_CHECK_GTID;                                                            \
+    OP_CRITICAL_CPT_REV_WRK(OP, 0); /* lock id 0; expansion returns */         \
+  }
+#else
+#define OP_GOMP_CRITICAL_CPT_REV_WRK(OP, FLAG)
+#endif /* KMP_GOMP_COMPAT */
+// ------------------------------------------------------------------------
+// Body after ATOMIC_BEGIN_WRK: GOMP-compat path, then the LCK_ID lock path.
+#define ATOMIC_CRITICAL_CPT_REV_WRK(TYPE_ID, OP_ID, TYPE, OP, LCK_ID,          \
+                                    GOMP_FLAG)                                 \
+  ATOMIC_BEGIN_WRK(TYPE_ID, OP_ID, TYPE)                                       \
+  OP_GOMP_CRITICAL_CPT_REV_WRK(OP, GOMP_FLAG)                                  \
+  OP_CRITICAL_CPT_REV_WRK(OP, LCK_ID)                                          \
+  }
+// The end of workaround for cmplx4
+
+// !!! TODO: check if we need to return void for cmplx4 routines
+// cmplx4: void routines built with the _WRK (extra out parameter) macro
+ATOMIC_CRITICAL_CPT_REV_WRK(cmplx4, sub_cpt_rev, kmp_cmplx32, -, 8c,
+                            1) // __kmpc_atomic_cmplx4_sub_cpt_rev
+ATOMIC_CRITICAL_CPT_REV_WRK(cmplx4, div_cpt_rev, kmp_cmplx32, /, 8c,
+                            1) // __kmpc_atomic_cmplx4_div_cpt_rev
+// cmplx8, cmplx10 and cmplx16 go through the regular ATOMIC_CRITICAL_CPT_REV
+ATOMIC_CRITICAL_CPT_REV(cmplx8, sub_cpt_rev, kmp_cmplx64, -, 16c,
+                        1) // __kmpc_atomic_cmplx8_sub_cpt_rev
+ATOMIC_CRITICAL_CPT_REV(cmplx8, div_cpt_rev, kmp_cmplx64, /, 16c,
+                        1) // __kmpc_atomic_cmplx8_div_cpt_rev
+ATOMIC_CRITICAL_CPT_REV(cmplx10, sub_cpt_rev, kmp_cmplx80, -, 20c,
+                        1) // __kmpc_atomic_cmplx10_sub_cpt_rev
+ATOMIC_CRITICAL_CPT_REV(cmplx10, div_cpt_rev, kmp_cmplx80, /, 20c,
+                        1) // __kmpc_atomic_cmplx10_div_cpt_rev
+#if KMP_HAVE_QUAD
+ATOMIC_CRITICAL_CPT_REV(cmplx16, sub_cpt_rev, CPLX128_LEG, -, 32c,
+                        1) // __kmpc_atomic_cmplx16_sub_cpt_rev
+ATOMIC_CRITICAL_CPT_REV(cmplx16, div_cpt_rev, CPLX128_LEG, /, 32c,
+                        1) // __kmpc_atomic_cmplx16_div_cpt_rev
+#if (KMP_ARCH_X86)
+ATOMIC_CRITICAL_CPT_REV(cmplx16, sub_a16_cpt_rev, kmp_cmplx128_a16_t, -, 32c,
+                        1) // __kmpc_atomic_cmplx16_sub_a16_cpt_rev
+ATOMIC_CRITICAL_CPT_REV(cmplx16, div_a16_cpt_rev, kmp_cmplx128_a16_t, /, 32c,
+                        1) // __kmpc_atomic_cmplx16_div_a16_cpt_rev
+#endif // (KMP_ARCH_X86)
+#endif // KMP_HAVE_QUAD
+
+// Capture reverse for mixed type: RHS=float16
+#if KMP_HAVE_QUAD
+
+// Mixed-type capture-reverse routine whose body is OP_CMPXCHG_CPT_REV.
+//     TYPE_ID, TYPE   - id and type of the updated operand (new_value's type)
+//     OP_ID, OP       - operation identifier and operator token
+//     RTYPE_ID, RTYPE - id and type of the right-hand side (e.g. fp, _Quad)
+//     BITS            - width handed to OP_CMPXCHG_CPT_REV
+//     LCK_ID, MASK    - accepted but unused in this macro's body
+#define ATOMIC_CMPXCHG_CPT_REV_MIX(TYPE_ID, TYPE, OP_ID, BITS, OP, RTYPE_ID,   \
+                                   RTYPE, LCK_ID, MASK, GOMP_FLAG)             \
+  ATOMIC_BEGIN_CPT_MIX(TYPE_ID, OP_ID, TYPE, RTYPE_ID, RTYPE)                  \
+  TYPE new_value;                                                              \
+  OP_GOMP_CRITICAL_CPT_REV(OP, GOMP_FLAG)                                      \
+  OP_CMPXCHG_CPT_REV(TYPE, BITS, OP)                                           \
+  }
+
+// Lock-based mixed-type variant: the body is OP_CRITICAL_CPT_REV(OP, LCK_ID).
+#define ATOMIC_CRITICAL_CPT_REV_MIX(TYPE_ID, TYPE, OP_ID, OP, RTYPE_ID, RTYPE, \
+                                    LCK_ID, GOMP_FLAG)                         \
+  ATOMIC_BEGIN_CPT_MIX(TYPE_ID, OP_ID, TYPE, RTYPE_ID, RTYPE)                  \
+  TYPE new_value;                                                              \
+  OP_GOMP_CRITICAL_CPT_REV(OP, GOMP_FLAG) /* send assignment */                \
+  OP_CRITICAL_CPT_REV(OP, LCK_ID) /* send assignment */                        \
+  }
+// 1-byte integers: 8-bit cmpxchg, lock id 1i, GOMP flag KMP_ARCH_X86
+ATOMIC_CMPXCHG_CPT_REV_MIX(fixed1, char, sub_cpt_rev, 8, -, fp, _Quad, 1i, 0,
+                           KMP_ARCH_X86) // __kmpc_atomic_fixed1_sub_cpt_rev_fp
+ATOMIC_CMPXCHG_CPT_REV_MIX(fixed1u, uchar, sub_cpt_rev, 8, -, fp, _Quad, 1i, 0,
+                           KMP_ARCH_X86) // __kmpc_atomic_fixed1u_sub_cpt_rev_fp
+ATOMIC_CMPXCHG_CPT_REV_MIX(fixed1, char, div_cpt_rev, 8, /, fp, _Quad, 1i, 0,
+                           KMP_ARCH_X86) // __kmpc_atomic_fixed1_div_cpt_rev_fp
+ATOMIC_CMPXCHG_CPT_REV_MIX(fixed1u, uchar, div_cpt_rev, 8, /, fp, _Quad, 1i, 0,
+                           KMP_ARCH_X86) // __kmpc_atomic_fixed1u_div_cpt_rev_fp
+// 2-byte integers: 16-bit cmpxchg, lock id 2i, GOMP flag KMP_ARCH_X86
+ATOMIC_CMPXCHG_CPT_REV_MIX(fixed2, short, sub_cpt_rev, 16, -, fp, _Quad, 2i, 1,
+                           KMP_ARCH_X86) // __kmpc_atomic_fixed2_sub_cpt_rev_fp
+ATOMIC_CMPXCHG_CPT_REV_MIX(fixed2u, ushort, sub_cpt_rev, 16, -, fp, _Quad, 2i,
+                           1,
+                           KMP_ARCH_X86) // __kmpc_atomic_fixed2u_sub_cpt_rev_fp
+ATOMIC_CMPXCHG_CPT_REV_MIX(fixed2, short, div_cpt_rev, 16, /, fp, _Quad, 2i, 1,
+                           KMP_ARCH_X86) // __kmpc_atomic_fixed2_div_cpt_rev_fp
+ATOMIC_CMPXCHG_CPT_REV_MIX(fixed2u, ushort, div_cpt_rev, 16, /, fp, _Quad, 2i,
+                           1,
+                           KMP_ARCH_X86) // __kmpc_atomic_fixed2u_div_cpt_rev_fp
+// 4-byte integers: 32-bit cmpxchg, lock id 4i, GOMP flag 0
+ATOMIC_CMPXCHG_CPT_REV_MIX(fixed4, kmp_int32, sub_cpt_rev, 32, -, fp, _Quad, 4i,
+                           3, 0) // __kmpc_atomic_fixed4_sub_cpt_rev_fp
+ATOMIC_CMPXCHG_CPT_REV_MIX(fixed4u, kmp_uint32, sub_cpt_rev, 32, -, fp, _Quad,
+                           4i, 3, 0) // __kmpc_atomic_fixed4u_sub_cpt_rev_fp
+ATOMIC_CMPXCHG_CPT_REV_MIX(fixed4, kmp_int32, div_cpt_rev, 32, /, fp, _Quad, 4i,
+                           3, 0) // __kmpc_atomic_fixed4_div_cpt_rev_fp
+ATOMIC_CMPXCHG_CPT_REV_MIX(fixed4u, kmp_uint32, div_cpt_rev, 32, /, fp, _Quad,
+                           4i, 3, 0) // __kmpc_atomic_fixed4u_div_cpt_rev_fp
+// 8-byte integers: 64-bit cmpxchg, lock id 8i, GOMP flag KMP_ARCH_X86
+ATOMIC_CMPXCHG_CPT_REV_MIX(fixed8, kmp_int64, sub_cpt_rev, 64, -, fp, _Quad, 8i,
+                           7,
+                           KMP_ARCH_X86) // __kmpc_atomic_fixed8_sub_cpt_rev_fp
+ATOMIC_CMPXCHG_CPT_REV_MIX(fixed8u, kmp_uint64, sub_cpt_rev, 64, -, fp, _Quad,
+                           8i, 7,
+                           KMP_ARCH_X86) // __kmpc_atomic_fixed8u_sub_cpt_rev_fp
+ATOMIC_CMPXCHG_CPT_REV_MIX(fixed8, kmp_int64, div_cpt_rev, 64, /, fp, _Quad, 8i,
+                           7,
+                           KMP_ARCH_X86) // __kmpc_atomic_fixed8_div_cpt_rev_fp
+ATOMIC_CMPXCHG_CPT_REV_MIX(fixed8u, kmp_uint64, div_cpt_rev, 64, /, fp, _Quad,
+                           8i, 7,
+                           KMP_ARCH_X86) // __kmpc_atomic_fixed8u_div_cpt_rev_fp
+// kmp_real32 operands: 32-bit cmpxchg, lock id 4r
+ATOMIC_CMPXCHG_CPT_REV_MIX(float4, kmp_real32, sub_cpt_rev, 32, -, fp, _Quad,
+                           4r, 3,
+                           KMP_ARCH_X86) // __kmpc_atomic_float4_sub_cpt_rev_fp
+ATOMIC_CMPXCHG_CPT_REV_MIX(float4, kmp_real32, div_cpt_rev, 32, /, fp, _Quad,
+                           4r, 3,
+                           KMP_ARCH_X86) // __kmpc_atomic_float4_div_cpt_rev_fp
+// kmp_real64 operands: 64-bit cmpxchg, lock id 8r
+ATOMIC_CMPXCHG_CPT_REV_MIX(float8, kmp_real64, sub_cpt_rev, 64, -, fp, _Quad,
+                           8r, 7,
+                           KMP_ARCH_X86) // __kmpc_atomic_float8_sub_cpt_rev_fp
+ATOMIC_CMPXCHG_CPT_REV_MIX(float8, kmp_real64, div_cpt_rev, 64, /, fp, _Quad,
+                           8r, 7,
+                           KMP_ARCH_X86) // __kmpc_atomic_float8_div_cpt_rev_fp
+// long double operands: lock-based macro, lock id 10r
+ATOMIC_CRITICAL_CPT_REV_MIX(float10, long double, sub_cpt_rev, -, fp, _Quad,
+                            10r, 1) // __kmpc_atomic_float10_sub_cpt_rev_fp
+ATOMIC_CRITICAL_CPT_REV_MIX(float10, long double, div_cpt_rev, /, fp, _Quad,
+                            10r, 1) // __kmpc_atomic_float10_div_cpt_rev_fp
+
+#endif // KMP_HAVE_QUAD
+
+//   OpenMP 4.0 Capture-write (swap): {v = x; x = expr;}
+// Opens TYPE __kmpc_atomic_<TYPE_ID>_swp(id_ref, gtid, lhs, rhs); user adds `}`
+#define ATOMIC_BEGIN_SWP(TYPE_ID, TYPE)                                        \
+  TYPE __kmpc_atomic_##TYPE_ID##_swp(ident_t *id_ref, int gtid, TYPE *lhs,     \
+                                     TYPE rhs) {                               \
+    KMP_DEBUG_ASSERT(__kmp_init_serial);                                       \
+    KA_TRACE(100, ("__kmpc_atomic_" #TYPE_ID "_swp: T#%d\n", gtid));
+
+#define CRITICAL_SWP(LCK_ID)                                                   \
+  __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);                       \
+  /* old_value must be declared by the routine that expands this macro */      \
+  old_value = (*lhs);                                                          \
+  (*lhs) = rhs;                                                                \
+                                                                               \
+  __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);                       \
+  return old_value;
+
+// GOMP compat: if FLAG and __kmp_atomic_mode == 2, swap under lock id 0.
+#ifdef KMP_GOMP_COMPAT
+#define GOMP_CRITICAL_SWP(FLAG)                                                \
+  if ((FLAG) && (__kmp_atomic_mode == 2)) {                                    \
+    KMP_CHECK_GTID;                                                            \
+    CRITICAL_SWP(0);                                                           \
+  }
+#else
+#define GOMP_CRITICAL_SWP(FLAG)
+#endif /* KMP_GOMP_COMPAT */
+
+#define ATOMIC_XCHG_SWP(TYPE_ID, TYPE, BITS, GOMP_FLAG)                        \
+  ATOMIC_BEGIN_SWP(TYPE_ID, TYPE)                                              \
+  TYPE old_value; /* also assigned by the GOMP lock path */                    \
+  GOMP_CRITICAL_SWP(GOMP_FLAG)                                                 \
+  old_value = KMP_XCHG_FIXED##BITS(lhs, rhs);                                  \
+  return old_value; /* whatever the exchange returned */                       \
+  }
+// Same shape as ATOMIC_XCHG_SWP, exchanging through KMP_XCHG_REAL<BITS>.
+#define ATOMIC_XCHG_FLOAT_SWP(TYPE_ID, TYPE, BITS, GOMP_FLAG)                  \
+  ATOMIC_BEGIN_SWP(TYPE_ID, TYPE)                                              \
+  TYPE old_value;                                                              \
+  GOMP_CRITICAL_SWP(GOMP_FLAG)                                                 \
+  old_value = KMP_XCHG_REAL##BITS(lhs, rhs);                                   \
+  return old_value;                                                            \
+  }
+
+// Swap via a compare-and-store loop: retry until rhs has replaced *lhs.
+#define CMPXCHG_SWP(TYPE, BITS)                                                \
+  {                                                                            \
+    TYPE KMP_ATOMIC_VOLATILE temp_val;                                         \
+    TYPE old_value, new_value; /* shadows the routine's old_value */           \
+    temp_val = *lhs;                                                           \
+    old_value = temp_val;                                                      \
+    new_value = rhs;                                                           \
+    while (!KMP_COMPARE_AND_STORE_ACQ##BITS(                                   \
+        (kmp_int##BITS *)lhs, *VOLATILE_CAST(kmp_int##BITS *) & old_value,     \
+        *VOLATILE_CAST(kmp_int##BITS *) & new_value)) {                        \
+      KMP_CPU_PAUSE();                                                         \
+      /* store failed: re-read *lhs and try again */                           \
+      temp_val = *lhs;                                                         \
+      old_value = temp_val;                                                    \
+      new_value = rhs;                                                         \
+    }                                                                          \
+    return old_value;                                                          \
+  }
+
+// Swap routine: GOMP lock path if enabled, else the CMPXCHG_SWP loop.
+#define ATOMIC_CMPXCHG_SWP(TYPE_ID, TYPE, BITS, GOMP_FLAG)                     \
+  ATOMIC_BEGIN_SWP(TYPE_ID, TYPE)                                              \
+  TYPE old_value;                                                              \
+  GOMP_CRITICAL_SWP(GOMP_FLAG)                                                 \
+  CMPXCHG_SWP(TYPE, BITS)                                                      \
+  }
+
+ATOMIC_XCHG_SWP(fixed1, kmp_int8, 8, KMP_ARCH_X86) // __kmpc_atomic_fixed1_swp
+ATOMIC_XCHG_SWP(fixed2, kmp_int16, 16, KMP_ARCH_X86) // __kmpc_atomic_fixed2_swp
+ATOMIC_XCHG_SWP(fixed4, kmp_int32, 32, KMP_ARCH_X86) // __kmpc_atomic_fixed4_swp
+// float4 swaps through KMP_XCHG_REAL32 (ATOMIC_XCHG_FLOAT_SWP)
+ATOMIC_XCHG_FLOAT_SWP(float4, kmp_real32, 32,
+                      KMP_ARCH_X86) // __kmpc_atomic_float4_swp
+// 8-byte swaps: compare-and-store loop if KMP_ARCH_X86, plain exchange if not
+#if (KMP_ARCH_X86)
+ATOMIC_CMPXCHG_SWP(fixed8, kmp_int64, 64,
+                   KMP_ARCH_X86) // __kmpc_atomic_fixed8_swp
+ATOMIC_CMPXCHG_SWP(float8, kmp_real64, 64,
+                   KMP_ARCH_X86) // __kmpc_atomic_float8_swp
+#else
+ATOMIC_XCHG_SWP(fixed8, kmp_int64, 64, KMP_ARCH_X86) // __kmpc_atomic_fixed8_swp
+ATOMIC_XCHG_FLOAT_SWP(float8, kmp_real64, 64,
+                      KMP_ARCH_X86) // __kmpc_atomic_float8_swp
+#endif // (KMP_ARCH_X86)
+
+// ------------------------------------------------------------------------
+// Swap routine for extended types (long double, _Quad, complex flavours):
+// the exchange is done in a critical section guarded by lock LCK_ID.
+#define ATOMIC_CRITICAL_SWP(TYPE_ID, TYPE, LCK_ID, GOMP_FLAG)                  \
+  ATOMIC_BEGIN_SWP(TYPE_ID, TYPE)                                              \
+  TYPE old_value;                                                              \
+  GOMP_CRITICAL_SWP(GOMP_FLAG)                                                 \
+  CRITICAL_SWP(LCK_ID)                                                         \
+  }
+
+// ------------------------------------------------------------------------
+// !!! TODO: check if we need to return void for cmplx4 routines
+// Workaround for cmplx4: routines that return the value don't work on
+// Win_32e, so the captured value is stored through the extra `out` pointer.
+// Opens void __kmpc_atomic_<TYPE_ID>_swp(id_ref, gtid, lhs, rhs, out).
+#define ATOMIC_BEGIN_SWP_WRK(TYPE_ID, TYPE)                                    \
+  void __kmpc_atomic_##TYPE_ID##_swp(ident_t *id_ref, int gtid, TYPE *lhs,     \
+                                     TYPE rhs, TYPE *out) {                    \
+    KMP_DEBUG_ASSERT(__kmp_init_serial);                                       \
+    KA_TRACE(100, ("__kmpc_atomic_" #TYPE_ID "_swp: T#%d\n", gtid));
+
+#define CRITICAL_SWP_WRK(LCK_ID)                                               \
+  __kmp_acquire_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);                       \
+  /* tmp is declared by the expanding macro; old *lhs goes to *out */          \
+  tmp = (*lhs);                                                                \
+  (*lhs) = (rhs);                                                              \
+  (*out) = tmp;                                                                \
+  __kmp_release_atomic_lock(&ATOMIC_LOCK##LCK_ID, gtid);                       \
+  return;
+// ------------------------------------------------------------------------
+
+#ifdef KMP_GOMP_COMPAT
+#define GOMP_CRITICAL_SWP_WRK(FLAG)                                            \
+  if ((FLAG) && (__kmp_atomic_mode == 2)) {                                    \
+    KMP_CHECK_GTID;                                                            \
+    CRITICAL_SWP_WRK(0); /* lock id 0; expansion returns */                    \
+  }
+#else // without GOMP compat the hook expands to nothing
+#define GOMP_CRITICAL_SWP_WRK(FLAG)
+#endif /* KMP_GOMP_COMPAT */
+// ------------------------------------------------------------------------
+
+#define ATOMIC_CRITICAL_SWP_WRK(TYPE_ID, TYPE, LCK_ID, GOMP_FLAG)              \
+  ATOMIC_BEGIN_SWP_WRK(TYPE_ID, TYPE)                                          \
+  TYPE tmp; /* scratch written by CRITICAL_SWP_WRK */                          \
+  GOMP_CRITICAL_SWP_WRK(GOMP_FLAG)                                             \
+  CRITICAL_SWP_WRK(LCK_ID)                                                     \
+  }
+// The end of workaround for cmplx4
+
+ATOMIC_CRITICAL_SWP(float10, long double, 10r, 1) // __kmpc_atomic_float10_swp
+#if KMP_HAVE_QUAD
+ATOMIC_CRITICAL_SWP(float16, QUAD_LEGACY, 16r, 1) // __kmpc_atomic_float16_swp
+#endif // KMP_HAVE_QUAD
+// cmplx4 swap returns void; the old value is written through `out`
+ATOMIC_CRITICAL_SWP_WRK(cmplx4, kmp_cmplx32, 8c, 1) // __kmpc_atomic_cmplx4_swp
+
+// The value-returning ATOMIC_CRITICAL_SWP form is not instantiated for
+// cmplx4 (see the Win_32e workaround note above the _WRK macros).
+
+ATOMIC_CRITICAL_SWP(cmplx8, kmp_cmplx64, 16c, 1) // __kmpc_atomic_cmplx8_swp
+ATOMIC_CRITICAL_SWP(cmplx10, kmp_cmplx80, 20c, 1) // __kmpc_atomic_cmplx10_swp
+#if KMP_HAVE_QUAD
+ATOMIC_CRITICAL_SWP(cmplx16, CPLX128_LEG, 32c, 1) // __kmpc_atomic_cmplx16_swp
+#if (KMP_ARCH_X86)
+ATOMIC_CRITICAL_SWP(float16_a16, Quad_a16_t, 16r,
+                    1) // __kmpc_atomic_float16_a16_swp
+ATOMIC_CRITICAL_SWP(cmplx16_a16, kmp_cmplx128_a16_t, 32c,
+                    1) // __kmpc_atomic_cmplx16_a16_swp
+#endif // (KMP_ARCH_X86)
+#endif // KMP_HAVE_QUAD
+
+// End of OpenMP 4.0 Capture
+
+#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+#undef OP_CRITICAL
+
+/* ------------------------------------------------------------------------ */
+/* Generic atomic routines                                                  */
+
+void __kmpc_atomic_1(ident_t *id_ref, int gtid, void *lhs, void *rhs,
+                     void (*f)(void *, void *, void *)) {
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+
+  // Generic 1-byte atomic update. The callback is invoked as
+  // f(result, current, rhs) and the result is stored into *lhs.
+#if KMP_ARCH_X86 && defined(KMP_GOMP_COMPAT)
+  const bool lock_free = FALSE; /* must use lock */
+#else
+  const bool lock_free = TRUE;
+#endif // KMP_ARCH_X86 && defined(KMP_GOMP_COMPAT)
+
+  if (lock_free) {
+    // Recompute from a fresh snapshot until the compare-and-store succeeds.
+    kmp_int8 *const target = (kmp_int8 *)lhs;
+    kmp_int8 seen, wanted;
+    for (;;) {
+      seen = *target;
+      (*f)(&wanted, &seen, rhs);
+      /* TODO: Should this be acquire or release? */
+      if (KMP_COMPARE_AND_STORE_ACQ8(target, seen, wanted))
+        break;
+      KMP_CPU_PAUSE();
+    }
+    return;
+  }
+
+  // Lock-based path. All 1-byte data is of integer data type.
+#ifdef KMP_GOMP_COMPAT
+  __kmp_acquire_atomic_lock(
+      __kmp_atomic_mode == 2 ? &__kmp_atomic_lock : &__kmp_atomic_lock_1i,
+      gtid);
+#else
+  __kmp_acquire_atomic_lock(&__kmp_atomic_lock_1i, gtid);
+#endif /* KMP_GOMP_COMPAT */
+
+  (*f)(lhs, lhs, rhs);
+
+#ifdef KMP_GOMP_COMPAT
+  __kmp_release_atomic_lock(
+      __kmp_atomic_mode == 2 ? &__kmp_atomic_lock : &__kmp_atomic_lock_1i,
+      gtid);
+#else
+  __kmp_release_atomic_lock(&__kmp_atomic_lock_1i, gtid);
+#endif /* KMP_GOMP_COMPAT */
+}
+
+void __kmpc_atomic_2(ident_t *id_ref, int gtid, void *lhs, void *rhs,
+                     void (*f)(void *, void *, void *)) {
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+  // Generic 2-byte atomic update: f(result, current, rhs) yields the new *lhs.
+  if (
+#if KMP_ARCH_X86 && defined(KMP_GOMP_COMPAT)
+      FALSE /* must use lock */
+#elif KMP_ARCH_X86 || KMP_ARCH_X86_64
+      TRUE /* no alignment problems */
+#else
+      !((kmp_uintptr_t)lhs & 0x1) /* make sure address is 2-byte aligned */
+#endif // KMP_ARCH_X86 && defined(KMP_GOMP_COMPAT)
+      ) {
+    // Lock-free path: retry the 16-bit compare-and-store until it succeeds.
+    kmp_int16 old_value, new_value;
+    old_value = *(kmp_int16 *)lhs;
+    (*f)(&new_value, &old_value, rhs);
+
+    /* TODO: Should this be acquire or release? */
+    while (!KMP_COMPARE_AND_STORE_ACQ16(
+        (kmp_int16 *)lhs, *(kmp_int16 *)&old_value, *(kmp_int16 *)&new_value)) {
+      KMP_CPU_PAUSE();
+      // Lost the race: recompute from the value now stored in *lhs.
+      old_value = *(kmp_int16 *)lhs;
+      (*f)(&new_value, &old_value, rhs);
+    }
+    return;
+  } else {
+// All 2-byte data is of integer data type.
+#ifdef KMP_GOMP_COMPAT
+    if (__kmp_atomic_mode == 2) {
+      __kmp_acquire_atomic_lock(&__kmp_atomic_lock, gtid);
+    } else
+#endif /* KMP_GOMP_COMPAT */
+      __kmp_acquire_atomic_lock(&__kmp_atomic_lock_2i, gtid);
+
+    (*f)(lhs, lhs, rhs);
+
+#ifdef KMP_GOMP_COMPAT
+    if (__kmp_atomic_mode == 2) {
+      __kmp_release_atomic_lock(&__kmp_atomic_lock, gtid);
+    } else
+#endif /* KMP_GOMP_COMPAT */
+      __kmp_release_atomic_lock(&__kmp_atomic_lock_2i, gtid);
+  }
+}
+
+void __kmpc_atomic_4(ident_t *id_ref, int gtid, void *lhs, void *rhs,
+                     void (*f)(void *, void *, void *)) {
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+
+  // Generic 4-byte atomic update. The callback is invoked as
+  // f(result, current, rhs) and the result is stored into *lhs.
+  // FIXME: On IA-32 architecture, gcc uses cmpxchg only for 4-byte ints.
+  // Gomp compatibility is broken if this routine is called for floats.
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+  const bool lock_free = TRUE; /* no alignment problems */
+#else
+  const bool lock_free = !((kmp_uintptr_t)lhs & 0x3); /* 4-byte aligned */
+#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+  if (lock_free) {
+    // Recompute from a fresh snapshot until the compare-and-store succeeds.
+    kmp_int32 *const target = (kmp_int32 *)lhs;
+    kmp_int32 seen, wanted;
+    for (;;) {
+      seen = *target;
+      (*f)(&wanted, &seen, rhs);
+      /* TODO: Should this be acquire or release? */
+      if (KMP_COMPARE_AND_STORE_ACQ32(target, seen, wanted))
+        break;
+      KMP_CPU_PAUSE();
+    }
+    return;
+  }
+
+  // Use __kmp_atomic_lock_4i for all 4-byte data,
+  // even if it isn't of integer data type.
+#ifdef KMP_GOMP_COMPAT
+  __kmp_acquire_atomic_lock(
+      __kmp_atomic_mode == 2 ? &__kmp_atomic_lock : &__kmp_atomic_lock_4i,
+      gtid);
+#else
+  __kmp_acquire_atomic_lock(&__kmp_atomic_lock_4i, gtid);
+#endif /* KMP_GOMP_COMPAT */
+
+  (*f)(lhs, lhs, rhs);
+
+#ifdef KMP_GOMP_COMPAT
+  __kmp_release_atomic_lock(
+      __kmp_atomic_mode == 2 ? &__kmp_atomic_lock : &__kmp_atomic_lock_4i,
+      gtid);
+#else
+  __kmp_release_atomic_lock(&__kmp_atomic_lock_4i, gtid);
+#endif /* KMP_GOMP_COMPAT */
+}
+
+void __kmpc_atomic_8(ident_t *id_ref, int gtid, void *lhs, void *rhs,
+                     void (*f)(void *, void *, void *)) {
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+  // Generic 8-byte atomic update. The callback is invoked as
+  // f(result, current, rhs) and the result is stored into *lhs.
+#if KMP_ARCH_X86 && defined(KMP_GOMP_COMPAT)
+  const bool lock_free = FALSE; /* must use lock */
+#elif KMP_ARCH_X86 || KMP_ARCH_X86_64
+  const bool lock_free = TRUE; /* no alignment problems */
+#else
+  const bool lock_free = !((kmp_uintptr_t)lhs & 0x7); /* 8-byte aligned */
+#endif // KMP_ARCH_X86 && defined(KMP_GOMP_COMPAT)
+
+  if (lock_free) {
+    // Recompute from a fresh snapshot until the compare-and-store succeeds.
+    kmp_int64 *const target = (kmp_int64 *)lhs;
+    kmp_int64 seen, wanted;
+    for (;;) {
+      seen = *target;
+      (*f)(&wanted, &seen, rhs);
+      /* TODO: Should this be acquire or release? */
+      if (KMP_COMPARE_AND_STORE_ACQ64(target, seen, wanted))
+        break;
+      KMP_CPU_PAUSE();
+    }
+    return;
+  }
+
+  // Use __kmp_atomic_lock_8i for all 8-byte data,
+  // even if it isn't of integer data type.
+#ifdef KMP_GOMP_COMPAT
+  __kmp_acquire_atomic_lock(
+      __kmp_atomic_mode == 2 ? &__kmp_atomic_lock : &__kmp_atomic_lock_8i,
+      gtid);
+#else
+  __kmp_acquire_atomic_lock(&__kmp_atomic_lock_8i, gtid);
+#endif /* KMP_GOMP_COMPAT */
+
+  (*f)(lhs, lhs, rhs);
+
+#ifdef KMP_GOMP_COMPAT
+  __kmp_release_atomic_lock(
+      __kmp_atomic_mode == 2 ? &__kmp_atomic_lock : &__kmp_atomic_lock_8i,
+      gtid);
+#else
+  __kmp_release_atomic_lock(&__kmp_atomic_lock_8i, gtid);
+#endif /* KMP_GOMP_COMPAT */
+}
+
+void __kmpc_atomic_10(ident_t *id_ref, int gtid, void *lhs, void *rhs,
+                      void (*f)(void *, void *, void *)) {
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+  // Generic 10-byte atomic: f(lhs, lhs, rhs) runs under the selected lock.
+#ifdef KMP_GOMP_COMPAT
+  __kmp_acquire_atomic_lock(
+      __kmp_atomic_mode == 2 ? &__kmp_atomic_lock : &__kmp_atomic_lock_10r,
+      gtid);
+#else
+  __kmp_acquire_atomic_lock(&__kmp_atomic_lock_10r, gtid);
+#endif /* KMP_GOMP_COMPAT */
+  (*f)(lhs, lhs, rhs);
+#ifdef KMP_GOMP_COMPAT
+  __kmp_release_atomic_lock(
+      __kmp_atomic_mode == 2 ? &__kmp_atomic_lock : &__kmp_atomic_lock_10r,
+      gtid);
+#else
+  __kmp_release_atomic_lock(&__kmp_atomic_lock_10r, gtid);
+#endif /* KMP_GOMP_COMPAT */
+}
+
+void __kmpc_atomic_16(ident_t *id_ref, int gtid, void *lhs, void *rhs,
+                      void (*f)(void *, void *, void *)) {
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+  // Generic 16-byte atomic: f(lhs, lhs, rhs) runs under the selected lock.
+#ifdef KMP_GOMP_COMPAT
+  __kmp_acquire_atomic_lock(
+      __kmp_atomic_mode == 2 ? &__kmp_atomic_lock : &__kmp_atomic_lock_16c,
+      gtid);
+#else
+  __kmp_acquire_atomic_lock(&__kmp_atomic_lock_16c, gtid);
+#endif /* KMP_GOMP_COMPAT */
+  (*f)(lhs, lhs, rhs);
+#ifdef KMP_GOMP_COMPAT
+  __kmp_release_atomic_lock(
+      __kmp_atomic_mode == 2 ? &__kmp_atomic_lock : &__kmp_atomic_lock_16c,
+      gtid);
+#else
+  __kmp_release_atomic_lock(&__kmp_atomic_lock_16c, gtid);
+#endif /* KMP_GOMP_COMPAT */
+}
+
+void __kmpc_atomic_20(ident_t *id_ref, int gtid, void *lhs, void *rhs,
+                      void (*f)(void *, void *, void *)) {
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+  // Generic 20-byte atomic: f(lhs, lhs, rhs) runs under the selected lock.
+#ifdef KMP_GOMP_COMPAT
+  __kmp_acquire_atomic_lock(
+      __kmp_atomic_mode == 2 ? &__kmp_atomic_lock : &__kmp_atomic_lock_20c,
+      gtid);
+#else
+  __kmp_acquire_atomic_lock(&__kmp_atomic_lock_20c, gtid);
+#endif /* KMP_GOMP_COMPAT */
+  (*f)(lhs, lhs, rhs);
+#ifdef KMP_GOMP_COMPAT
+  __kmp_release_atomic_lock(
+      __kmp_atomic_mode == 2 ? &__kmp_atomic_lock : &__kmp_atomic_lock_20c,
+      gtid);
+#else
+  __kmp_release_atomic_lock(&__kmp_atomic_lock_20c, gtid);
+#endif /* KMP_GOMP_COMPAT */
+}
+
+void __kmpc_atomic_32(ident_t *id_ref, int gtid, void *lhs, void *rhs,
+                      void (*f)(void *, void *, void *)) {
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+  // Generic 32-byte atomic: f(lhs, lhs, rhs) runs under the selected lock.
+#ifdef KMP_GOMP_COMPAT
+  __kmp_acquire_atomic_lock(
+      __kmp_atomic_mode == 2 ? &__kmp_atomic_lock : &__kmp_atomic_lock_32c,
+      gtid);
+#else
+  __kmp_acquire_atomic_lock(&__kmp_atomic_lock_32c, gtid);
+#endif /* KMP_GOMP_COMPAT */
+  (*f)(lhs, lhs, rhs);
+#ifdef KMP_GOMP_COMPAT
+  __kmp_release_atomic_lock(
+      __kmp_atomic_mode == 2 ? &__kmp_atomic_lock : &__kmp_atomic_lock_32c,
+      gtid);
+#else
+  __kmp_release_atomic_lock(&__kmp_atomic_lock_32c, gtid);
+#endif /* KMP_GOMP_COMPAT */
+}
+
+// AC: these two routines mirror GOMP_atomic_start/end but are called by our
+// compiler; they are duplicated so pure Intel code uses no third-party names.
+// TODO: consider adding GTID parameter after consultation with Ernesto/Xinmin.
+void __kmpc_atomic_start(void) {
+  const int tid = __kmp_entry_gtid();
+  KA_TRACE(20, ("__kmpc_atomic_start: T#%d\n", tid));
+  __kmp_acquire_atomic_lock(&__kmp_atomic_lock, tid); // global atomic lock
+}
+
+void __kmpc_atomic_end(void) { // counterpart of __kmpc_atomic_start
+  const int tid = __kmp_get_gtid();
+  KA_TRACE(20, ("__kmpc_atomic_end: T#%d\n", tid));
+  __kmp_release_atomic_lock(&__kmp_atomic_lock, tid); // global atomic lock
+}
+
+/*!
+@}
+*/
+
+// end of file
diff --git a/final/runtime/src/kmp_atomic.h b/final/runtime/src/kmp_atomic.h
new file mode 100644
index 0000000..bb01c31
--- /dev/null
+++ b/final/runtime/src/kmp_atomic.h
@@ -0,0 +1,1767 @@
+/*
+ * kmp_atomic.h - ATOMIC header file
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_ATOMIC_H
+#define KMP_ATOMIC_H
+
+#include "kmp_lock.h"
+#include "kmp_os.h"
+
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
+// C++ build port.
+// The Intel compiler does not support the _Complex datatype on Windows.
+// The Intel compiler supports the _Complex datatype on Linux and macOS.
+// On the other hand, there is a stack alignment problem on 32-bit Linux and
+// macOS if the rhs is a cmplx80 or cmplx128 typedef'ed datatype.
+// Decision: use the compiler-supported _Complex type on Linux and macOS,
+//           use the typedef'ed types on Windows.
+// Condition for WIN64 was modified in anticipation of 10.1 build compiler.
+
+#if defined(__cplusplus) && (KMP_OS_WINDOWS)
+// create shortcuts for c99 complex types
+
+// Visual Studio cannot have function parameters that have the
+// align __declspec attribute, so we must remove it. (Compiler Error C2719)
+#if KMP_COMPILER_MSVC
+#undef KMP_DO_ALIGN
+#define KMP_DO_ALIGN(alignment) /* Nothing */
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER < 1600) && defined(_DEBUG)
+// Workaround for the _DebugHeapTag unresolved external: it prevented using our
+// static debug library (built with /MTd) for C tests compiled with /MDd.
+// Limited to MSVC older than 1600; an undefined _MSC_VER must not match here.
+#undef _DEBUG
+#define _DEBUG_TEMPORARILY_UNSET_
+#endif
+
+#include <complex>
+
+// Textbook complex division: (a+bi)/(c+di) = ((ac+bd) + (bc-ad)i)/(c*c+d*d).
+// Intermediates are held in type_rhs; the result is stored as type_lhs.
+// No scaling is applied before forming c*c + d*d.
+template <typename type_lhs, typename type_rhs>
+std::complex<type_lhs> __kmp_lhs_div_rhs(const std::complex<type_lhs> &lhs,
+                                         const std::complex<type_rhs> &rhs) {
+  const type_lhs num_re = lhs.real(), num_im = lhs.imag();
+  const type_rhs den_re = rhs.real(), den_im = rhs.imag();
+  const type_rhs norm = den_re * den_re + den_im * den_im; // |rhs|^2
+  const type_rhs re = num_re * den_re + num_im * den_im;
+  const type_rhs im = num_im * den_re - num_re * den_im;
+  return std::complex<type_lhs>(re / norm, im / norm);
+}
+
+// complex8
+struct __kmp_cmplx64_t : std::complex<double> {
+  // std::complex<double> wrapper: only division is customized here, and it
+  // is forwarded to __kmp_lhs_div_rhs.
+  __kmp_cmplx64_t() : std::complex<double>() {}
+  __kmp_cmplx64_t(const std::complex<double> &cd) : std::complex<double>(cd) {}
+  // In-place division: *this becomes *this / rhs.
+  void operator/=(const __kmp_cmplx64_t &rhs) {
+    std::complex<double> self(*this);
+    *this = __kmp_cmplx64_t(__kmp_lhs_div_rhs(self, rhs));
+  }
+  // Value form: returns *this / rhs and leaves *this untouched.
+  __kmp_cmplx64_t operator/(const __kmp_cmplx64_t &rhs) {
+    std::complex<double> self(*this);
+    return __kmp_cmplx64_t(__kmp_lhs_div_rhs(self, rhs));
+  }
+};
+typedef struct __kmp_cmplx64_t kmp_cmplx64;
+
+// complex4
+struct __kmp_cmplx32_t : std::complex<float> {
+  // std::complex<float> wrapper. Next to float-only +, - and * it accepts a
+  // kmp_cmplx64 right-hand side for +, -, * and /, always returning a
+  // single-precision value. Every division goes through __kmp_lhs_div_rhs.
+
+  __kmp_cmplx32_t() : std::complex<float>() {}
+
+  __kmp_cmplx32_t(const std::complex<float> &cf) : std::complex<float>(cf) {}
+
+  // float (op) float: both operands are copied to plain std::complex<float>
+  // objects and combined with the standard operators.
+  __kmp_cmplx32_t operator+(const __kmp_cmplx32_t &b) {
+    std::complex<float> x(*this), y(b);
+    return __kmp_cmplx32_t(x + y);
+  }
+  __kmp_cmplx32_t operator-(const __kmp_cmplx32_t &b) {
+    std::complex<float> x(*this), y(b);
+    return __kmp_cmplx32_t(x - y);
+  }
+  __kmp_cmplx32_t operator*(const __kmp_cmplx32_t &b) {
+    std::complex<float> x(*this), y(b);
+    return __kmp_cmplx32_t(x * y);
+  }
+
+  // float (op) double: *this is widened to kmp_cmplx64, the operation is done
+  // in double precision and the result is narrowed to std::complex<float>.
+  __kmp_cmplx32_t operator+(const kmp_cmplx64 &b) {
+    std::complex<double> wide(kmp_cmplx64(*this) + b);
+    return __kmp_cmplx32_t(std::complex<float>(wide));
+  }
+  __kmp_cmplx32_t operator-(const kmp_cmplx64 &b) {
+    std::complex<double> wide(kmp_cmplx64(*this) - b);
+    return __kmp_cmplx32_t(std::complex<float>(wide));
+  }
+  __kmp_cmplx32_t operator*(const kmp_cmplx64 &b) {
+    std::complex<double> wide(kmp_cmplx64(*this) * b);
+    return __kmp_cmplx32_t(std::complex<float>(wide));
+  }
+
+  // Division by a single-precision complex, in-place form.
+  void operator/=(const __kmp_cmplx32_t &rhs) {
+    std::complex<float> self(*this);
+    *this = __kmp_cmplx32_t(__kmp_lhs_div_rhs(self, rhs));
+  }
+
+  // Division by a single-precision complex, value-returning form.
+  __kmp_cmplx32_t operator/(const __kmp_cmplx32_t &rhs) {
+    std::complex<float> self(*this);
+    return __kmp_cmplx32_t(__kmp_lhs_div_rhs(self, rhs));
+  }
+
+  // Division by a double-precision complex, in-place form. The helper keeps
+  // its intermediates in the rhs element type and yields std::complex<float>.
+  void operator/=(const kmp_cmplx64 &rhs) {
+    std::complex<float> self(*this);
+    *this = __kmp_cmplx32_t(__kmp_lhs_div_rhs(self, rhs));
+  }
+
+  // Division by a double-precision complex, value-returning form.
+  __kmp_cmplx32_t operator/(const kmp_cmplx64 &rhs) {
+    std::complex<float> self(*this);
+    return __kmp_cmplx32_t(__kmp_lhs_div_rhs(self, rhs));
+  }
+};
+typedef struct __kmp_cmplx32_t kmp_cmplx32;
+
+// complex10
+struct KMP_DO_ALIGN(16) __kmp_cmplx80_t : std::complex<long double> {
+  // std::complex<long double> wrapper: only division is customized here, and
+  // it is forwarded to __kmp_lhs_div_rhs.
+  __kmp_cmplx80_t() : std::complex<long double>() {}
+  __kmp_cmplx80_t(const std::complex<long double> &cld)
+      : std::complex<long double>(cld) {}
+  // In-place division: *this becomes *this / rhs.
+  void operator/=(const __kmp_cmplx80_t &rhs) {
+    std::complex<long double> self(*this);
+    *this = __kmp_cmplx80_t(__kmp_lhs_div_rhs(self, rhs));
+  }
+  // Value form: returns *this / rhs and leaves *this untouched.
+  __kmp_cmplx80_t operator/(const __kmp_cmplx80_t &rhs) {
+    std::complex<long double> self(*this);
+    return __kmp_cmplx80_t(__kmp_lhs_div_rhs(self, rhs));
+  }
+};
+typedef KMP_DO_ALIGN(16) struct __kmp_cmplx80_t kmp_cmplx80;
+
+// complex16
+#if KMP_HAVE_QUAD
+struct __kmp_cmplx128_t : std::complex<_Quad> {
+  // std::complex<_Quad> wrapper: only division is customized here, and it is
+  // forwarded to __kmp_lhs_div_rhs.
+  __kmp_cmplx128_t() : std::complex<_Quad>() {}
+  __kmp_cmplx128_t(const std::complex<_Quad> &cq) : std::complex<_Quad>(cq) {}
+  // In-place division: *this becomes *this / rhs.
+  void operator/=(const __kmp_cmplx128_t &rhs) {
+    std::complex<_Quad> self(*this);
+    *this = __kmp_cmplx128_t(__kmp_lhs_div_rhs(self, rhs));
+  }
+  // Value form: returns *this / rhs and leaves *this untouched.
+  __kmp_cmplx128_t operator/(const __kmp_cmplx128_t &rhs) {
+    std::complex<_Quad> self(*this);
+    return __kmp_cmplx128_t(__kmp_lhs_div_rhs(self, rhs));
+  }
+};
+typedef struct __kmp_cmplx128_t kmp_cmplx128;
+#endif /* KMP_HAVE_QUAD */
+
+#ifdef _DEBUG_TEMPORARILY_UNSET_
+#undef _DEBUG_TEMPORARILY_UNSET_
+// Set it back now
+#define _DEBUG 1
+#endif
+
+#else
+// create shortcuts for c99 complex types
+typedef float _Complex kmp_cmplx32;
+typedef double _Complex kmp_cmplx64;
+typedef long double _Complex kmp_cmplx80;
+#if KMP_HAVE_QUAD
+typedef _Quad _Complex kmp_cmplx128;
+#endif
+#endif
+
+// Compiler 12.0 changed alignment of 16 and 32-byte arguments (like _Quad
+// and kmp_cmplx128) on IA-32 architecture. The following aligned structures
+// are implemented to support the old alignment in 10.1, 11.0, 11.1 and
+// introduce the new alignment in 12.0. See CQ88405.
+#if KMP_ARCH_X86 && KMP_HAVE_QUAD
+
+// 4-byte aligned structures for backward compatibility.
+
+#pragma pack(push, 4)
+
+struct KMP_DO_ALIGN(4) Quad_a4_t {
+  // _Quad holder declared with KMP_DO_ALIGN(4), defined inside the enclosing
+  // #pragma pack(push, 4) region. Operands are copied into locals before use.
+  _Quad q;
+
+  Quad_a4_t() : q() {}
+  Quad_a4_t(const _Quad &cq) : q(cq) {}
+
+  // Sum of the two wrapped values.
+  Quad_a4_t operator+(const Quad_a4_t &b) {
+    _Quad x = this->q, y = b.q;
+    return Quad_a4_t(x + y);
+  }
+  // Difference: this value minus b.
+  Quad_a4_t operator-(const Quad_a4_t &b) {
+    _Quad x = this->q, y = b.q;
+    return Quad_a4_t(x - y);
+  }
+  // Product of the two wrapped values.
+  Quad_a4_t operator*(const Quad_a4_t &b) {
+    _Quad x = this->q, y = b.q;
+    return Quad_a4_t(x * y);
+  }
+  // Quotient: this value divided by b.
+  Quad_a4_t operator/(const Quad_a4_t &b) {
+    _Quad x = this->q, y = b.q;
+    return Quad_a4_t(x / y);
+  }
+};
+
+struct KMP_DO_ALIGN(4) kmp_cmplx128_a4_t {
+  // kmp_cmplx128 holder declared with KMP_DO_ALIGN(4), defined inside the
+  // enclosing #pragma pack(push, 4) region. Operands are copied to locals.
+  kmp_cmplx128 q;
+
+  kmp_cmplx128_a4_t() : q() {}
+  kmp_cmplx128_a4_t(const kmp_cmplx128 &c128) : q(c128) {}
+
+  // Sum of the two wrapped values.
+  kmp_cmplx128_a4_t operator+(const kmp_cmplx128_a4_t &b) {
+    kmp_cmplx128 x = this->q, y = b.q;
+    return kmp_cmplx128_a4_t(x + y);
+  }
+  // Difference: this value minus b.
+  kmp_cmplx128_a4_t operator-(const kmp_cmplx128_a4_t &b) {
+    kmp_cmplx128 x = this->q, y = b.q;
+    return kmp_cmplx128_a4_t(x - y);
+  }
+  // Product of the two wrapped values.
+  kmp_cmplx128_a4_t operator*(const kmp_cmplx128_a4_t &b) {
+    kmp_cmplx128 x = this->q, y = b.q;
+    return kmp_cmplx128_a4_t(x * y);
+  }
+  // Quotient: this value divided by b.
+  kmp_cmplx128_a4_t operator/(const kmp_cmplx128_a4_t &b) {
+    kmp_cmplx128 x = this->q, y = b.q;
+    return kmp_cmplx128_a4_t(x / y);
+  }
+};
+
+#pragma pack(pop)
+
+// New 16-byte aligned structures for 12.0 compiler.
+struct KMP_DO_ALIGN(16) Quad_a16_t {
+  // _Quad holder declared with KMP_DO_ALIGN(16). Each operator copies both
+  // operands into locals, applies the operation and wraps the result.
+  _Quad q;
+
+  Quad_a16_t() : q() {}
+  Quad_a16_t(const _Quad &cq) : q(cq) {}
+
+  // Sum of the two wrapped values.
+  Quad_a16_t operator+(const Quad_a16_t &b) {
+    _Quad x = this->q, y = b.q;
+    return Quad_a16_t(x + y);
+  }
+  // Difference: this value minus b.
+  Quad_a16_t operator-(const Quad_a16_t &b) {
+    _Quad x = this->q, y = b.q;
+    return Quad_a16_t(x - y);
+  }
+  // Product of the two wrapped values.
+  Quad_a16_t operator*(const Quad_a16_t &b) {
+    _Quad x = this->q, y = b.q;
+    return Quad_a16_t(x * y);
+  }
+  // Quotient: this value divided by b.
+  Quad_a16_t operator/(const Quad_a16_t &b) {
+    _Quad x = this->q, y = b.q;
+    return Quad_a16_t(x / y);
+  }
+};
+
+struct KMP_DO_ALIGN(16) kmp_cmplx128_a16_t {
+  // kmp_cmplx128 holder declared with KMP_DO_ALIGN(16). Each operator copies
+  // both operands into locals, applies the operation and wraps the result.
+  kmp_cmplx128 q;
+
+  kmp_cmplx128_a16_t() : q() {}
+  kmp_cmplx128_a16_t(const kmp_cmplx128 &c128) : q(c128) {}
+
+  // Sum of the two wrapped values.
+  kmp_cmplx128_a16_t operator+(const kmp_cmplx128_a16_t &b) {
+    kmp_cmplx128 x = this->q, y = b.q;
+    return kmp_cmplx128_a16_t(x + y);
+  }
+  // Difference: this value minus b.
+  kmp_cmplx128_a16_t operator-(const kmp_cmplx128_a16_t &b) {
+    kmp_cmplx128 x = this->q, y = b.q;
+    return kmp_cmplx128_a16_t(x - y);
+  }
+  // Product of the two wrapped values.
+  kmp_cmplx128_a16_t operator*(const kmp_cmplx128_a16_t &b) {
+    kmp_cmplx128 x = this->q, y = b.q;
+    return kmp_cmplx128_a16_t(x * y);
+  }
+  // Quotient: this value divided by b.
+  kmp_cmplx128_a16_t operator/(const kmp_cmplx128_a16_t &b) {
+    kmp_cmplx128 x = this->q, y = b.q;
+    return kmp_cmplx128_a16_t(x / y);
+  }
+};
+
+#endif
+
+#if (KMP_ARCH_X86)
+#define QUAD_LEGACY Quad_a4_t // IA-32: legacy routines take the a4 wrappers
+#define CPLX128_LEG kmp_cmplx128_a4_t
+#else
+#define QUAD_LEGACY _Quad // other targets: the plain types are used directly
+#define CPLX128_LEG kmp_cmplx128
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int __kmp_atomic_mode;
+
+// Atomic locks can easily become contended, so we use queuing locks for them.
+typedef kmp_queuing_lock_t kmp_atomic_lock_t;
+
+static inline void __kmp_acquire_atomic_lock(kmp_atomic_lock_t *lck,
+                                             kmp_int32 gtid) {
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.ompt_callback_mutex_acquire) { // reported before locking
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+        ompt_mutex_atomic, 0, kmp_mutex_impl_queuing, (ompt_wait_id_t)(uintptr_t)lck,
+        OMPT_GET_RETURN_ADDRESS(0));
+  }
+#endif
+  // Acquire the queuing lock itself; the optional OMPT callbacks bracket it.
+  __kmp_acquire_queuing_lock(lck, gtid);
+  // After the acquire call returns, report mutex_acquired if it is enabled.
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.ompt_callback_mutex_acquired) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
+        ompt_mutex_atomic, (ompt_wait_id_t)(uintptr_t)lck, OMPT_GET_RETURN_ADDRESS(0));
+  }
+#endif
+}
+
+static inline int __kmp_test_atomic_lock(kmp_atomic_lock_t *lck,
+                                         kmp_int32 gtid) {
+  return __kmp_test_queuing_lock(lck, gtid); // plain forward, no OMPT callback
+}
+
+static inline void __kmp_release_atomic_lock(kmp_atomic_lock_t *lck,
+                                             kmp_int32 gtid) {
+  __kmp_release_queuing_lock(lck, gtid); // release first, then notify the tool
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.ompt_callback_mutex_released) { // only if callback is set
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
+        ompt_mutex_atomic, (ompt_wait_id_t)(uintptr_t)lck, OMPT_GET_RETURN_ADDRESS(0));
+  }
+#endif
+}
+
+static inline void __kmp_init_atomic_lock(kmp_atomic_lock_t *lck) {
+  __kmp_init_queuing_lock(lck); // atomic locks are queuing locks (see typedef)
+}
+
+static inline void __kmp_destroy_atomic_lock(kmp_atomic_lock_t *lck) {
+  __kmp_destroy_queuing_lock(lck); // forwards to the queuing-lock destructor
+}
+
+// Global Locks
+extern kmp_atomic_lock_t __kmp_atomic_lock; /* Control access to all user coded
+                                               atomics in Gnu compat mode   */
+extern kmp_atomic_lock_t __kmp_atomic_lock_1i; /* Control access to all user
+                                                  coded atomics for 1-byte fixed
+                                                  data types */
+extern kmp_atomic_lock_t __kmp_atomic_lock_2i; /* Control access to all user
+                                                  coded atomics for 2-byte fixed
+                                                  data types */
+extern kmp_atomic_lock_t __kmp_atomic_lock_4i; /* Control access to all user
+                                                  coded atomics for 4-byte fixed
+                                                  data types */
+extern kmp_atomic_lock_t __kmp_atomic_lock_4r; /* Control access to all user
+                                                  coded atomics for kmp_real32
+                                                  data type    */
+extern kmp_atomic_lock_t __kmp_atomic_lock_8i; /* Control access to all user
+                                                  coded atomics for 8-byte fixed
+                                                  data types */
+extern kmp_atomic_lock_t __kmp_atomic_lock_8r; /* Control access to all user
+                                                  coded atomics for kmp_real64
+                                                  data type    */
+extern kmp_atomic_lock_t
+    __kmp_atomic_lock_8c; /* Control access to all user coded atomics for
+                             float complex data type  */
+extern kmp_atomic_lock_t
+    __kmp_atomic_lock_10r; /* Control access to all user coded atomics for long
+                              double data type   */
+extern kmp_atomic_lock_t __kmp_atomic_lock_16r; /* Control access to all user
+                                                   coded atomics for _Quad data
+                                                   type         */
+extern kmp_atomic_lock_t __kmp_atomic_lock_16c; /* Control access to all user
+                                                   coded atomics for double
+                                                   complex data type*/
+extern kmp_atomic_lock_t
+    __kmp_atomic_lock_20c; /* Control access to all user coded atomics for long
+                              double complex type*/
+extern kmp_atomic_lock_t __kmp_atomic_lock_32c; /* Control access to all user
+                                                   coded atomics for _Quad
+                                                   complex data type */
+
+//  Below routines for atomic UPDATE are listed
+
+// 1-byte
+void __kmpc_atomic_fixed1_add(ident_t *id_ref, int gtid, char *lhs, char rhs);
+void __kmpc_atomic_fixed1_andb(ident_t *id_ref, int gtid, char *lhs, char rhs);
+void __kmpc_atomic_fixed1_div(ident_t *id_ref, int gtid, char *lhs, char rhs);
+void __kmpc_atomic_fixed1u_div(ident_t *id_ref, int gtid, unsigned char *lhs,
+                               unsigned char rhs);
+void __kmpc_atomic_fixed1_mul(ident_t *id_ref, int gtid, char *lhs, char rhs);
+void __kmpc_atomic_fixed1_orb(ident_t *id_ref, int gtid, char *lhs, char rhs);
+void __kmpc_atomic_fixed1_shl(ident_t *id_ref, int gtid, char *lhs, char rhs);
+void __kmpc_atomic_fixed1_shr(ident_t *id_ref, int gtid, char *lhs, char rhs);
+void __kmpc_atomic_fixed1u_shr(ident_t *id_ref, int gtid, unsigned char *lhs,
+                               unsigned char rhs);
+void __kmpc_atomic_fixed1_sub(ident_t *id_ref, int gtid, char *lhs, char rhs);
+void __kmpc_atomic_fixed1_xor(ident_t *id_ref, int gtid, char *lhs, char rhs);
+// 2-byte
+void __kmpc_atomic_fixed2_add(ident_t *id_ref, int gtid, short *lhs, short rhs);
+void __kmpc_atomic_fixed2_andb(ident_t *id_ref, int gtid, short *lhs,
+                               short rhs);
+void __kmpc_atomic_fixed2_div(ident_t *id_ref, int gtid, short *lhs, short rhs);
+void __kmpc_atomic_fixed2u_div(ident_t *id_ref, int gtid, unsigned short *lhs,
+                               unsigned short rhs);
+void __kmpc_atomic_fixed2_mul(ident_t *id_ref, int gtid, short *lhs, short rhs);
+void __kmpc_atomic_fixed2_orb(ident_t *id_ref, int gtid, short *lhs, short rhs);
+void __kmpc_atomic_fixed2_shl(ident_t *id_ref, int gtid, short *lhs, short rhs);
+void __kmpc_atomic_fixed2_shr(ident_t *id_ref, int gtid, short *lhs, short rhs);
+void __kmpc_atomic_fixed2u_shr(ident_t *id_ref, int gtid, unsigned short *lhs,
+                               unsigned short rhs);
+void __kmpc_atomic_fixed2_sub(ident_t *id_ref, int gtid, short *lhs, short rhs);
+void __kmpc_atomic_fixed2_xor(ident_t *id_ref, int gtid, short *lhs, short rhs);
+// 4-byte add / sub fixed
+void __kmpc_atomic_fixed4_add(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                              kmp_int32 rhs);
+void __kmpc_atomic_fixed4_sub(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                              kmp_int32 rhs);
+// 4-byte add / sub float
+void __kmpc_atomic_float4_add(ident_t *id_ref, int gtid, kmp_real32 *lhs,
+                              kmp_real32 rhs);
+void __kmpc_atomic_float4_sub(ident_t *id_ref, int gtid, kmp_real32 *lhs,
+                              kmp_real32 rhs);
+// 8-byte add / sub fixed
+void __kmpc_atomic_fixed8_add(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                              kmp_int64 rhs);
+void __kmpc_atomic_fixed8_sub(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                              kmp_int64 rhs);
+// 8-byte add / sub float
+void __kmpc_atomic_float8_add(ident_t *id_ref, int gtid, kmp_real64 *lhs,
+                              kmp_real64 rhs);
+void __kmpc_atomic_float8_sub(ident_t *id_ref, int gtid, kmp_real64 *lhs,
+                              kmp_real64 rhs);
+// 4-byte fixed
+void __kmpc_atomic_fixed4_andb(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                               kmp_int32 rhs);
+void __kmpc_atomic_fixed4_div(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                              kmp_int32 rhs);
+void __kmpc_atomic_fixed4u_div(ident_t *id_ref, int gtid, kmp_uint32 *lhs,
+                               kmp_uint32 rhs);
+void __kmpc_atomic_fixed4_mul(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                              kmp_int32 rhs);
+void __kmpc_atomic_fixed4_orb(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                              kmp_int32 rhs);
+void __kmpc_atomic_fixed4_shl(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                              kmp_int32 rhs);
+void __kmpc_atomic_fixed4_shr(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                              kmp_int32 rhs);
+void __kmpc_atomic_fixed4u_shr(ident_t *id_ref, int gtid, kmp_uint32 *lhs,
+                               kmp_uint32 rhs);
+void __kmpc_atomic_fixed4_xor(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                              kmp_int32 rhs);
+// 8-byte fixed
+void __kmpc_atomic_fixed8_andb(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                               kmp_int64 rhs);
+void __kmpc_atomic_fixed8_div(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                              kmp_int64 rhs);
+void __kmpc_atomic_fixed8u_div(ident_t *id_ref, int gtid, kmp_uint64 *lhs,
+                               kmp_uint64 rhs);
+void __kmpc_atomic_fixed8_mul(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                              kmp_int64 rhs);
+void __kmpc_atomic_fixed8_orb(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                              kmp_int64 rhs);
+void __kmpc_atomic_fixed8_shl(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                              kmp_int64 rhs);
+void __kmpc_atomic_fixed8_shr(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                              kmp_int64 rhs);
+void __kmpc_atomic_fixed8u_shr(ident_t *id_ref, int gtid, kmp_uint64 *lhs,
+                               kmp_uint64 rhs);
+void __kmpc_atomic_fixed8_xor(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                              kmp_int64 rhs);
+// 4-byte float
+void __kmpc_atomic_float4_div(ident_t *id_ref, int gtid, kmp_real32 *lhs,
+                              kmp_real32 rhs);
+void __kmpc_atomic_float4_mul(ident_t *id_ref, int gtid, kmp_real32 *lhs,
+                              kmp_real32 rhs);
+// 8-byte float
+void __kmpc_atomic_float8_div(ident_t *id_ref, int gtid, kmp_real64 *lhs,
+                              kmp_real64 rhs);
+void __kmpc_atomic_float8_mul(ident_t *id_ref, int gtid, kmp_real64 *lhs,
+                              kmp_real64 rhs);
+// 1-, 2-, 4-, 8-byte logical (&&, ||)
+void __kmpc_atomic_fixed1_andl(ident_t *id_ref, int gtid, char *lhs, char rhs);
+void __kmpc_atomic_fixed1_orl(ident_t *id_ref, int gtid, char *lhs, char rhs);
+void __kmpc_atomic_fixed2_andl(ident_t *id_ref, int gtid, short *lhs,
+                               short rhs);
+void __kmpc_atomic_fixed2_orl(ident_t *id_ref, int gtid, short *lhs, short rhs);
+void __kmpc_atomic_fixed4_andl(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                               kmp_int32 rhs);
+void __kmpc_atomic_fixed4_orl(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                              kmp_int32 rhs);
+void __kmpc_atomic_fixed8_andl(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                               kmp_int64 rhs);
+void __kmpc_atomic_fixed8_orl(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                              kmp_int64 rhs);
+// MIN / MAX
+void __kmpc_atomic_fixed1_max(ident_t *id_ref, int gtid, char *lhs, char rhs);
+void __kmpc_atomic_fixed1_min(ident_t *id_ref, int gtid, char *lhs, char rhs);
+void __kmpc_atomic_fixed2_max(ident_t *id_ref, int gtid, short *lhs, short rhs);
+void __kmpc_atomic_fixed2_min(ident_t *id_ref, int gtid, short *lhs, short rhs);
+void __kmpc_atomic_fixed4_max(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                              kmp_int32 rhs);
+void __kmpc_atomic_fixed4_min(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                              kmp_int32 rhs);
+void __kmpc_atomic_fixed8_max(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                              kmp_int64 rhs);
+void __kmpc_atomic_fixed8_min(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                              kmp_int64 rhs);
+void __kmpc_atomic_float4_max(ident_t *id_ref, int gtid, kmp_real32 *lhs,
+                              kmp_real32 rhs);
+void __kmpc_atomic_float4_min(ident_t *id_ref, int gtid, kmp_real32 *lhs,
+                              kmp_real32 rhs);
+void __kmpc_atomic_float8_max(ident_t *id_ref, int gtid, kmp_real64 *lhs,
+                              kmp_real64 rhs);
+void __kmpc_atomic_float8_min(ident_t *id_ref, int gtid, kmp_real64 *lhs,
+                              kmp_real64 rhs);
+#if KMP_HAVE_QUAD
+void __kmpc_atomic_float16_max(ident_t *id_ref, int gtid, QUAD_LEGACY *lhs,
+                               QUAD_LEGACY rhs);
+void __kmpc_atomic_float16_min(ident_t *id_ref, int gtid, QUAD_LEGACY *lhs,
+                               QUAD_LEGACY rhs);
+#if (KMP_ARCH_X86)
+// Routines with 16-byte arguments aligned to 16-byte boundary; IA-32
+// architecture only
+void __kmpc_atomic_float16_max_a16(ident_t *id_ref, int gtid, Quad_a16_t *lhs,
+                                   Quad_a16_t rhs);
+void __kmpc_atomic_float16_min_a16(ident_t *id_ref, int gtid, Quad_a16_t *lhs,
+                                   Quad_a16_t rhs);
+#endif
+#endif
+// .NEQV. (same as xor)
+void __kmpc_atomic_fixed1_neqv(ident_t *id_ref, int gtid, char *lhs, char rhs);
+void __kmpc_atomic_fixed2_neqv(ident_t *id_ref, int gtid, short *lhs,
+                               short rhs);
+void __kmpc_atomic_fixed4_neqv(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                               kmp_int32 rhs);
+void __kmpc_atomic_fixed8_neqv(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                               kmp_int64 rhs);
+// .EQV. (same as ~xor)
+void __kmpc_atomic_fixed1_eqv(ident_t *id_ref, int gtid, char *lhs, char rhs);
+void __kmpc_atomic_fixed2_eqv(ident_t *id_ref, int gtid, short *lhs, short rhs);
+void __kmpc_atomic_fixed4_eqv(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                              kmp_int32 rhs);
+void __kmpc_atomic_fixed8_eqv(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                              kmp_int64 rhs);
+// long double type
+void __kmpc_atomic_float10_add(ident_t *id_ref, int gtid, long double *lhs,
+                               long double rhs);
+void __kmpc_atomic_float10_sub(ident_t *id_ref, int gtid, long double *lhs,
+                               long double rhs);
+void __kmpc_atomic_float10_mul(ident_t *id_ref, int gtid, long double *lhs,
+                               long double rhs);
+void __kmpc_atomic_float10_div(ident_t *id_ref, int gtid, long double *lhs,
+                               long double rhs);
+// _Quad type
+#if KMP_HAVE_QUAD
+void __kmpc_atomic_float16_add(ident_t *id_ref, int gtid, QUAD_LEGACY *lhs,
+                               QUAD_LEGACY rhs);
+void __kmpc_atomic_float16_sub(ident_t *id_ref, int gtid, QUAD_LEGACY *lhs,
+                               QUAD_LEGACY rhs);
+void __kmpc_atomic_float16_mul(ident_t *id_ref, int gtid, QUAD_LEGACY *lhs,
+                               QUAD_LEGACY rhs);
+void __kmpc_atomic_float16_div(ident_t *id_ref, int gtid, QUAD_LEGACY *lhs,
+                               QUAD_LEGACY rhs);
+#if (KMP_ARCH_X86)
+// Routines with 16-byte arguments aligned to 16-byte boundary
+void __kmpc_atomic_float16_add_a16(ident_t *id_ref, int gtid, Quad_a16_t *lhs,
+                                   Quad_a16_t rhs);
+void __kmpc_atomic_float16_sub_a16(ident_t *id_ref, int gtid, Quad_a16_t *lhs,
+                                   Quad_a16_t rhs);
+void __kmpc_atomic_float16_mul_a16(ident_t *id_ref, int gtid, Quad_a16_t *lhs,
+                                   Quad_a16_t rhs);
+void __kmpc_atomic_float16_div_a16(ident_t *id_ref, int gtid, Quad_a16_t *lhs,
+                                   Quad_a16_t rhs);
+#endif
+#endif
+// routines for complex types
+void __kmpc_atomic_cmplx4_add(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs,
+                              kmp_cmplx32 rhs);
+void __kmpc_atomic_cmplx4_sub(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs,
+                              kmp_cmplx32 rhs);
+void __kmpc_atomic_cmplx4_mul(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs,
+                              kmp_cmplx32 rhs);
+void __kmpc_atomic_cmplx4_div(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs,
+                              kmp_cmplx32 rhs);
+void __kmpc_atomic_cmplx8_add(ident_t *id_ref, int gtid, kmp_cmplx64 *lhs,
+                              kmp_cmplx64 rhs);
+void __kmpc_atomic_cmplx8_sub(ident_t *id_ref, int gtid, kmp_cmplx64 *lhs,
+                              kmp_cmplx64 rhs);
+void __kmpc_atomic_cmplx8_mul(ident_t *id_ref, int gtid, kmp_cmplx64 *lhs,
+                              kmp_cmplx64 rhs);
+void __kmpc_atomic_cmplx8_div(ident_t *id_ref, int gtid, kmp_cmplx64 *lhs,
+                              kmp_cmplx64 rhs);
+void __kmpc_atomic_cmplx10_add(ident_t *id_ref, int gtid, kmp_cmplx80 *lhs,
+                               kmp_cmplx80 rhs);
+void __kmpc_atomic_cmplx10_sub(ident_t *id_ref, int gtid, kmp_cmplx80 *lhs,
+                               kmp_cmplx80 rhs);
+void __kmpc_atomic_cmplx10_mul(ident_t *id_ref, int gtid, kmp_cmplx80 *lhs,
+                               kmp_cmplx80 rhs);
+void __kmpc_atomic_cmplx10_div(ident_t *id_ref, int gtid, kmp_cmplx80 *lhs,
+                               kmp_cmplx80 rhs);
+#if KMP_HAVE_QUAD
+void __kmpc_atomic_cmplx16_add(ident_t *id_ref, int gtid, CPLX128_LEG *lhs,
+                               CPLX128_LEG rhs);
+void __kmpc_atomic_cmplx16_sub(ident_t *id_ref, int gtid, CPLX128_LEG *lhs,
+                               CPLX128_LEG rhs);
+void __kmpc_atomic_cmplx16_mul(ident_t *id_ref, int gtid, CPLX128_LEG *lhs,
+                               CPLX128_LEG rhs);
+void __kmpc_atomic_cmplx16_div(ident_t *id_ref, int gtid, CPLX128_LEG *lhs,
+                               CPLX128_LEG rhs);
+#if (KMP_ARCH_X86)
+// Routines with 16-byte arguments aligned to 16-byte boundary
+void __kmpc_atomic_cmplx16_add_a16(ident_t *id_ref, int gtid,
+                                   kmp_cmplx128_a16_t *lhs,
+                                   kmp_cmplx128_a16_t rhs);
+void __kmpc_atomic_cmplx16_sub_a16(ident_t *id_ref, int gtid,
+                                   kmp_cmplx128_a16_t *lhs,
+                                   kmp_cmplx128_a16_t rhs);
+void __kmpc_atomic_cmplx16_mul_a16(ident_t *id_ref, int gtid,
+                                   kmp_cmplx128_a16_t *lhs,
+                                   kmp_cmplx128_a16_t rhs);
+void __kmpc_atomic_cmplx16_div_a16(ident_t *id_ref, int gtid,
+                                   kmp_cmplx128_a16_t *lhs,
+                                   kmp_cmplx128_a16_t rhs);
+#endif
+#endif
+
+// OpenMP 4.0: x = expr binop x for non-commutative operations.
+// Supported only on IA-32 architecture and Intel(R) 64
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+void __kmpc_atomic_fixed1_sub_rev(ident_t *id_ref, int gtid, char *lhs,
+                                  char rhs);
+void __kmpc_atomic_fixed1_div_rev(ident_t *id_ref, int gtid, char *lhs,
+                                  char rhs);
+void __kmpc_atomic_fixed1u_div_rev(ident_t *id_ref, int gtid,
+                                   unsigned char *lhs, unsigned char rhs);
+void __kmpc_atomic_fixed1_shl_rev(ident_t *id_ref, int gtid, char *lhs,
+                                  char rhs);
+void __kmpc_atomic_fixed1_shr_rev(ident_t *id_ref, int gtid, char *lhs,
+                                  char rhs);
+void __kmpc_atomic_fixed1u_shr_rev(ident_t *id_ref, int gtid,
+                                   unsigned char *lhs, unsigned char rhs);
+void __kmpc_atomic_fixed2_sub_rev(ident_t *id_ref, int gtid, short *lhs,
+                                  short rhs);
+void __kmpc_atomic_fixed2_div_rev(ident_t *id_ref, int gtid, short *lhs,
+                                  short rhs);
+void __kmpc_atomic_fixed2u_div_rev(ident_t *id_ref, int gtid,
+                                   unsigned short *lhs, unsigned short rhs);
+void __kmpc_atomic_fixed2_shl_rev(ident_t *id_ref, int gtid, short *lhs,
+                                  short rhs);
+void __kmpc_atomic_fixed2_shr_rev(ident_t *id_ref, int gtid, short *lhs,
+                                  short rhs);
+void __kmpc_atomic_fixed2u_shr_rev(ident_t *id_ref, int gtid,
+                                   unsigned short *lhs, unsigned short rhs);
+void __kmpc_atomic_fixed4_sub_rev(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                                  kmp_int32 rhs);
+void __kmpc_atomic_fixed4_div_rev(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                                  kmp_int32 rhs);
+void __kmpc_atomic_fixed4u_div_rev(ident_t *id_ref, int gtid, kmp_uint32 *lhs,
+                                   kmp_uint32 rhs);
+void __kmpc_atomic_fixed4_shl_rev(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                                  kmp_int32 rhs);
+void __kmpc_atomic_fixed4_shr_rev(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                                  kmp_int32 rhs);
+void __kmpc_atomic_fixed4u_shr_rev(ident_t *id_ref, int gtid, kmp_uint32 *lhs,
+                                   kmp_uint32 rhs);
+void __kmpc_atomic_fixed8_sub_rev(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                                  kmp_int64 rhs);
+void __kmpc_atomic_fixed8_div_rev(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                                  kmp_int64 rhs);
+void __kmpc_atomic_fixed8u_div_rev(ident_t *id_ref, int gtid, kmp_uint64 *lhs,
+                                   kmp_uint64 rhs);
+void __kmpc_atomic_fixed8_shl_rev(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                                  kmp_int64 rhs);
+void __kmpc_atomic_fixed8_shr_rev(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                                  kmp_int64 rhs);
+void __kmpc_atomic_fixed8u_shr_rev(ident_t *id_ref, int gtid, kmp_uint64 *lhs,
+                                   kmp_uint64 rhs);
+void __kmpc_atomic_float4_sub_rev(ident_t *id_ref, int gtid, float *lhs,
+                                  float rhs);
+void __kmpc_atomic_float4_div_rev(ident_t *id_ref, int gtid, float *lhs,
+                                  float rhs);
+void __kmpc_atomic_float8_sub_rev(ident_t *id_ref, int gtid, double *lhs,
+                                  double rhs);
+void __kmpc_atomic_float8_div_rev(ident_t *id_ref, int gtid, double *lhs,
+                                  double rhs);
+void __kmpc_atomic_float10_sub_rev(ident_t *id_ref, int gtid, long double *lhs,
+                                   long double rhs);
+void __kmpc_atomic_float10_div_rev(ident_t *id_ref, int gtid, long double *lhs,
+                                   long double rhs);
+#if KMP_HAVE_QUAD
+void __kmpc_atomic_float16_sub_rev(ident_t *id_ref, int gtid, QUAD_LEGACY *lhs,
+                                   QUAD_LEGACY rhs);
+void __kmpc_atomic_float16_div_rev(ident_t *id_ref, int gtid, QUAD_LEGACY *lhs,
+                                   QUAD_LEGACY rhs);
+#endif // KMP_HAVE_QUAD
+void __kmpc_atomic_cmplx4_sub_rev(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs,
+                                  kmp_cmplx32 rhs);
+void __kmpc_atomic_cmplx4_div_rev(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs,
+                                  kmp_cmplx32 rhs);
+void __kmpc_atomic_cmplx8_sub_rev(ident_t *id_ref, int gtid, kmp_cmplx64 *lhs,
+                                  kmp_cmplx64 rhs);
+void __kmpc_atomic_cmplx8_div_rev(ident_t *id_ref, int gtid, kmp_cmplx64 *lhs,
+                                  kmp_cmplx64 rhs);
+void __kmpc_atomic_cmplx10_sub_rev(ident_t *id_ref, int gtid, kmp_cmplx80 *lhs,
+                                   kmp_cmplx80 rhs);
+void __kmpc_atomic_cmplx10_div_rev(ident_t *id_ref, int gtid, kmp_cmplx80 *lhs,
+                                   kmp_cmplx80 rhs);
+#if KMP_HAVE_QUAD
+void __kmpc_atomic_cmplx16_sub_rev(ident_t *id_ref, int gtid, CPLX128_LEG *lhs,
+                                   CPLX128_LEG rhs);
+void __kmpc_atomic_cmplx16_div_rev(ident_t *id_ref, int gtid, CPLX128_LEG *lhs,
+                                   CPLX128_LEG rhs);
+#if (KMP_ARCH_X86)
+// Routines whose 16-byte arguments are aligned to a 16-byte boundary (*_a16_t)
+void __kmpc_atomic_float16_sub_a16_rev(ident_t *id_ref, int gtid,
+                                       Quad_a16_t *lhs, Quad_a16_t rhs);
+void __kmpc_atomic_float16_div_a16_rev(ident_t *id_ref, int gtid,
+                                       Quad_a16_t *lhs, Quad_a16_t rhs);
+void __kmpc_atomic_cmplx16_sub_a16_rev(ident_t *id_ref, int gtid,
+                                       kmp_cmplx128_a16_t *lhs,
+                                       kmp_cmplx128_a16_t rhs);
+void __kmpc_atomic_cmplx16_div_a16_rev(ident_t *id_ref, int gtid,
+                                       kmp_cmplx128_a16_t *lhs,
+                                       kmp_cmplx128_a16_t rhs);
+#endif // KMP_ARCH_X86
+#endif // KMP_HAVE_QUAD
+
+#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+// Routines for mixed types (LHS and RHS operands of different types)
+
+// RHS=float8: fixed1/2/4/8 and float4 LHS combined with a kmp_real64 RHS
+void __kmpc_atomic_fixed1_mul_float8(ident_t *id_ref, int gtid, char *lhs,
+                                     kmp_real64 rhs);
+void __kmpc_atomic_fixed1_div_float8(ident_t *id_ref, int gtid, char *lhs,
+                                     kmp_real64 rhs);
+void __kmpc_atomic_fixed2_mul_float8(ident_t *id_ref, int gtid, short *lhs,
+                                     kmp_real64 rhs);
+void __kmpc_atomic_fixed2_div_float8(ident_t *id_ref, int gtid, short *lhs,
+                                     kmp_real64 rhs);
+void __kmpc_atomic_fixed4_mul_float8(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                                     kmp_real64 rhs);
+void __kmpc_atomic_fixed4_div_float8(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                                     kmp_real64 rhs);
+void __kmpc_atomic_fixed8_mul_float8(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                                     kmp_real64 rhs);
+void __kmpc_atomic_fixed8_div_float8(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                                     kmp_real64 rhs);
+void __kmpc_atomic_float4_add_float8(ident_t *id_ref, int gtid, kmp_real32 *lhs,
+                                     kmp_real64 rhs);
+void __kmpc_atomic_float4_sub_float8(ident_t *id_ref, int gtid, kmp_real32 *lhs,
+                                     kmp_real64 rhs);
+void __kmpc_atomic_float4_mul_float8(ident_t *id_ref, int gtid, kmp_real32 *lhs,
+                                     kmp_real64 rhs);
+void __kmpc_atomic_float4_div_float8(ident_t *id_ref, int gtid, kmp_real32 *lhs,
+                                     kmp_real64 rhs);
+
+
+// RHS=float16 (_Quad). Deprecated: to be removed once we are sure the compiler
+// no longer uses these routines.
+#if KMP_HAVE_QUAD
+void __kmpc_atomic_fixed1_add_fp(ident_t *id_ref, int gtid, char *lhs,
+                                 _Quad rhs);
+void __kmpc_atomic_fixed1u_add_fp(ident_t *id_ref, int gtid, unsigned char *lhs,
+                                  _Quad rhs);
+void __kmpc_atomic_fixed1_sub_fp(ident_t *id_ref, int gtid, char *lhs,
+                                 _Quad rhs);
+void __kmpc_atomic_fixed1u_sub_fp(ident_t *id_ref, int gtid, unsigned char *lhs,
+                                  _Quad rhs);
+void __kmpc_atomic_fixed1_mul_fp(ident_t *id_ref, int gtid, char *lhs,
+                                 _Quad rhs);
+void __kmpc_atomic_fixed1u_mul_fp(ident_t *id_ref, int gtid, unsigned char *lhs,
+                                  _Quad rhs);
+void __kmpc_atomic_fixed1_div_fp(ident_t *id_ref, int gtid, char *lhs,
+                                 _Quad rhs);
+void __kmpc_atomic_fixed1u_div_fp(ident_t *id_ref, int gtid, unsigned char *lhs,
+                                  _Quad rhs);
+
+void __kmpc_atomic_fixed2_add_fp(ident_t *id_ref, int gtid, short *lhs,
+                                 _Quad rhs);
+void __kmpc_atomic_fixed2u_add_fp(ident_t *id_ref, int gtid,
+                                  unsigned short *lhs, _Quad rhs);
+void __kmpc_atomic_fixed2_sub_fp(ident_t *id_ref, int gtid, short *lhs,
+                                 _Quad rhs);
+void __kmpc_atomic_fixed2u_sub_fp(ident_t *id_ref, int gtid,
+                                  unsigned short *lhs, _Quad rhs);
+void __kmpc_atomic_fixed2_mul_fp(ident_t *id_ref, int gtid, short *lhs,
+                                 _Quad rhs);
+void __kmpc_atomic_fixed2u_mul_fp(ident_t *id_ref, int gtid,
+                                  unsigned short *lhs, _Quad rhs);
+void __kmpc_atomic_fixed2_div_fp(ident_t *id_ref, int gtid, short *lhs,
+                                 _Quad rhs);
+void __kmpc_atomic_fixed2u_div_fp(ident_t *id_ref, int gtid,
+                                  unsigned short *lhs, _Quad rhs);
+
+void __kmpc_atomic_fixed4_add_fp(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                                 _Quad rhs);
+void __kmpc_atomic_fixed4u_add_fp(ident_t *id_ref, int gtid, kmp_uint32 *lhs,
+                                  _Quad rhs);
+void __kmpc_atomic_fixed4_sub_fp(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                                 _Quad rhs);
+void __kmpc_atomic_fixed4u_sub_fp(ident_t *id_ref, int gtid, kmp_uint32 *lhs,
+                                  _Quad rhs);
+void __kmpc_atomic_fixed4_mul_fp(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                                 _Quad rhs);
+void __kmpc_atomic_fixed4u_mul_fp(ident_t *id_ref, int gtid, kmp_uint32 *lhs,
+                                  _Quad rhs);
+void __kmpc_atomic_fixed4_div_fp(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                                 _Quad rhs);
+void __kmpc_atomic_fixed4u_div_fp(ident_t *id_ref, int gtid, kmp_uint32 *lhs,
+                                  _Quad rhs);
+
+void __kmpc_atomic_fixed8_add_fp(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                                 _Quad rhs);
+void __kmpc_atomic_fixed8u_add_fp(ident_t *id_ref, int gtid, kmp_uint64 *lhs,
+                                  _Quad rhs);
+void __kmpc_atomic_fixed8_sub_fp(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                                 _Quad rhs);
+void __kmpc_atomic_fixed8u_sub_fp(ident_t *id_ref, int gtid, kmp_uint64 *lhs,
+                                  _Quad rhs);
+void __kmpc_atomic_fixed8_mul_fp(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                                 _Quad rhs);
+void __kmpc_atomic_fixed8u_mul_fp(ident_t *id_ref, int gtid, kmp_uint64 *lhs,
+                                  _Quad rhs);
+void __kmpc_atomic_fixed8_div_fp(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                                 _Quad rhs);
+void __kmpc_atomic_fixed8u_div_fp(ident_t *id_ref, int gtid, kmp_uint64 *lhs,
+                                  _Quad rhs);
+
+void __kmpc_atomic_float4_add_fp(ident_t *id_ref, int gtid, kmp_real32 *lhs,
+                                 _Quad rhs);
+void __kmpc_atomic_float4_sub_fp(ident_t *id_ref, int gtid, kmp_real32 *lhs,
+                                 _Quad rhs);
+void __kmpc_atomic_float4_mul_fp(ident_t *id_ref, int gtid, kmp_real32 *lhs,
+                                 _Quad rhs);
+void __kmpc_atomic_float4_div_fp(ident_t *id_ref, int gtid, kmp_real32 *lhs,
+                                 _Quad rhs);
+
+void __kmpc_atomic_float8_add_fp(ident_t *id_ref, int gtid, kmp_real64 *lhs,
+                                 _Quad rhs);
+void __kmpc_atomic_float8_sub_fp(ident_t *id_ref, int gtid, kmp_real64 *lhs,
+                                 _Quad rhs);
+void __kmpc_atomic_float8_mul_fp(ident_t *id_ref, int gtid, kmp_real64 *lhs,
+                                 _Quad rhs);
+void __kmpc_atomic_float8_div_fp(ident_t *id_ref, int gtid, kmp_real64 *lhs,
+                                 _Quad rhs);
+
+void __kmpc_atomic_float10_add_fp(ident_t *id_ref, int gtid, long double *lhs,
+                                  _Quad rhs);
+void __kmpc_atomic_float10_sub_fp(ident_t *id_ref, int gtid, long double *lhs,
+                                  _Quad rhs);
+void __kmpc_atomic_float10_mul_fp(ident_t *id_ref, int gtid, long double *lhs,
+                                  _Quad rhs);
+void __kmpc_atomic_float10_div_fp(ident_t *id_ref, int gtid, long double *lhs,
+                                  _Quad rhs);
+
+// Reverse operations (*_rev_fp): sub and div only, again with a _Quad RHS
+void __kmpc_atomic_fixed1_sub_rev_fp(ident_t *id_ref, int gtid, char *lhs,
+                                     _Quad rhs);
+void __kmpc_atomic_fixed1u_sub_rev_fp(ident_t *id_ref, int gtid,
+                                      unsigned char *lhs, _Quad rhs);
+void __kmpc_atomic_fixed1_div_rev_fp(ident_t *id_ref, int gtid, char *lhs,
+                                     _Quad rhs);
+void __kmpc_atomic_fixed1u_div_rev_fp(ident_t *id_ref, int gtid,
+                                      unsigned char *lhs, _Quad rhs);
+void __kmpc_atomic_fixed2_sub_rev_fp(ident_t *id_ref, int gtid, short *lhs,
+                                     _Quad rhs);
+void __kmpc_atomic_fixed2u_sub_rev_fp(ident_t *id_ref, int gtid,
+                                      unsigned short *lhs, _Quad rhs);
+void __kmpc_atomic_fixed2_div_rev_fp(ident_t *id_ref, int gtid, short *lhs,
+                                     _Quad rhs);
+void __kmpc_atomic_fixed2u_div_rev_fp(ident_t *id_ref, int gtid,
+                                      unsigned short *lhs, _Quad rhs);
+void __kmpc_atomic_fixed4_sub_rev_fp(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                                     _Quad rhs);
+void __kmpc_atomic_fixed4u_sub_rev_fp(ident_t *id_ref, int gtid,
+                                      kmp_uint32 *lhs, _Quad rhs);
+void __kmpc_atomic_fixed4_div_rev_fp(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                                     _Quad rhs);
+void __kmpc_atomic_fixed4u_div_rev_fp(ident_t *id_ref, int gtid,
+                                      kmp_uint32 *lhs, _Quad rhs);
+void __kmpc_atomic_fixed8_sub_rev_fp(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                                     _Quad rhs);
+void __kmpc_atomic_fixed8u_sub_rev_fp(ident_t *id_ref, int gtid,
+                                      kmp_uint64 *lhs, _Quad rhs);
+void __kmpc_atomic_fixed8_div_rev_fp(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                                     _Quad rhs);
+void __kmpc_atomic_fixed8u_div_rev_fp(ident_t *id_ref, int gtid,
+                                      kmp_uint64 *lhs, _Quad rhs);
+void __kmpc_atomic_float4_sub_rev_fp(ident_t *id_ref, int gtid, float *lhs,
+                                     _Quad rhs);
+void __kmpc_atomic_float4_div_rev_fp(ident_t *id_ref, int gtid, float *lhs,
+                                     _Quad rhs);
+void __kmpc_atomic_float8_sub_rev_fp(ident_t *id_ref, int gtid, double *lhs,
+                                     _Quad rhs);
+void __kmpc_atomic_float8_div_rev_fp(ident_t *id_ref, int gtid, double *lhs,
+                                     _Quad rhs);
+void __kmpc_atomic_float10_sub_rev_fp(ident_t *id_ref, int gtid,
+                                      long double *lhs, _Quad rhs);
+void __kmpc_atomic_float10_div_rev_fp(ident_t *id_ref, int gtid,
+                                      long double *lhs, _Quad rhs);
+
+#endif // KMP_HAVE_QUAD
+
+// RHS=cmplx8: kmp_cmplx32 LHS combined with a kmp_cmplx64 RHS
+void __kmpc_atomic_cmplx4_add_cmplx8(ident_t *id_ref, int gtid,
+                                     kmp_cmplx32 *lhs, kmp_cmplx64 rhs);
+void __kmpc_atomic_cmplx4_sub_cmplx8(ident_t *id_ref, int gtid,
+                                     kmp_cmplx32 *lhs, kmp_cmplx64 rhs);
+void __kmpc_atomic_cmplx4_mul_cmplx8(ident_t *id_ref, int gtid,
+                                     kmp_cmplx32 *lhs, kmp_cmplx64 rhs);
+void __kmpc_atomic_cmplx4_div_cmplx8(ident_t *id_ref, int gtid,
+                                     kmp_cmplx32 *lhs, kmp_cmplx64 rhs);
+
+// Generic atomic routines: untyped lhs/rhs pointers plus a callback f
+void __kmpc_atomic_1(ident_t *id_ref, int gtid, void *lhs, void *rhs,
+                     void (*f)(void *, void *, void *));
+void __kmpc_atomic_2(ident_t *id_ref, int gtid, void *lhs, void *rhs,
+                     void (*f)(void *, void *, void *));
+void __kmpc_atomic_4(ident_t *id_ref, int gtid, void *lhs, void *rhs,
+                     void (*f)(void *, void *, void *));
+void __kmpc_atomic_8(ident_t *id_ref, int gtid, void *lhs, void *rhs,
+                     void (*f)(void *, void *, void *));
+void __kmpc_atomic_10(ident_t *id_ref, int gtid, void *lhs, void *rhs,
+                      void (*f)(void *, void *, void *));
+void __kmpc_atomic_16(ident_t *id_ref, int gtid, void *lhs, void *rhs,
+                      void (*f)(void *, void *, void *));
+void __kmpc_atomic_20(ident_t *id_ref, int gtid, void *lhs, void *rhs,
+                      void (*f)(void *, void *, void *));
+void __kmpc_atomic_32(ident_t *id_ref, int gtid, void *lhs, void *rhs,
+                      void (*f)(void *, void *, void *));
+
+// READ, WRITE, CAPTURE are supported only on IA-32 architecture and Intel(R) 64
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+// Routines for atomic READ are listed below
+char __kmpc_atomic_fixed1_rd(ident_t *id_ref, int gtid, char *loc);
+short __kmpc_atomic_fixed2_rd(ident_t *id_ref, int gtid, short *loc);
+kmp_int32 __kmpc_atomic_fixed4_rd(ident_t *id_ref, int gtid, kmp_int32 *loc);
+kmp_int64 __kmpc_atomic_fixed8_rd(ident_t *id_ref, int gtid, kmp_int64 *loc);
+kmp_real32 __kmpc_atomic_float4_rd(ident_t *id_ref, int gtid, kmp_real32 *loc);
+kmp_real64 __kmpc_atomic_float8_rd(ident_t *id_ref, int gtid, kmp_real64 *loc);
+long double __kmpc_atomic_float10_rd(ident_t *id_ref, int gtid,
+                                     long double *loc);
+#if KMP_HAVE_QUAD
+QUAD_LEGACY __kmpc_atomic_float16_rd(ident_t *id_ref, int gtid,
+                                     QUAD_LEGACY *loc);
+#endif // KMP_HAVE_QUAD
+// Fix for CQ220361: on Windows* OS the cmplx4 READ returns void and the value
+// read is passed back through the additional leading parameter (out)
+#if (KMP_OS_WINDOWS)
+void __kmpc_atomic_cmplx4_rd(kmp_cmplx32 *out, ident_t *id_ref, int gtid,
+                             kmp_cmplx32 *loc);
+#else // !KMP_OS_WINDOWS
+kmp_cmplx32 __kmpc_atomic_cmplx4_rd(ident_t *id_ref, int gtid,
+                                    kmp_cmplx32 *loc);
+#endif // KMP_OS_WINDOWS
+kmp_cmplx64 __kmpc_atomic_cmplx8_rd(ident_t *id_ref, int gtid,
+                                    kmp_cmplx64 *loc);
+kmp_cmplx80 __kmpc_atomic_cmplx10_rd(ident_t *id_ref, int gtid,
+                                     kmp_cmplx80 *loc);
+#if KMP_HAVE_QUAD
+CPLX128_LEG __kmpc_atomic_cmplx16_rd(ident_t *id_ref, int gtid,
+                                     CPLX128_LEG *loc);
+#if (KMP_ARCH_X86)
+// Routines whose 16-byte arguments are aligned to a 16-byte boundary (*_a16_t)
+Quad_a16_t __kmpc_atomic_float16_a16_rd(ident_t *id_ref, int gtid,
+                                        Quad_a16_t *loc);
+kmp_cmplx128_a16_t __kmpc_atomic_cmplx16_a16_rd(ident_t *id_ref, int gtid,
+                                                kmp_cmplx128_a16_t *loc);
+#endif // KMP_ARCH_X86
+#endif // KMP_HAVE_QUAD
+
+// Routines for atomic WRITE are listed below
+void __kmpc_atomic_fixed1_wr(ident_t *id_ref, int gtid, char *lhs, char rhs);
+void __kmpc_atomic_fixed2_wr(ident_t *id_ref, int gtid, short *lhs, short rhs);
+void __kmpc_atomic_fixed4_wr(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                             kmp_int32 rhs);
+void __kmpc_atomic_fixed8_wr(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                             kmp_int64 rhs);
+void __kmpc_atomic_float4_wr(ident_t *id_ref, int gtid, kmp_real32 *lhs,
+                             kmp_real32 rhs);
+void __kmpc_atomic_float8_wr(ident_t *id_ref, int gtid, kmp_real64 *lhs,
+                             kmp_real64 rhs);
+void __kmpc_atomic_float10_wr(ident_t *id_ref, int gtid, long double *lhs,
+                              long double rhs);
+#if KMP_HAVE_QUAD
+void __kmpc_atomic_float16_wr(ident_t *id_ref, int gtid, QUAD_LEGACY *lhs,
+                              QUAD_LEGACY rhs);
+#endif // KMP_HAVE_QUAD
+void __kmpc_atomic_cmplx4_wr(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs,
+                             kmp_cmplx32 rhs);
+void __kmpc_atomic_cmplx8_wr(ident_t *id_ref, int gtid, kmp_cmplx64 *lhs,
+                             kmp_cmplx64 rhs);
+void __kmpc_atomic_cmplx10_wr(ident_t *id_ref, int gtid, kmp_cmplx80 *lhs,
+                              kmp_cmplx80 rhs);
+#if KMP_HAVE_QUAD
+void __kmpc_atomic_cmplx16_wr(ident_t *id_ref, int gtid, CPLX128_LEG *lhs,
+                              CPLX128_LEG rhs);
+#if (KMP_ARCH_X86)
+// Routines whose 16-byte arguments are aligned to a 16-byte boundary (*_a16_t)
+void __kmpc_atomic_float16_a16_wr(ident_t *id_ref, int gtid, Quad_a16_t *lhs,
+                                  Quad_a16_t rhs);
+void __kmpc_atomic_cmplx16_a16_wr(ident_t *id_ref, int gtid,
+                                  kmp_cmplx128_a16_t *lhs,
+                                  kmp_cmplx128_a16_t rhs);
+#endif // KMP_ARCH_X86
+#endif // KMP_HAVE_QUAD
+
+// Atomic CAPTURE routines (flag presumably selects old/new value; confirm)
+
+// 1-byte (char / unsigned char)
+char __kmpc_atomic_fixed1_add_cpt(ident_t *id_ref, int gtid, char *lhs,
+                                  char rhs, int flag);
+char __kmpc_atomic_fixed1_andb_cpt(ident_t *id_ref, int gtid, char *lhs,
+                                   char rhs, int flag);
+char __kmpc_atomic_fixed1_div_cpt(ident_t *id_ref, int gtid, char *lhs,
+                                  char rhs, int flag);
+unsigned char __kmpc_atomic_fixed1u_div_cpt(ident_t *id_ref, int gtid,
+                                            unsigned char *lhs,
+                                            unsigned char rhs, int flag);
+char __kmpc_atomic_fixed1_mul_cpt(ident_t *id_ref, int gtid, char *lhs,
+                                  char rhs, int flag);
+char __kmpc_atomic_fixed1_orb_cpt(ident_t *id_ref, int gtid, char *lhs,
+                                  char rhs, int flag);
+char __kmpc_atomic_fixed1_shl_cpt(ident_t *id_ref, int gtid, char *lhs,
+                                  char rhs, int flag);
+char __kmpc_atomic_fixed1_shr_cpt(ident_t *id_ref, int gtid, char *lhs,
+                                  char rhs, int flag);
+unsigned char __kmpc_atomic_fixed1u_shr_cpt(ident_t *id_ref, int gtid,
+                                            unsigned char *lhs,
+                                            unsigned char rhs, int flag);
+char __kmpc_atomic_fixed1_sub_cpt(ident_t *id_ref, int gtid, char *lhs,
+                                  char rhs, int flag);
+char __kmpc_atomic_fixed1_xor_cpt(ident_t *id_ref, int gtid, char *lhs,
+                                  char rhs, int flag);
+// 2-byte (short / unsigned short)
+short __kmpc_atomic_fixed2_add_cpt(ident_t *id_ref, int gtid, short *lhs,
+                                   short rhs, int flag);
+short __kmpc_atomic_fixed2_andb_cpt(ident_t *id_ref, int gtid, short *lhs,
+                                    short rhs, int flag);
+short __kmpc_atomic_fixed2_div_cpt(ident_t *id_ref, int gtid, short *lhs,
+                                   short rhs, int flag);
+unsigned short __kmpc_atomic_fixed2u_div_cpt(ident_t *id_ref, int gtid,
+                                             unsigned short *lhs,
+                                             unsigned short rhs, int flag);
+short __kmpc_atomic_fixed2_mul_cpt(ident_t *id_ref, int gtid, short *lhs,
+                                   short rhs, int flag);
+short __kmpc_atomic_fixed2_orb_cpt(ident_t *id_ref, int gtid, short *lhs,
+                                   short rhs, int flag);
+short __kmpc_atomic_fixed2_shl_cpt(ident_t *id_ref, int gtid, short *lhs,
+                                   short rhs, int flag);
+short __kmpc_atomic_fixed2_shr_cpt(ident_t *id_ref, int gtid, short *lhs,
+                                   short rhs, int flag);
+unsigned short __kmpc_atomic_fixed2u_shr_cpt(ident_t *id_ref, int gtid,
+                                             unsigned short *lhs,
+                                             unsigned short rhs, int flag);
+short __kmpc_atomic_fixed2_sub_cpt(ident_t *id_ref, int gtid, short *lhs,
+                                   short rhs, int flag);
+short __kmpc_atomic_fixed2_xor_cpt(ident_t *id_ref, int gtid, short *lhs,
+                                   short rhs, int flag);
+// 4-byte add / sub fixed (kmp_int32)
+kmp_int32 __kmpc_atomic_fixed4_add_cpt(ident_t *id_ref, int gtid,
+                                       kmp_int32 *lhs, kmp_int32 rhs, int flag);
+kmp_int32 __kmpc_atomic_fixed4_sub_cpt(ident_t *id_ref, int gtid,
+                                       kmp_int32 *lhs, kmp_int32 rhs, int flag);
+// 4-byte add / sub float (kmp_real32)
+kmp_real32 __kmpc_atomic_float4_add_cpt(ident_t *id_ref, int gtid,
+                                        kmp_real32 *lhs, kmp_real32 rhs,
+                                        int flag);
+kmp_real32 __kmpc_atomic_float4_sub_cpt(ident_t *id_ref, int gtid,
+                                        kmp_real32 *lhs, kmp_real32 rhs,
+                                        int flag);
+// 8-byte add / sub fixed (kmp_int64)
+kmp_int64 __kmpc_atomic_fixed8_add_cpt(ident_t *id_ref, int gtid,
+                                       kmp_int64 *lhs, kmp_int64 rhs, int flag);
+kmp_int64 __kmpc_atomic_fixed8_sub_cpt(ident_t *id_ref, int gtid,
+                                       kmp_int64 *lhs, kmp_int64 rhs, int flag);
+// 8-byte add / sub float (kmp_real64)
+kmp_real64 __kmpc_atomic_float8_add_cpt(ident_t *id_ref, int gtid,
+                                        kmp_real64 *lhs, kmp_real64 rhs,
+                                        int flag);
+kmp_real64 __kmpc_atomic_float8_sub_cpt(ident_t *id_ref, int gtid,
+                                        kmp_real64 *lhs, kmp_real64 rhs,
+                                        int flag);
+// 4-byte fixed (kmp_int32 / kmp_uint32): andb, orb, xor, mul, div, shl, shr
+kmp_int32 __kmpc_atomic_fixed4_andb_cpt(ident_t *id_ref, int gtid,
+                                        kmp_int32 *lhs, kmp_int32 rhs,
+                                        int flag);
+kmp_int32 __kmpc_atomic_fixed4_div_cpt(ident_t *id_ref, int gtid,
+                                       kmp_int32 *lhs, kmp_int32 rhs, int flag);
+kmp_uint32 __kmpc_atomic_fixed4u_div_cpt(ident_t *id_ref, int gtid,
+                                         kmp_uint32 *lhs, kmp_uint32 rhs,
+                                         int flag);
+kmp_int32 __kmpc_atomic_fixed4_mul_cpt(ident_t *id_ref, int gtid,
+                                       kmp_int32 *lhs, kmp_int32 rhs, int flag);
+kmp_int32 __kmpc_atomic_fixed4_orb_cpt(ident_t *id_ref, int gtid,
+                                       kmp_int32 *lhs, kmp_int32 rhs, int flag);
+kmp_int32 __kmpc_atomic_fixed4_shl_cpt(ident_t *id_ref, int gtid,
+                                       kmp_int32 *lhs, kmp_int32 rhs, int flag);
+kmp_int32 __kmpc_atomic_fixed4_shr_cpt(ident_t *id_ref, int gtid,
+                                       kmp_int32 *lhs, kmp_int32 rhs, int flag);
+kmp_uint32 __kmpc_atomic_fixed4u_shr_cpt(ident_t *id_ref, int gtid,
+                                         kmp_uint32 *lhs, kmp_uint32 rhs,
+                                         int flag);
+kmp_int32 __kmpc_atomic_fixed4_xor_cpt(ident_t *id_ref, int gtid,
+                                       kmp_int32 *lhs, kmp_int32 rhs, int flag);
+// 8-byte fixed (kmp_int64 / kmp_uint64): andb, orb, xor, mul, div, shl, shr
+kmp_int64 __kmpc_atomic_fixed8_andb_cpt(ident_t *id_ref, int gtid,
+                                        kmp_int64 *lhs, kmp_int64 rhs,
+                                        int flag);
+kmp_int64 __kmpc_atomic_fixed8_div_cpt(ident_t *id_ref, int gtid,
+                                       kmp_int64 *lhs, kmp_int64 rhs, int flag);
+kmp_uint64 __kmpc_atomic_fixed8u_div_cpt(ident_t *id_ref, int gtid,
+                                         kmp_uint64 *lhs, kmp_uint64 rhs,
+                                         int flag);
+kmp_int64 __kmpc_atomic_fixed8_mul_cpt(ident_t *id_ref, int gtid,
+                                       kmp_int64 *lhs, kmp_int64 rhs, int flag);
+kmp_int64 __kmpc_atomic_fixed8_orb_cpt(ident_t *id_ref, int gtid,
+                                       kmp_int64 *lhs, kmp_int64 rhs, int flag);
+kmp_int64 __kmpc_atomic_fixed8_shl_cpt(ident_t *id_ref, int gtid,
+                                       kmp_int64 *lhs, kmp_int64 rhs, int flag);
+kmp_int64 __kmpc_atomic_fixed8_shr_cpt(ident_t *id_ref, int gtid,
+                                       kmp_int64 *lhs, kmp_int64 rhs, int flag);
+kmp_uint64 __kmpc_atomic_fixed8u_shr_cpt(ident_t *id_ref, int gtid,
+                                         kmp_uint64 *lhs, kmp_uint64 rhs,
+                                         int flag);
+kmp_int64 __kmpc_atomic_fixed8_xor_cpt(ident_t *id_ref, int gtid,
+                                       kmp_int64 *lhs, kmp_int64 rhs, int flag);
+// 4-byte float (kmp_real32): div, mul
+kmp_real32 __kmpc_atomic_float4_div_cpt(ident_t *id_ref, int gtid,
+                                        kmp_real32 *lhs, kmp_real32 rhs,
+                                        int flag);
+kmp_real32 __kmpc_atomic_float4_mul_cpt(ident_t *id_ref, int gtid,
+                                        kmp_real32 *lhs, kmp_real32 rhs,
+                                        int flag);
+// 8-byte float (kmp_real64): div, mul
+kmp_real64 __kmpc_atomic_float8_div_cpt(ident_t *id_ref, int gtid,
+                                        kmp_real64 *lhs, kmp_real64 rhs,
+                                        int flag);
+kmp_real64 __kmpc_atomic_float8_mul_cpt(ident_t *id_ref, int gtid,
+                                        kmp_real64 *lhs, kmp_real64 rhs,
+                                        int flag);
+// 1-, 2-, 4-, 8-byte logical: andl (&&) and orl (||)
+char __kmpc_atomic_fixed1_andl_cpt(ident_t *id_ref, int gtid, char *lhs,
+                                   char rhs, int flag);
+char __kmpc_atomic_fixed1_orl_cpt(ident_t *id_ref, int gtid, char *lhs,
+                                  char rhs, int flag);
+short __kmpc_atomic_fixed2_andl_cpt(ident_t *id_ref, int gtid, short *lhs,
+                                    short rhs, int flag);
+short __kmpc_atomic_fixed2_orl_cpt(ident_t *id_ref, int gtid, short *lhs,
+                                   short rhs, int flag);
+kmp_int32 __kmpc_atomic_fixed4_andl_cpt(ident_t *id_ref, int gtid,
+                                        kmp_int32 *lhs, kmp_int32 rhs,
+                                        int flag);
+kmp_int32 __kmpc_atomic_fixed4_orl_cpt(ident_t *id_ref, int gtid,
+                                       kmp_int32 *lhs, kmp_int32 rhs, int flag);
+kmp_int64 __kmpc_atomic_fixed8_andl_cpt(ident_t *id_ref, int gtid,
+                                        kmp_int64 *lhs, kmp_int64 rhs,
+                                        int flag);
+kmp_int64 __kmpc_atomic_fixed8_orl_cpt(ident_t *id_ref, int gtid,
+                                       kmp_int64 *lhs, kmp_int64 rhs, int flag);
+// MIN / MAX for fixed1/2/4/8, float4/8 and (with KMP_HAVE_QUAD) float16
+char __kmpc_atomic_fixed1_max_cpt(ident_t *id_ref, int gtid, char *lhs,
+                                  char rhs, int flag);
+char __kmpc_atomic_fixed1_min_cpt(ident_t *id_ref, int gtid, char *lhs,
+                                  char rhs, int flag);
+short __kmpc_atomic_fixed2_max_cpt(ident_t *id_ref, int gtid, short *lhs,
+                                   short rhs, int flag);
+short __kmpc_atomic_fixed2_min_cpt(ident_t *id_ref, int gtid, short *lhs,
+                                   short rhs, int flag);
+kmp_int32 __kmpc_atomic_fixed4_max_cpt(ident_t *id_ref, int gtid,
+                                       kmp_int32 *lhs, kmp_int32 rhs, int flag);
+kmp_int32 __kmpc_atomic_fixed4_min_cpt(ident_t *id_ref, int gtid,
+                                       kmp_int32 *lhs, kmp_int32 rhs, int flag);
+kmp_int64 __kmpc_atomic_fixed8_max_cpt(ident_t *id_ref, int gtid,
+                                       kmp_int64 *lhs, kmp_int64 rhs, int flag);
+kmp_int64 __kmpc_atomic_fixed8_min_cpt(ident_t *id_ref, int gtid,
+                                       kmp_int64 *lhs, kmp_int64 rhs, int flag);
+kmp_real32 __kmpc_atomic_float4_max_cpt(ident_t *id_ref, int gtid,
+                                        kmp_real32 *lhs, kmp_real32 rhs,
+                                        int flag);
+kmp_real32 __kmpc_atomic_float4_min_cpt(ident_t *id_ref, int gtid,
+                                        kmp_real32 *lhs, kmp_real32 rhs,
+                                        int flag);
+kmp_real64 __kmpc_atomic_float8_max_cpt(ident_t *id_ref, int gtid,
+                                        kmp_real64 *lhs, kmp_real64 rhs,
+                                        int flag);
+kmp_real64 __kmpc_atomic_float8_min_cpt(ident_t *id_ref, int gtid,
+                                        kmp_real64 *lhs, kmp_real64 rhs,
+                                        int flag);
+#if KMP_HAVE_QUAD
+QUAD_LEGACY __kmpc_atomic_float16_max_cpt(ident_t *id_ref, int gtid,
+                                          QUAD_LEGACY *lhs, QUAD_LEGACY rhs,
+                                          int flag);
+QUAD_LEGACY __kmpc_atomic_float16_min_cpt(ident_t *id_ref, int gtid,
+                                          QUAD_LEGACY *lhs, QUAD_LEGACY rhs,
+                                          int flag);
+#endif // KMP_HAVE_QUAD
+// Fortran .NEQV. (same as xor) for fixed1/2/4/8
+char __kmpc_atomic_fixed1_neqv_cpt(ident_t *id_ref, int gtid, char *lhs,
+                                   char rhs, int flag);
+short __kmpc_atomic_fixed2_neqv_cpt(ident_t *id_ref, int gtid, short *lhs,
+                                    short rhs, int flag);
+kmp_int32 __kmpc_atomic_fixed4_neqv_cpt(ident_t *id_ref, int gtid,
+                                        kmp_int32 *lhs, kmp_int32 rhs,
+                                        int flag);
+kmp_int64 __kmpc_atomic_fixed8_neqv_cpt(ident_t *id_ref, int gtid,
+                                        kmp_int64 *lhs, kmp_int64 rhs,
+                                        int flag);
+// Fortran .EQV. (same as ~xor) for fixed1/2/4/8
+char __kmpc_atomic_fixed1_eqv_cpt(ident_t *id_ref, int gtid, char *lhs,
+                                  char rhs, int flag);
+short __kmpc_atomic_fixed2_eqv_cpt(ident_t *id_ref, int gtid, short *lhs,
+                                   short rhs, int flag);
+kmp_int32 __kmpc_atomic_fixed4_eqv_cpt(ident_t *id_ref, int gtid,
+                                       kmp_int32 *lhs, kmp_int32 rhs, int flag);
+kmp_int64 __kmpc_atomic_fixed8_eqv_cpt(ident_t *id_ref, int gtid,
+                                       kmp_int64 *lhs, kmp_int64 rhs, int flag);
+// long double type
+long double __kmpc_atomic_float10_add_cpt(ident_t *id_ref, int gtid,
+                                          long double *lhs, long double rhs,
+                                          int flag);
+long double __kmpc_atomic_float10_sub_cpt(ident_t *id_ref, int gtid,
+                                          long double *lhs, long double rhs,
+                                          int flag);
+long double __kmpc_atomic_float10_mul_cpt(ident_t *id_ref, int gtid,
+                                          long double *lhs, long double rhs,
+                                          int flag);
+long double __kmpc_atomic_float10_div_cpt(ident_t *id_ref, int gtid,
+                                          long double *lhs, long double rhs,
+                                          int flag);
+#if KMP_HAVE_QUAD
+// _Quad type
+QUAD_LEGACY __kmpc_atomic_float16_add_cpt(ident_t *id_ref, int gtid,
+                                          QUAD_LEGACY *lhs, QUAD_LEGACY rhs,
+                                          int flag);
+QUAD_LEGACY __kmpc_atomic_float16_sub_cpt(ident_t *id_ref, int gtid,
+                                          QUAD_LEGACY *lhs, QUAD_LEGACY rhs,
+                                          int flag);
+QUAD_LEGACY __kmpc_atomic_float16_mul_cpt(ident_t *id_ref, int gtid,
+                                          QUAD_LEGACY *lhs, QUAD_LEGACY rhs,
+                                          int flag);
+QUAD_LEGACY __kmpc_atomic_float16_div_cpt(ident_t *id_ref, int gtid,
+                                          QUAD_LEGACY *lhs, QUAD_LEGACY rhs,
+                                          int flag);
+#endif // KMP_HAVE_QUAD
+// routines for complex types
+// Workaround for cmplx4 routines - return void; captured value is returned via
+// the argument
+void __kmpc_atomic_cmplx4_add_cpt(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs,
+                                  kmp_cmplx32 rhs, kmp_cmplx32 *out, int flag);
+void __kmpc_atomic_cmplx4_sub_cpt(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs,
+                                  kmp_cmplx32 rhs, kmp_cmplx32 *out, int flag);
+void __kmpc_atomic_cmplx4_mul_cpt(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs,
+                                  kmp_cmplx32 rhs, kmp_cmplx32 *out, int flag);
+void __kmpc_atomic_cmplx4_div_cpt(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs,
+                                  kmp_cmplx32 rhs, kmp_cmplx32 *out, int flag);
+
+kmp_cmplx64 __kmpc_atomic_cmplx8_add_cpt(ident_t *id_ref, int gtid,
+                                         kmp_cmplx64 *lhs, kmp_cmplx64 rhs,
+                                         int flag);
+kmp_cmplx64 __kmpc_atomic_cmplx8_sub_cpt(ident_t *id_ref, int gtid,
+                                         kmp_cmplx64 *lhs, kmp_cmplx64 rhs,
+                                         int flag);
+kmp_cmplx64 __kmpc_atomic_cmplx8_mul_cpt(ident_t *id_ref, int gtid,
+                                         kmp_cmplx64 *lhs, kmp_cmplx64 rhs,
+                                         int flag);
+kmp_cmplx64 __kmpc_atomic_cmplx8_div_cpt(ident_t *id_ref, int gtid,
+                                         kmp_cmplx64 *lhs, kmp_cmplx64 rhs,
+                                         int flag);
+kmp_cmplx80 __kmpc_atomic_cmplx10_add_cpt(ident_t *id_ref, int gtid,
+                                          kmp_cmplx80 *lhs, kmp_cmplx80 rhs,
+                                          int flag);
+kmp_cmplx80 __kmpc_atomic_cmplx10_sub_cpt(ident_t *id_ref, int gtid,
+                                          kmp_cmplx80 *lhs, kmp_cmplx80 rhs,
+                                          int flag);
+kmp_cmplx80 __kmpc_atomic_cmplx10_mul_cpt(ident_t *id_ref, int gtid,
+                                          kmp_cmplx80 *lhs, kmp_cmplx80 rhs,
+                                          int flag);
+kmp_cmplx80 __kmpc_atomic_cmplx10_div_cpt(ident_t *id_ref, int gtid,
+                                          kmp_cmplx80 *lhs, kmp_cmplx80 rhs,
+                                          int flag);
+#if KMP_HAVE_QUAD
+CPLX128_LEG __kmpc_atomic_cmplx16_add_cpt(ident_t *id_ref, int gtid,
+                                          CPLX128_LEG *lhs, CPLX128_LEG rhs,
+                                          int flag);
+CPLX128_LEG __kmpc_atomic_cmplx16_sub_cpt(ident_t *id_ref, int gtid,
+                                          CPLX128_LEG *lhs, CPLX128_LEG rhs,
+                                          int flag);
+CPLX128_LEG __kmpc_atomic_cmplx16_mul_cpt(ident_t *id_ref, int gtid,
+                                          CPLX128_LEG *lhs, CPLX128_LEG rhs,
+                                          int flag);
+CPLX128_LEG __kmpc_atomic_cmplx16_div_cpt(ident_t *id_ref, int gtid,
+                                          CPLX128_LEG *lhs, CPLX128_LEG rhs,
+                                          int flag);
+#if (KMP_ARCH_X86)
+// Routines with 16-byte arguments aligned to 16-byte boundary
+Quad_a16_t __kmpc_atomic_float16_add_a16_cpt(ident_t *id_ref, int gtid,
+                                             Quad_a16_t *lhs, Quad_a16_t rhs,
+                                             int flag);
+Quad_a16_t __kmpc_atomic_float16_sub_a16_cpt(ident_t *id_ref, int gtid,
+                                             Quad_a16_t *lhs, Quad_a16_t rhs,
+                                             int flag);
+Quad_a16_t __kmpc_atomic_float16_mul_a16_cpt(ident_t *id_ref, int gtid,
+                                             Quad_a16_t *lhs, Quad_a16_t rhs,
+                                             int flag);
+Quad_a16_t __kmpc_atomic_float16_div_a16_cpt(ident_t *id_ref, int gtid,
+                                             Quad_a16_t *lhs, Quad_a16_t rhs,
+                                             int flag);
+Quad_a16_t __kmpc_atomic_float16_max_a16_cpt(ident_t *id_ref, int gtid,
+                                             Quad_a16_t *lhs, Quad_a16_t rhs,
+                                             int flag);
+Quad_a16_t __kmpc_atomic_float16_min_a16_cpt(ident_t *id_ref, int gtid,
+                                             Quad_a16_t *lhs, Quad_a16_t rhs,
+                                             int flag);
+kmp_cmplx128_a16_t __kmpc_atomic_cmplx16_add_a16_cpt(ident_t *id_ref, int gtid,
+                                                     kmp_cmplx128_a16_t *lhs,
+                                                     kmp_cmplx128_a16_t rhs,
+                                                     int flag);
+kmp_cmplx128_a16_t __kmpc_atomic_cmplx16_sub_a16_cpt(ident_t *id_ref, int gtid,
+                                                     kmp_cmplx128_a16_t *lhs,
+                                                     kmp_cmplx128_a16_t rhs,
+                                                     int flag);
+kmp_cmplx128_a16_t __kmpc_atomic_cmplx16_mul_a16_cpt(ident_t *id_ref, int gtid,
+                                                     kmp_cmplx128_a16_t *lhs,
+                                                     kmp_cmplx128_a16_t rhs,
+                                                     int flag);
+kmp_cmplx128_a16_t __kmpc_atomic_cmplx16_div_a16_cpt(ident_t *id_ref, int gtid,
+                                                     kmp_cmplx128_a16_t *lhs,
+                                                     kmp_cmplx128_a16_t rhs,
+                                                     int flag);
+#endif // KMP_ARCH_X86
+#endif // KMP_HAVE_QUAD
+
+void __kmpc_atomic_start(void);
+void __kmpc_atomic_end(void);
+
+// OpenMP 4.0 reverse capture forms for non-commutative operations:
+// v = x = expr binop x; {v = x; x = expr binop x;} {x = expr binop x; v = x;}
+
+char __kmpc_atomic_fixed1_sub_cpt_rev(ident_t *id_ref, int gtid, char *lhs,
+                                      char rhs, int flag);
+char __kmpc_atomic_fixed1_div_cpt_rev(ident_t *id_ref, int gtid, char *lhs,
+                                      char rhs, int flag);
+unsigned char __kmpc_atomic_fixed1u_div_cpt_rev(ident_t *id_ref, int gtid,
+                                                unsigned char *lhs,
+                                                unsigned char rhs, int flag);
+char __kmpc_atomic_fixed1_shl_cpt_rev(ident_t *id_ref, int gtid, char *lhs,
+                                      char rhs, int flag);
+char __kmpc_atomic_fixed1_shr_cpt_rev(ident_t *id_ref, int gtid, char *lhs,
+                                      char rhs, int flag);
+unsigned char __kmpc_atomic_fixed1u_shr_cpt_rev(ident_t *id_ref, int gtid,
+                                                unsigned char *lhs,
+                                                unsigned char rhs, int flag);
+short __kmpc_atomic_fixed2_sub_cpt_rev(ident_t *id_ref, int gtid, short *lhs,
+                                       short rhs, int flag);
+short __kmpc_atomic_fixed2_div_cpt_rev(ident_t *id_ref, int gtid, short *lhs,
+                                       short rhs, int flag);
+unsigned short __kmpc_atomic_fixed2u_div_cpt_rev(ident_t *id_ref, int gtid,
+                                                 unsigned short *lhs,
+                                                 unsigned short rhs, int flag);
+short __kmpc_atomic_fixed2_shl_cpt_rev(ident_t *id_ref, int gtid, short *lhs,
+                                       short rhs, int flag);
+short __kmpc_atomic_fixed2_shr_cpt_rev(ident_t *id_ref, int gtid, short *lhs,
+                                       short rhs, int flag);
+unsigned short __kmpc_atomic_fixed2u_shr_cpt_rev(ident_t *id_ref, int gtid,
+                                                 unsigned short *lhs,
+                                                 unsigned short rhs, int flag);
+kmp_int32 __kmpc_atomic_fixed4_sub_cpt_rev(ident_t *id_ref, int gtid,
+                                           kmp_int32 *lhs, kmp_int32 rhs,
+                                           int flag);
+kmp_int32 __kmpc_atomic_fixed4_div_cpt_rev(ident_t *id_ref, int gtid,
+                                           kmp_int32 *lhs, kmp_int32 rhs,
+                                           int flag);
+kmp_uint32 __kmpc_atomic_fixed4u_div_cpt_rev(ident_t *id_ref, int gtid,
+                                             kmp_uint32 *lhs, kmp_uint32 rhs,
+                                             int flag);
+kmp_int32 __kmpc_atomic_fixed4_shl_cpt_rev(ident_t *id_ref, int gtid,
+                                           kmp_int32 *lhs, kmp_int32 rhs,
+                                           int flag);
+kmp_int32 __kmpc_atomic_fixed4_shr_cpt_rev(ident_t *id_ref, int gtid,
+                                           kmp_int32 *lhs, kmp_int32 rhs,
+                                           int flag);
+kmp_uint32 __kmpc_atomic_fixed4u_shr_cpt_rev(ident_t *id_ref, int gtid,
+                                             kmp_uint32 *lhs, kmp_uint32 rhs,
+                                             int flag);
+kmp_int64 __kmpc_atomic_fixed8_sub_cpt_rev(ident_t *id_ref, int gtid,
+                                           kmp_int64 *lhs, kmp_int64 rhs,
+                                           int flag);
+kmp_int64 __kmpc_atomic_fixed8_div_cpt_rev(ident_t *id_ref, int gtid,
+                                           kmp_int64 *lhs, kmp_int64 rhs,
+                                           int flag);
+kmp_uint64 __kmpc_atomic_fixed8u_div_cpt_rev(ident_t *id_ref, int gtid,
+                                             kmp_uint64 *lhs, kmp_uint64 rhs,
+                                             int flag);
+kmp_int64 __kmpc_atomic_fixed8_shl_cpt_rev(ident_t *id_ref, int gtid,
+                                           kmp_int64 *lhs, kmp_int64 rhs,
+                                           int flag);
+kmp_int64 __kmpc_atomic_fixed8_shr_cpt_rev(ident_t *id_ref, int gtid,
+                                           kmp_int64 *lhs, kmp_int64 rhs,
+                                           int flag);
+kmp_uint64 __kmpc_atomic_fixed8u_shr_cpt_rev(ident_t *id_ref, int gtid,
+                                             kmp_uint64 *lhs, kmp_uint64 rhs,
+                                             int flag);
+float __kmpc_atomic_float4_sub_cpt_rev(ident_t *id_ref, int gtid, float *lhs,
+                                       float rhs, int flag);
+float __kmpc_atomic_float4_div_cpt_rev(ident_t *id_ref, int gtid, float *lhs,
+                                       float rhs, int flag);
+double __kmpc_atomic_float8_sub_cpt_rev(ident_t *id_ref, int gtid, double *lhs,
+                                        double rhs, int flag);
+double __kmpc_atomic_float8_div_cpt_rev(ident_t *id_ref, int gtid, double *lhs,
+                                        double rhs, int flag);
+long double __kmpc_atomic_float10_sub_cpt_rev(ident_t *id_ref, int gtid,
+                                              long double *lhs, long double rhs,
+                                              int flag);
+long double __kmpc_atomic_float10_div_cpt_rev(ident_t *id_ref, int gtid,
+                                              long double *lhs, long double rhs,
+                                              int flag);
+#if KMP_HAVE_QUAD
+QUAD_LEGACY __kmpc_atomic_float16_sub_cpt_rev(ident_t *id_ref, int gtid,
+                                              QUAD_LEGACY *lhs, QUAD_LEGACY rhs,
+                                              int flag);
+QUAD_LEGACY __kmpc_atomic_float16_div_cpt_rev(ident_t *id_ref, int gtid,
+                                              QUAD_LEGACY *lhs, QUAD_LEGACY rhs,
+                                              int flag);
+#endif // KMP_HAVE_QUAD
+// Workaround for cmplx4 routines - return void; captured value is returned via
+// the argument
+void __kmpc_atomic_cmplx4_sub_cpt_rev(ident_t *id_ref, int gtid,
+                                      kmp_cmplx32 *lhs, kmp_cmplx32 rhs,
+                                      kmp_cmplx32 *out, int flag);
+void __kmpc_atomic_cmplx4_div_cpt_rev(ident_t *id_ref, int gtid,
+                                      kmp_cmplx32 *lhs, kmp_cmplx32 rhs,
+                                      kmp_cmplx32 *out, int flag);
+kmp_cmplx64 __kmpc_atomic_cmplx8_sub_cpt_rev(ident_t *id_ref, int gtid,
+                                             kmp_cmplx64 *lhs, kmp_cmplx64 rhs,
+                                             int flag);
+kmp_cmplx64 __kmpc_atomic_cmplx8_div_cpt_rev(ident_t *id_ref, int gtid,
+                                             kmp_cmplx64 *lhs, kmp_cmplx64 rhs,
+                                             int flag);
+kmp_cmplx80 __kmpc_atomic_cmplx10_sub_cpt_rev(ident_t *id_ref, int gtid,
+                                              kmp_cmplx80 *lhs, kmp_cmplx80 rhs,
+                                              int flag);
+kmp_cmplx80 __kmpc_atomic_cmplx10_div_cpt_rev(ident_t *id_ref, int gtid,
+                                              kmp_cmplx80 *lhs, kmp_cmplx80 rhs,
+                                              int flag);
+#if KMP_HAVE_QUAD
+CPLX128_LEG __kmpc_atomic_cmplx16_sub_cpt_rev(ident_t *id_ref, int gtid,
+                                              CPLX128_LEG *lhs, CPLX128_LEG rhs,
+                                              int flag);
+CPLX128_LEG __kmpc_atomic_cmplx16_div_cpt_rev(ident_t *id_ref, int gtid,
+                                              CPLX128_LEG *lhs, CPLX128_LEG rhs,
+                                              int flag);
+#if (KMP_ARCH_X86)
+Quad_a16_t __kmpc_atomic_float16_sub_a16_cpt_rev(ident_t *id_ref, int gtid,
+                                                 Quad_a16_t *lhs,
+                                                 Quad_a16_t rhs, int flag);
+Quad_a16_t __kmpc_atomic_float16_div_a16_cpt_rev(ident_t *id_ref, int gtid,
+                                                 Quad_a16_t *lhs,
+                                                 Quad_a16_t rhs, int flag);
+kmp_cmplx128_a16_t
+__kmpc_atomic_cmplx16_sub_a16_cpt_rev(ident_t *id_ref, int gtid,
+                                      kmp_cmplx128_a16_t *lhs,
+                                      kmp_cmplx128_a16_t rhs, int flag);
+kmp_cmplx128_a16_t
+__kmpc_atomic_cmplx16_div_a16_cpt_rev(ident_t *id_ref, int gtid,
+                                      kmp_cmplx128_a16_t *lhs,
+                                      kmp_cmplx128_a16_t rhs, int flag);
+#endif // KMP_ARCH_X86
+#endif // KMP_HAVE_QUAD
+
+// OpenMP 4.0 Capture-write (swap): {v = x; x = expr;}
+char __kmpc_atomic_fixed1_swp(ident_t *id_ref, int gtid, char *lhs, char rhs);
+short __kmpc_atomic_fixed2_swp(ident_t *id_ref, int gtid, short *lhs,
+                               short rhs);
+kmp_int32 __kmpc_atomic_fixed4_swp(ident_t *id_ref, int gtid, kmp_int32 *lhs,
+                                   kmp_int32 rhs);
+kmp_int64 __kmpc_atomic_fixed8_swp(ident_t *id_ref, int gtid, kmp_int64 *lhs,
+                                   kmp_int64 rhs);
+float __kmpc_atomic_float4_swp(ident_t *id_ref, int gtid, float *lhs,
+                               float rhs);
+double __kmpc_atomic_float8_swp(ident_t *id_ref, int gtid, double *lhs,
+                                double rhs);
+long double __kmpc_atomic_float10_swp(ident_t *id_ref, int gtid,
+                                      long double *lhs, long double rhs);
+#if KMP_HAVE_QUAD
+QUAD_LEGACY __kmpc_atomic_float16_swp(ident_t *id_ref, int gtid,
+                                      QUAD_LEGACY *lhs, QUAD_LEGACY rhs);
+#endif // KMP_HAVE_QUAD
+// TODO: check whether the cmplx4 return-via-argument workaround is needed here
+void __kmpc_atomic_cmplx4_swp(ident_t *id_ref, int gtid, kmp_cmplx32 *lhs,
+                              kmp_cmplx32 rhs, kmp_cmplx32 *out);
+// Disabled alternative: kmp_cmplx32 __kmpc_atomic_cmplx4_swp(ident_t *id_ref,
+// int gtid, kmp_cmplx32 *lhs, kmp_cmplx32 rhs);
+
+kmp_cmplx64 __kmpc_atomic_cmplx8_swp(ident_t *id_ref, int gtid,
+                                     kmp_cmplx64 *lhs, kmp_cmplx64 rhs);
+kmp_cmplx80 __kmpc_atomic_cmplx10_swp(ident_t *id_ref, int gtid,
+                                      kmp_cmplx80 *lhs, kmp_cmplx80 rhs);
+#if KMP_HAVE_QUAD
+CPLX128_LEG __kmpc_atomic_cmplx16_swp(ident_t *id_ref, int gtid,
+                                      CPLX128_LEG *lhs, CPLX128_LEG rhs);
+#if (KMP_ARCH_X86)
+Quad_a16_t __kmpc_atomic_float16_a16_swp(ident_t *id_ref, int gtid,
+                                         Quad_a16_t *lhs, Quad_a16_t rhs);
+kmp_cmplx128_a16_t __kmpc_atomic_cmplx16_a16_swp(ident_t *id_ref, int gtid,
+                                                 kmp_cmplx128_a16_t *lhs,
+                                                 kmp_cmplx128_a16_t rhs);
+#endif // KMP_ARCH_X86
+#endif // KMP_HAVE_QUAD
+
+// Capture routines for mixed types (RHS=float16, passed as _Quad)
+#if KMP_HAVE_QUAD
+
+char __kmpc_atomic_fixed1_add_cpt_fp(ident_t *id_ref, int gtid, char *lhs,
+                                     _Quad rhs, int flag);
+char __kmpc_atomic_fixed1_sub_cpt_fp(ident_t *id_ref, int gtid, char *lhs,
+                                     _Quad rhs, int flag);
+char __kmpc_atomic_fixed1_mul_cpt_fp(ident_t *id_ref, int gtid, char *lhs,
+                                     _Quad rhs, int flag);
+char __kmpc_atomic_fixed1_div_cpt_fp(ident_t *id_ref, int gtid, char *lhs,
+                                     _Quad rhs, int flag);
+unsigned char __kmpc_atomic_fixed1u_add_cpt_fp(ident_t *id_ref, int gtid,
+                                               unsigned char *lhs, _Quad rhs,
+                                               int flag);
+unsigned char __kmpc_atomic_fixed1u_sub_cpt_fp(ident_t *id_ref, int gtid,
+                                               unsigned char *lhs, _Quad rhs,
+                                               int flag);
+unsigned char __kmpc_atomic_fixed1u_mul_cpt_fp(ident_t *id_ref, int gtid,
+                                               unsigned char *lhs, _Quad rhs,
+                                               int flag);
+unsigned char __kmpc_atomic_fixed1u_div_cpt_fp(ident_t *id_ref, int gtid,
+                                               unsigned char *lhs, _Quad rhs,
+                                               int flag);
+
+short __kmpc_atomic_fixed2_add_cpt_fp(ident_t *id_ref, int gtid, short *lhs,
+                                      _Quad rhs, int flag);
+short __kmpc_atomic_fixed2_sub_cpt_fp(ident_t *id_ref, int gtid, short *lhs,
+                                      _Quad rhs, int flag);
+short __kmpc_atomic_fixed2_mul_cpt_fp(ident_t *id_ref, int gtid, short *lhs,
+                                      _Quad rhs, int flag);
+short __kmpc_atomic_fixed2_div_cpt_fp(ident_t *id_ref, int gtid, short *lhs,
+                                      _Quad rhs, int flag);
+unsigned short __kmpc_atomic_fixed2u_add_cpt_fp(ident_t *id_ref, int gtid,
+                                                unsigned short *lhs, _Quad rhs,
+                                                int flag);
+unsigned short __kmpc_atomic_fixed2u_sub_cpt_fp(ident_t *id_ref, int gtid,
+                                                unsigned short *lhs, _Quad rhs,
+                                                int flag);
+unsigned short __kmpc_atomic_fixed2u_mul_cpt_fp(ident_t *id_ref, int gtid,
+                                                unsigned short *lhs, _Quad rhs,
+                                                int flag);
+unsigned short __kmpc_atomic_fixed2u_div_cpt_fp(ident_t *id_ref, int gtid,
+                                                unsigned short *lhs, _Quad rhs,
+                                                int flag);
+
+kmp_int32 __kmpc_atomic_fixed4_add_cpt_fp(ident_t *id_ref, int gtid,
+                                          kmp_int32 *lhs, _Quad rhs, int flag);
+kmp_int32 __kmpc_atomic_fixed4_sub_cpt_fp(ident_t *id_ref, int gtid,
+                                          kmp_int32 *lhs, _Quad rhs, int flag);
+kmp_int32 __kmpc_atomic_fixed4_mul_cpt_fp(ident_t *id_ref, int gtid,
+                                          kmp_int32 *lhs, _Quad rhs, int flag);
+kmp_int32 __kmpc_atomic_fixed4_div_cpt_fp(ident_t *id_ref, int gtid,
+                                          kmp_int32 *lhs, _Quad rhs, int flag);
+kmp_uint32 __kmpc_atomic_fixed4u_add_cpt_fp(ident_t *id_ref, int gtid,
+                                            kmp_uint32 *lhs, _Quad rhs,
+                                            int flag);
+kmp_uint32 __kmpc_atomic_fixed4u_sub_cpt_fp(ident_t *id_ref, int gtid,
+                                            kmp_uint32 *lhs, _Quad rhs,
+                                            int flag);
+kmp_uint32 __kmpc_atomic_fixed4u_mul_cpt_fp(ident_t *id_ref, int gtid,
+                                            kmp_uint32 *lhs, _Quad rhs,
+                                            int flag);
+kmp_uint32 __kmpc_atomic_fixed4u_div_cpt_fp(ident_t *id_ref, int gtid,
+                                            kmp_uint32 *lhs, _Quad rhs,
+                                            int flag);
+
+kmp_int64 __kmpc_atomic_fixed8_add_cpt_fp(ident_t *id_ref, int gtid,
+                                          kmp_int64 *lhs, _Quad rhs, int flag);
+kmp_int64 __kmpc_atomic_fixed8_sub_cpt_fp(ident_t *id_ref, int gtid,
+                                          kmp_int64 *lhs, _Quad rhs, int flag);
+kmp_int64 __kmpc_atomic_fixed8_mul_cpt_fp(ident_t *id_ref, int gtid,
+                                          kmp_int64 *lhs, _Quad rhs, int flag);
+kmp_int64 __kmpc_atomic_fixed8_div_cpt_fp(ident_t *id_ref, int gtid,
+                                          kmp_int64 *lhs, _Quad rhs, int flag);
+kmp_uint64 __kmpc_atomic_fixed8u_add_cpt_fp(ident_t *id_ref, int gtid,
+                                            kmp_uint64 *lhs, _Quad rhs,
+                                            int flag);
+kmp_uint64 __kmpc_atomic_fixed8u_sub_cpt_fp(ident_t *id_ref, int gtid,
+                                            kmp_uint64 *lhs, _Quad rhs,
+                                            int flag);
+kmp_uint64 __kmpc_atomic_fixed8u_mul_cpt_fp(ident_t *id_ref, int gtid,
+                                            kmp_uint64 *lhs, _Quad rhs,
+                                            int flag);
+kmp_uint64 __kmpc_atomic_fixed8u_div_cpt_fp(ident_t *id_ref, int gtid,
+                                            kmp_uint64 *lhs, _Quad rhs,
+                                            int flag);
+
+float __kmpc_atomic_float4_add_cpt_fp(ident_t *id_ref, int gtid,
+                                      kmp_real32 *lhs, _Quad rhs, int flag);
+float __kmpc_atomic_float4_sub_cpt_fp(ident_t *id_ref, int gtid,
+                                      kmp_real32 *lhs, _Quad rhs, int flag);
+float __kmpc_atomic_float4_mul_cpt_fp(ident_t *id_ref, int gtid,
+                                      kmp_real32 *lhs, _Quad rhs, int flag);
+float __kmpc_atomic_float4_div_cpt_fp(ident_t *id_ref, int gtid,
+                                      kmp_real32 *lhs, _Quad rhs, int flag);
+
+double __kmpc_atomic_float8_add_cpt_fp(ident_t *id_ref, int gtid,
+                                       kmp_real64 *lhs, _Quad rhs, int flag);
+double __kmpc_atomic_float8_sub_cpt_fp(ident_t *id_ref, int gtid,
+                                       kmp_real64 *lhs, _Quad rhs, int flag);
+double __kmpc_atomic_float8_mul_cpt_fp(ident_t *id_ref, int gtid,
+                                       kmp_real64 *lhs, _Quad rhs, int flag);
+double __kmpc_atomic_float8_div_cpt_fp(ident_t *id_ref, int gtid,
+                                       kmp_real64 *lhs, _Quad rhs, int flag);
+
+long double __kmpc_atomic_float10_add_cpt_fp(ident_t *id_ref, int gtid,
+                                             long double *lhs, _Quad rhs,
+                                             int flag);
+long double __kmpc_atomic_float10_sub_cpt_fp(ident_t *id_ref, int gtid,
+                                             long double *lhs, _Quad rhs,
+                                             int flag);
+long double __kmpc_atomic_float10_mul_cpt_fp(ident_t *id_ref, int gtid,
+                                             long double *lhs, _Quad rhs,
+                                             int flag);
+long double __kmpc_atomic_float10_div_cpt_fp(ident_t *id_ref, int gtid,
+                                             long double *lhs, _Quad rhs,
+                                             int flag);
+
+char __kmpc_atomic_fixed1_sub_cpt_rev_fp(ident_t *id_ref, int gtid, char *lhs,
+                                         _Quad rhs, int flag);
+unsigned char __kmpc_atomic_fixed1u_sub_cpt_rev_fp(ident_t *id_ref, int gtid,
+                                                   unsigned char *lhs,
+                                                   _Quad rhs, int flag);
+char __kmpc_atomic_fixed1_div_cpt_rev_fp(ident_t *id_ref, int gtid, char *lhs,
+                                         _Quad rhs, int flag);
+unsigned char __kmpc_atomic_fixed1u_div_cpt_rev_fp(ident_t *id_ref, int gtid,
+                                                   unsigned char *lhs,
+                                                   _Quad rhs, int flag);
+short __kmpc_atomic_fixed2_sub_cpt_rev_fp(ident_t *id_ref, int gtid, short *lhs,
+                                          _Quad rhs, int flag);
+unsigned short __kmpc_atomic_fixed2u_sub_cpt_rev_fp(ident_t *id_ref, int gtid,
+                                                    unsigned short *lhs,
+                                                    _Quad rhs, int flag);
+short __kmpc_atomic_fixed2_div_cpt_rev_fp(ident_t *id_ref, int gtid, short *lhs,
+                                          _Quad rhs, int flag);
+unsigned short __kmpc_atomic_fixed2u_div_cpt_rev_fp(ident_t *id_ref, int gtid,
+                                                    unsigned short *lhs,
+                                                    _Quad rhs, int flag);
+kmp_int32 __kmpc_atomic_fixed4_sub_cpt_rev_fp(ident_t *id_ref, int gtid,
+                                              kmp_int32 *lhs, _Quad rhs,
+                                              int flag);
+kmp_uint32 __kmpc_atomic_fixed4u_sub_cpt_rev_fp(ident_t *id_ref, int gtid,
+                                                kmp_uint32 *lhs, _Quad rhs,
+                                                int flag);
+kmp_int32 __kmpc_atomic_fixed4_div_cpt_rev_fp(ident_t *id_ref, int gtid,
+                                              kmp_int32 *lhs, _Quad rhs,
+                                              int flag);
+kmp_uint32 __kmpc_atomic_fixed4u_div_cpt_rev_fp(ident_t *id_ref, int gtid,
+                                                kmp_uint32 *lhs, _Quad rhs,
+                                                int flag);
+kmp_int64 __kmpc_atomic_fixed8_sub_cpt_rev_fp(ident_t *id_ref, int gtid,
+                                              kmp_int64 *lhs, _Quad rhs,
+                                              int flag);
+kmp_uint64 __kmpc_atomic_fixed8u_sub_cpt_rev_fp(ident_t *id_ref, int gtid,
+                                                kmp_uint64 *lhs, _Quad rhs,
+                                                int flag);
+kmp_int64 __kmpc_atomic_fixed8_div_cpt_rev_fp(ident_t *id_ref, int gtid,
+                                              kmp_int64 *lhs, _Quad rhs,
+                                              int flag);
+kmp_uint64 __kmpc_atomic_fixed8u_div_cpt_rev_fp(ident_t *id_ref, int gtid,
+                                                kmp_uint64 *lhs, _Quad rhs,
+                                                int flag);
+float __kmpc_atomic_float4_sub_cpt_rev_fp(ident_t *id_ref, int gtid, float *lhs,
+                                          _Quad rhs, int flag);
+float __kmpc_atomic_float4_div_cpt_rev_fp(ident_t *id_ref, int gtid, float *lhs,
+                                          _Quad rhs, int flag);
+double __kmpc_atomic_float8_sub_cpt_rev_fp(ident_t *id_ref, int gtid,
+                                           double *lhs, _Quad rhs, int flag);
+double __kmpc_atomic_float8_div_cpt_rev_fp(ident_t *id_ref, int gtid,
+                                           double *lhs, _Quad rhs, int flag);
+long double __kmpc_atomic_float10_sub_cpt_rev_fp(ident_t *id_ref, int gtid,
+                                                 long double *lhs, _Quad rhs,
+                                                 int flag);
+long double __kmpc_atomic_float10_div_cpt_rev_fp(ident_t *id_ref, int gtid,
+                                                 long double *lhs, _Quad rhs,
+                                                 int flag);
+
+#endif // KMP_HAVE_QUAD
+
+// End of OpenMP 4.0 capture
+
+#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+/* ------------------------------------------------------------------------ */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif /* KMP_ATOMIC_H */
+
+// end of file
diff --git a/final/runtime/src/kmp_barrier.cpp b/final/runtime/src/kmp_barrier.cpp
new file mode 100644
index 0000000..e17986b
--- /dev/null
+++ b/final/runtime/src/kmp_barrier.cpp
@@ -0,0 +1,2161 @@
+/*
+ * kmp_barrier.cpp
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+#include "kmp_wait_release.h"
+#include "kmp_itt.h"
+#include "kmp_os.h"
+#include "kmp_stats.h"
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
+#if KMP_MIC
+#include <immintrin.h>
+#define USE_NGO_STORES 1
+#endif // KMP_MIC
+
+#include "tsan_annotations.h"
+
+#if KMP_MIC && USE_NGO_STORES
+// ICV copying. ngo_load(src) declares a local Vt loaded from src; both ngo_store_* macros write Vt and ignore their own src argument.
+#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
+#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
+#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
+#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
+#else // Fallback: ngo_load/ngo_sync are no-ops; stores go through copy_icvs / KMP_MEMCPY of CACHE_LINE bytes.
+#define ngo_load(src) ((void)0)
+#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
+#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
+#define ngo_sync() ((void)0)
+#endif /* KMP_MIC && USE_NGO_STORES */
+
+void __kmp_print_structure(void); // Forward declaration
+
+// ---------------------------- Barrier Algorithms ----------------------------
+
+// Linear barrier gather: each worker releases its own b_arrived toward thread 0; thread 0 waits on every worker in tid order, applying reduce if given.
+template <bool cancellable = false> // true: master waits via wait_cancellable_nosleep and returns true on cancel
+static bool __kmp_linear_barrier_gather_template(
+    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
+    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
+  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
+  kmp_team_t *team = this_thr->th.th_team;
+  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
+  kmp_info_t **other_threads = team->t.t_threads;
+
+  KA_TRACE(
+      20,
+      ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
+       gtid, team->t.t_id, tid, bt));
+  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
+
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+  // Barrier imbalance - save arrive time to the thread
+  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
+    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
+        __itt_get_timestamp();
+  }
+#endif
+  // We now perform a linear reduction to signal that all of the threads have
+  // arrived.
+  if (!KMP_MASTER_TID(tid)) {
+    KA_TRACE(20,
+             ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
+              " arrived(%p): %llu => %llu\n",
+              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
+              team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
+              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
+    // Mark arrival to master thread
+    /* After performing this write, a worker thread may not assume that the team
+       is valid any more - it could be deallocated by the master thread at any
+       time. NOTE(review): the exit KA_TRACE below still reads team->t.t_id. */
+    ANNOTATE_BARRIER_BEGIN(this_thr);
+    kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
+    flag.release();
+  } else {
+    kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
+    int nproc = this_thr->th.th_team_nproc;
+    int i;
+    // Don't have to worry about sleep bit here or atomic since team setting
+    kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP; // value each worker's b_arrived is waited on for
+
+    // Collect all the worker team member threads.
+    for (i = 1; i < nproc; ++i) {
+#if KMP_CACHE_MANAGE
+      // Prefetch next thread's arrived count
+      if (i + 1 < nproc)
+        KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
+#endif /* KMP_CACHE_MANAGE */
+      KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
+                    "arrived(%p) == %llu\n",
+                    gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
+                    team->t.t_id, i,
+                    &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
+
+      // Wait for worker thread to arrive
+      kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
+                       new_state);
+      if (cancellable) {
+        bool cancelled = flag.wait_cancellable_nosleep(
+            this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
+        if (cancelled)
+          return true; // cancelled: team_bar->b_arrived is left unchanged
+      } else {
+        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
+      }
+      ANNOTATE_BARRIER_END(other_threads[i]);
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+      // Barrier imbalance - write min of the thread time and the other thread
+      // time to the thread.
+      if (__kmp_forkjoin_frames_mode == 2) {
+        this_thr->th.th_bar_min_time = KMP_MIN(
+            this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
+      }
+#endif
+      if (reduce) {
+        KA_TRACE(100,
+                 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
+                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
+                  team->t.t_id, i));
+        ANNOTATE_REDUCE_AFTER(reduce);
+        (*reduce)(this_thr->th.th_local.reduce_data,
+                  other_threads[i]->th.th_local.reduce_data);
+        ANNOTATE_REDUCE_BEFORE(reduce);
+        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
+      }
+    }
+    // Don't have to worry about sleep bit here or atomic since team setting
+    team_bar->b_arrived = new_state;
+    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
+                  "arrived(%p) = %llu\n",
+                  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
+                  new_state));
+  }
+  KA_TRACE(
+      20,
+      ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
+       gtid, team->t.t_id, tid, bt));
+  return false; // completed without cancellation
+}
+
+template <bool cancellable = false> // true: a worker's wait may be cancelled, in which case this returns true
+static bool __kmp_linear_barrier_release_template(
+    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
+    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
+  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
+  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
+  kmp_team_t *team; // master: set below; worker: assigned only under KMP_DEBUG
+
+  if (KMP_MASTER_TID(tid)) {
+    unsigned int i;
+    kmp_uint32 nproc = this_thr->th.th_team_nproc;
+    kmp_info_t **other_threads;
+
+    team = __kmp_threads[gtid]->th.th_team;
+    KMP_DEBUG_ASSERT(team != NULL);
+    other_threads = team->t.t_threads;
+
+    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
+                  "barrier type %d\n",
+                  gtid, team->t.t_id, tid, bt));
+
+    if (nproc > 1) {
+#if KMP_BARRIER_ICV_PUSH
+      {
+        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
+        if (propagate_icvs) {
+          ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs); // stage tid 0's ICVs once (no-op without NGO stores)
+          for (i = 1; i < nproc; ++i) {
+            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
+                                     team, i, FALSE);
+            ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
+                           &team->t.t_implicit_task_taskdata[0].td_icvs);
+          }
+          ngo_sync();
+        }
+      }
+#endif // KMP_BARRIER_ICV_PUSH
+
+      // Now, release all of the worker threads
+      for (i = 1; i < nproc; ++i) {
+#if KMP_CACHE_MANAGE
+        // Prefetch next thread's go flag
+        if (i + 1 < nproc)
+          KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
+#endif /* KMP_CACHE_MANAGE */
+        KA_TRACE(
+            20,
+            ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
+             "go(%p): %u => %u\n", // NOTE(review): b_go is wrapped in kmp_flag_64 below; confirm %u matches its width
+             gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
+             team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
+             other_threads[i]->th.th_bar[bt].bb.b_go,
+             other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
+        ANNOTATE_BARRIER_BEGIN(other_threads[i]);
+        kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
+                         other_threads[i]);
+        flag.release();
+      }
+    }
+  } else { // Wait for the MASTER thread to release us
+    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
+                  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
+    kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
+    if (cancellable) {
+      bool cancelled = flag.wait_cancellable_nosleep(
+          this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
+      if (cancelled) {
+        return true; // cancelled: b_go is not reset on this path
+      }
+    } else {
+      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
+    }
+    ANNOTATE_BARRIER_END(this_thr);
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
+      // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
+      // disabled)
+      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
+      // Cancel wait on previous parallel region...
+      __kmp_itt_task_starting(itt_sync_obj);
+
+      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
+        return false;
+
+      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
+      if (itt_sync_obj != NULL)
+        // Call prepare as early as possible for "new" barrier
+        __kmp_itt_task_finished(itt_sync_obj);
+    } else
+#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
+        // Early exit for reaping threads releasing forkjoin barrier (with ITT notify compiled in, this if is the else-branch above)
+        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
+      return false;
+// The worker thread may now assume that the team is valid.
+#ifdef KMP_DEBUG
+    tid = __kmp_tid_from_gtid(gtid);
+    team = __kmp_threads[gtid]->th.th_team;
+#endif
+    KMP_DEBUG_ASSERT(team != NULL);
+    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
+    KA_TRACE(20,
+             ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
+              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
+    KMP_MB(); // Flush all pending memory write invalidates.
+  }
+  KA_TRACE(
+      20,
+      ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
+       gtid, team->t.t_id, tid, bt));
+  return false; // completed without cancellation
+}
+
+static void __kmp_linear_barrier_gather( // non-cancellable linear gather
+    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
+    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
+  __kmp_linear_barrier_gather_template<false>( // bool result is discarded
+      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
+}
+
+static bool __kmp_linear_barrier_gather_cancellable( // cancellable linear gather
+    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
+    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
+  return __kmp_linear_barrier_gather_template<true>( // true means a wait was cancelled
+      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
+}
+
+static void __kmp_linear_barrier_release( // non-cancellable linear release
+    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
+    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
+  __kmp_linear_barrier_release_template<false>( // bool result is discarded
+      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
+}
+
+static bool __kmp_linear_barrier_release_cancellable( // cancellable linear release
+    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
+    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
+  return __kmp_linear_barrier_release_template<true>( // true means the worker's wait was cancelled
+      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
+}
+
+// Tree barrier gather: a thread waits on children (tid << branch_bits) + 1 onward (at most branch_factor of them), then a non-master releases b_arrived toward parent (tid - 1) >> branch_bits.
+static void
+__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
+                          int tid, void (*reduce)(void *, void *)
+                                       USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
+  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
+  kmp_team_t *team = this_thr->th.th_team;
+  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
+  kmp_info_t **other_threads = team->t.t_threads;
+  kmp_uint32 nproc = this_thr->th.th_team_nproc;
+  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
+  kmp_uint32 branch_factor = 1 << branch_bits;
+  kmp_uint32 child;
+  kmp_uint32 child_tid;
+  kmp_uint64 new_state; // assigned only when this thread has at least one child
+
+  KA_TRACE(
+      20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
+           gtid, team->t.t_id, tid, bt));
+  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
+
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+  // Barrier imbalance - save arrive time to the thread
+  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
+    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
+        __itt_get_timestamp();
+  }
+#endif
+  // Perform tree gather to wait until all threads have arrived; reduce any
+  // required data as we go
+  child_tid = (tid << branch_bits) + 1;
+  if (child_tid < nproc) {
+    // Parent threads wait for all their children to arrive
+    new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
+    child = 1;
+    do {
+      kmp_info_t *child_thr = other_threads[child_tid];
+      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
+#if KMP_CACHE_MANAGE
+      // Prefetch next thread's arrived count
+      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
+        KMP_CACHE_PREFETCH(
+            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
+#endif /* KMP_CACHE_MANAGE */
+      KA_TRACE(20,
+               ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
+                "arrived(%p) == %llu\n",
+                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
+                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
+      // Wait for child to arrive
+      kmp_flag_64 flag(&child_bar->b_arrived, new_state);
+      flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
+      ANNOTATE_BARRIER_END(child_thr);
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+      // Barrier imbalance - write min of the thread time and a child time to
+      // the thread.
+      if (__kmp_forkjoin_frames_mode == 2) {
+        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
+                                               child_thr->th.th_bar_min_time);
+      }
+#endif
+      if (reduce) {
+        KA_TRACE(100,
+                 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
+                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
+                  team->t.t_id, child_tid));
+        ANNOTATE_REDUCE_AFTER(reduce);
+        (*reduce)(this_thr->th.th_local.reduce_data,
+                  child_thr->th.th_local.reduce_data);
+        ANNOTATE_REDUCE_BEFORE(reduce);
+        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
+      }
+      child++;
+      child_tid++;
+    } while (child <= branch_factor && child_tid < nproc);
+  }
+
+  if (!KMP_MASTER_TID(tid)) { // Worker threads
+    kmp_int32 parent_tid = (tid - 1) >> branch_bits;
+
+    KA_TRACE(20,
+             ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
+              "arrived(%p): %llu => %llu\n",
+              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
+              team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
+              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
+
+    // Mark arrival to parent thread
+    /* After performing this write, a worker thread may not assume that the team
+       is valid any more - it could be deallocated by the master thread at any
+       time. NOTE(review): the exit KA_TRACE below still reads team->t.t_id.  */
+    ANNOTATE_BARRIER_BEGIN(this_thr);
+    kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
+    flag.release();
+  } else {
+    // Need to update the team arrived pointer if we are the master thread
+    if (nproc > 1) // New value was already computed above
+      team->t.t_bar[bt].b_arrived = new_state;
+    else
+      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
+    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
+                  "arrived(%p) = %llu\n",
+                  gtid, team->t.t_id, tid, team->t.t_id,
+                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
+  }
+  KA_TRACE(20,
+           ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
+            gtid, team->t.t_id, tid, bt));
+}
+
+static void __kmp_tree_barrier_release(
+    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
+    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
+  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
+  kmp_team_t *team;
+  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
+  kmp_uint32 nproc;
+  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
+  kmp_uint32 branch_factor = 1 << branch_bits;
+  kmp_uint32 child;
+  kmp_uint32 child_tid;
+
+  // Tree release: a non-master first waits on its own b_go and resets it; every thread then releases children (tid << branch_bits) + 1 onward.
+  if (!KMP_MASTER_TID(
+          tid)) { // Handle fork barrier workers who aren't part of a team yet
+    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
+                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
+    // Wait for parent thread to release us
+    kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
+    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
+    ANNOTATE_BARRIER_END(this_thr);
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
+      // In fork barrier where we could not get the object reliably (or
+      // ITTNOTIFY is disabled)
+      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
+      // Cancel wait on previous parallel region...
+      __kmp_itt_task_starting(itt_sync_obj);
+
+      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
+        return;
+
+      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
+      if (itt_sync_obj != NULL)
+        // Call prepare as early as possible for "new" barrier
+        __kmp_itt_task_finished(itt_sync_obj);
+    } else
+#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
+        // Early exit for reaping threads releasing forkjoin barrier (with ITT notify compiled in, this if is the else-branch above)
+        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
+      return;
+
+    // The worker thread may now assume that the team is valid.
+    team = __kmp_threads[gtid]->th.th_team;
+    KMP_DEBUG_ASSERT(team != NULL);
+    tid = __kmp_tid_from_gtid(gtid);
+
+    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
+    KA_TRACE(20,
+             ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
+              team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
+    KMP_MB(); // Flush all pending memory write invalidates.
+  } else {
+    team = __kmp_threads[gtid]->th.th_team;
+    KMP_DEBUG_ASSERT(team != NULL);
+    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
+                  "barrier type %d\n",
+                  gtid, team->t.t_id, tid, bt));
+  }
+  nproc = this_thr->th.th_team_nproc;
+  child_tid = (tid << branch_bits) + 1;
+
+  if (child_tid < nproc) {
+    kmp_info_t **other_threads = team->t.t_threads;
+    child = 1;
+    // Parent threads release all their children
+    do {
+      kmp_info_t *child_thr = other_threads[child_tid];
+      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
+#if KMP_CACHE_MANAGE
+      // Prefetch next thread's go count
+      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
+        KMP_CACHE_PREFETCH(
+            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
+#endif /* KMP_CACHE_MANAGE */
+
+#if KMP_BARRIER_ICV_PUSH
+      {
+        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
+        if (propagate_icvs) {
+          __kmp_init_implicit_task(team->t.t_ident,
+                                   team->t.t_threads[child_tid], team,
+                                   child_tid, FALSE);
+          copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
+                    &team->t.t_implicit_task_taskdata[0].td_icvs); // source is always tid 0's ICVs
+        }
+      }
+#endif // KMP_BARRIER_ICV_PUSH
+      KA_TRACE(20,
+               ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
+                " go(%p): %u => %u\n",
+                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
+                team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
+                child_bar->b_go + KMP_BARRIER_STATE_BUMP));
+      // Release child from barrier
+      ANNOTATE_BARRIER_BEGIN(child_thr);
+      kmp_flag_64 flag(&child_bar->b_go, child_thr);
+      flag.release();
+      child++;
+      child_tid++;
+    } while (child <= branch_factor && child_tid < nproc);
+  }
+  KA_TRACE(
+      20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
+           gtid, team->t.t_id, tid, bt));
+}
+
+// Hyper barrier gather: level by level, a thread either waits on its children at that level or, at the first level where its digit is nonzero, signals its parent and stops.
+static void
+__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
+                           int tid, void (*reduce)(void *, void *)
+                                        USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
+  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
+  kmp_team_t *team = this_thr->th.th_team;
+  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
+  kmp_info_t **other_threads = team->t.t_threads;
+  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE; // stays UNUSED until this thread acts as a parent
+  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
+  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
+  kmp_uint32 branch_factor = 1 << branch_bits;
+  kmp_uint32 offset;
+  kmp_uint32 level;
+
+  KA_TRACE(
+      20,
+      ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
+       gtid, team->t.t_id, tid, bt));
+  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
+
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+  // Barrier imbalance - save arrive time to the thread
+  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
+    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
+        __itt_get_timestamp();
+  }
+#endif
+  /* Perform a hypercube-embedded tree gather to wait until all of the threads
+     have arrived, and reduce any required data as we go.  */
+  kmp_flag_64 p_flag(&thr_bar->b_arrived); // own arrival flag; its waiter is set once the parent is known
+  for (level = 0, offset = 1; offset < num_threads;
+       level += branch_bits, offset <<= branch_bits) {
+    kmp_uint32 child;
+    kmp_uint32 child_tid;
+
+    if (((tid >> level) & (branch_factor - 1)) != 0) { // nonzero digit at this level: report to parent and stop
+      kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1); // tid with its low (level + branch_bits) bits cleared
+
+      KA_TRACE(20,
+               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
+                "arrived(%p): %llu => %llu\n",
+                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
+                team->t.t_id, parent_tid, &thr_bar->b_arrived,
+                thr_bar->b_arrived,
+                thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
+      // Mark arrival to parent thread
+      /* After performing this write (in the last iteration of the enclosing for
+         loop), a worker thread may not assume that the team is valid any more
+         - it could be deallocated by the master thread at any time. NOTE(review): the exit KA_TRACE still reads team->t.t_id.  */
+      ANNOTATE_BARRIER_BEGIN(this_thr);
+      p_flag.set_waiter(other_threads[parent_tid]);
+      p_flag.release();
+      break;
+    }
+
+    // Parent threads wait for children to arrive (children at this level are tid + k * (1 << level), k = 1 .. branch_factor - 1)
+    if (new_state == KMP_BARRIER_UNUSED_STATE)
+      new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
+    for (child = 1, child_tid = tid + (1 << level);
+         child < branch_factor && child_tid < num_threads;
+         child++, child_tid += (1 << level)) {
+      kmp_info_t *child_thr = other_threads[child_tid];
+      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
+#if KMP_CACHE_MANAGE
+      kmp_uint32 next_child_tid = child_tid + (1 << level);
+      // Prefetch next thread's arrived count
+      if (child + 1 < branch_factor && next_child_tid < num_threads)
+        KMP_CACHE_PREFETCH(
+            &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
+#endif /* KMP_CACHE_MANAGE */
+      KA_TRACE(20,
+               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
+                "arrived(%p) == %llu\n",
+                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
+                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
+      // Wait for child to arrive
+      kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
+      c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
+      ANNOTATE_BARRIER_END(child_thr);
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+      // Barrier imbalance - write min of the thread time and a child time to
+      // the thread.
+      if (__kmp_forkjoin_frames_mode == 2) {
+        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
+                                               child_thr->th.th_bar_min_time);
+      }
+#endif
+      if (reduce) {
+        KA_TRACE(100,
+                 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
+                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
+                  team->t.t_id, child_tid));
+        ANNOTATE_REDUCE_AFTER(reduce);
+        (*reduce)(this_thr->th.th_local.reduce_data,
+                  child_thr->th.th_local.reduce_data);
+        ANNOTATE_REDUCE_BEFORE(reduce);
+        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
+      }
+    }
+  }
+
+  if (KMP_MASTER_TID(tid)) {
+    // Need to update the team arrived pointer if we are the master thread
+    if (new_state == KMP_BARRIER_UNUSED_STATE) // the level loop never ran, so no value was computed
+      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
+    else
+      team->t.t_bar[bt].b_arrived = new_state;
+    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
+                  "arrived(%p) = %llu\n",
+                  gtid, team->t.t_id, tid, team->t.t_id,
+                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
+  }
+  KA_TRACE(
+      20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
+           gtid, team->t.t_id, tid, bt));
+}
+
+// The reverse versions seem to beat the forward versions overall
+#define KMP_REVERSE_HYPER_BAR
+static void __kmp_hyper_barrier_release(
+    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
+    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
+  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
+  kmp_team_t *team;
+  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
+  kmp_info_t **other_threads;
+  kmp_uint32 num_threads;
+  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
+  kmp_uint32 branch_factor = 1 << branch_bits;
+  kmp_uint32 child;
+  kmp_uint32 child_tid;
+  kmp_uint32 offset;
+  kmp_uint32 level;
+
+  /* Perform a hypercube-embedded tree release for all of the threads that have
+     been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
+     are released in the reverse order of the corresponding gather, otherwise
+     threads are released in the same order. */
+  if (KMP_MASTER_TID(tid)) { // master
+    team = __kmp_threads[gtid]->th.th_team;
+    KMP_DEBUG_ASSERT(team != NULL);
+    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
+                  "barrier type %d\n",
+                  gtid, team->t.t_id, tid, bt));
+#if KMP_BARRIER_ICV_PUSH
+    if (propagate_icvs) { // master already has ICVs in final destination; copy them into th_fixed_icvs, which is what gets pushed to children
+      copy_icvs(&thr_bar->th_fixed_icvs,
+                &team->t.t_implicit_task_taskdata[tid].td_icvs);
+    }
+#endif
+  } else { // Handle fork barrier workers who aren't part of a team yet
+    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
+                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
+    // Wait for parent thread to release us
+    kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
+    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
+    ANNOTATE_BARRIER_END(this_thr);
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
+      // In fork barrier where we could not get the object reliably
+      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
+      // Cancel wait on previous parallel region...
+      __kmp_itt_task_starting(itt_sync_obj);
+
+      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
+        return;
+
+      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
+      if (itt_sync_obj != NULL)
+        // Call prepare as early as possible for "new" barrier
+        __kmp_itt_task_finished(itt_sync_obj);
+    } else
+#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
+        // Early exit for reaping threads releasing forkjoin barrier (with ITT notify compiled in, this if is the else-branch above)
+        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
+      return;
+
+    // The worker thread may now assume that the team is valid.
+    team = __kmp_threads[gtid]->th.th_team;
+    KMP_DEBUG_ASSERT(team != NULL);
+    tid = __kmp_tid_from_gtid(gtid);
+
+    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
+    KA_TRACE(20,
+             ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
+              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
+    KMP_MB(); // Flush all pending memory write invalidates.
+  }
+  num_threads = this_thr->th.th_team_nproc;
+  other_threads = team->t.t_threads;
+
+#ifdef KMP_REVERSE_HYPER_BAR
+  // Count up to correct level for parent
+  for (level = 0, offset = 1;
+       offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
+       level += branch_bits, offset <<= branch_bits)
+    ;
+
+  // Now go down from there (level is unsigned and may wrap on the final step; the loop ends on offset == 0 before it is used)
+  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
+       level -= branch_bits, offset >>= branch_bits)
+#else
+  // Go down the tree, level by level
+  for (level = 0, offset = 1; offset < num_threads;
+       level += branch_bits, offset <<= branch_bits)
+#endif // KMP_REVERSE_HYPER_BAR
+  {
+#ifdef KMP_REVERSE_HYPER_BAR
+    /* Now go in reverse order through the children, highest to lowest.
+       Initial setting of child is conservative here. */
+    child = num_threads >> ((level == 0) ? level : level - 1);
+    for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
+        child_tid = tid + (child << level);
+         child >= 1; child--, child_tid -= (1 << level))
+#else
+    if (((tid >> level) & (branch_factor - 1)) != 0)
+      // No need to go lower than this, since this is the level parent would be
+      // notified
+      break;
+    // Iterate through children on this level of the tree
+    for (child = 1, child_tid = tid + (1 << level);
+         child < branch_factor && child_tid < num_threads;
+         child++, child_tid += (1 << level))
+#endif // KMP_REVERSE_HYPER_BAR
+    {
+      if (child_tid >= num_threads)
+        continue; // Child doesn't exist so keep going
+      else {
+        kmp_info_t *child_thr = other_threads[child_tid];
+        kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
+#if KMP_CACHE_MANAGE
+        kmp_uint32 next_child_tid = child_tid - (1 << level);
+// Prefetch next thread's go count. NOTE(review): next_child_tid steps downward, which matches only the reverse traversal; the forward loop advances child_tid upward.
+#ifdef KMP_REVERSE_HYPER_BAR
+        if (child - 1 >= 1 && next_child_tid < num_threads)
+#else
+        if (child + 1 < branch_factor && next_child_tid < num_threads)
+#endif // KMP_REVERSE_HYPER_BAR
+          KMP_CACHE_PREFETCH(
+              &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
+#endif /* KMP_CACHE_MANAGE */
+
+#if KMP_BARRIER_ICV_PUSH
+        if (propagate_icvs) // push my fixed ICVs to my child
+          copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
+#endif // KMP_BARRIER_ICV_PUSH
+
+        KA_TRACE(
+            20,
+            ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
+             " go(%p): %u => %u\n",
+             gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
+             team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
+             child_bar->b_go + KMP_BARRIER_STATE_BUMP));
+        // Release child from barrier
+        ANNOTATE_BARRIER_BEGIN(child_thr);
+        kmp_flag_64 flag(&child_bar->b_go, child_thr);
+        flag.release();
+      }
+    }
+  }
+#if KMP_BARRIER_ICV_PUSH
+  if (propagate_icvs &&
+      !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
+    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
+                             FALSE);
+    copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
+              &thr_bar->th_fixed_icvs);
+  }
+#endif
+  KA_TRACE(
+      20,
+      ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
+       gtid, team->t.t_id, tid, bt));
+}
+
+// Hierarchical Barrier
+
+// Initialize thread barrier data
+/* Initializes/re-initializes the hierarchical barrier data stored on a thread.
+   Performs the minimum amount of initialization required based on how the team
+   has changed. Returns true if leaf children will require both on-core and
+   traditional wake-up mechanisms. For example, if the team size increases,
+   threads already in the team will respond to on-core wakeup on their parent
+   thread, but threads newly added to the team will only be listening on
+   their local b_go. */
+// Parameters:
+//   bt      - barrier type; indexes th_bar[] when locating the parent's state
+//   thr_bar - barrier state of the calling thread, updated in place
+//   nproc   - number of threads in the team
+//   gtid    - global thread id (not used by this function)
+//   tid     - id of the thread within the team
+//   team    - team the thread currently belongs to
+// Returns true if the state was uninitialized, or the team or the tid changed
+// since the previous call; false otherwise.
+static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
+                                                   kmp_bstate_t *thr_bar,
+                                                   kmp_uint32 nproc, int gtid,
+                                                   int tid, kmp_team_t *team) {
+  // Checks to determine if (re-)initialization is needed
+  bool uninitialized = thr_bar->team == NULL;
+  bool team_changed = team != thr_bar->team;
+  bool team_sz_changed = nproc != thr_bar->nproc;
+  bool tid_changed = tid != thr_bar->old_tid;
+  bool retval = false;
+
+  // The hierarchy description only depends on the team size.
+  if (uninitialized || team_sz_changed) {
+    __kmp_get_hierarchy(nproc, thr_bar);
+  }
+
+  if (uninitialized || team_sz_changed || tid_changed) {
+    thr_bar->my_level = thr_bar->depth - 1; // default for master
+    thr_bar->parent_tid = -1; // default for master
+    if (!KMP_MASTER_TID(
+            tid)) { // if not master, find parent thread in hierarchy
+      kmp_uint32 d = 0;
+      while (d < thr_bar->depth) { // find parent based on level of thread in
+        // hierarchy, and note level
+        kmp_uint32 rem;
+        if (d == thr_bar->depth - 2) { // reached level right below the master
+          thr_bar->parent_tid = 0;
+          thr_bar->my_level = d;
+          break;
+        } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
+                   0) { // TODO: can we make this op faster?
+          // thread is not a subtree root at next level, so this is max
+          thr_bar->parent_tid = tid - rem;
+          thr_bar->my_level = d;
+          break;
+        }
+        ++d;
+      }
+    }
+    // Byte index of this thread inside its parent's 64-bit flag: the thread
+    // right after the parent gets byte 7, the next one byte 6, and so on.
+    thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
+    thr_bar->old_tid = tid;
+    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
+    thr_bar->team = team;
+    // The master has no parent (parent_tid == -1); do not index t_threads[]
+    // with a negative value, leave parent_bar NULL instead.
+    thr_bar->parent_bar =
+        (thr_bar->parent_tid >= 0)
+            ? &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb
+            : NULL;
+  }
+  if (uninitialized || team_changed || tid_changed) {
+    thr_bar->team = team;
+    // Same guard as above: only threads with a parent get a parent_bar.
+    thr_bar->parent_bar =
+        (thr_bar->parent_tid >= 0)
+            ? &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb
+            : NULL;
+    retval = true;
+  }
+  if (uninitialized || team_sz_changed || tid_changed) {
+    thr_bar->nproc = nproc;
+    thr_bar->leaf_kids = thr_bar->base_leaf_kids;
+    if (thr_bar->my_level == 0) // leaves have no children at all
+      thr_bar->leaf_kids = 0;
+    // Clamp the number of leaf children to the threads that actually exist.
+    if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
+      thr_bar->leaf_kids = nproc - tid - 1;
+    // One byte of leaf_state per leaf child, starting at byte 7 and going
+    // down; this mirrors the per-child offset computed above.
+    thr_bar->leaf_state = 0;
+    for (int i = 0; i < thr_bar->leaf_kids; ++i)
+      ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
+  }
+  return retval;
+}
+
+// Gather (arrival) phase of the hierarchical barrier for one thread.
+//
+// The thread first waits for the children it owns in the hierarchy to arrive,
+// folding their reduction data into its own when `reduce` is non-NULL. It then
+// signals its own arrival: a worker releases its parent, while the master
+// stores the new arrived state in the team structure.
+//
+// Parameters:
+//   bt           - barrier type; indexes th_bar[] and t_bar[]
+//   this_thr     - descriptor of the calling thread
+//   gtid         - global thread id; used for tracing and forwarded to
+//                  __kmp_init_hierarchical_barrier_thread
+//   tid          - id of the calling thread within its team
+//   reduce       - optional callback, invoked as
+//                  reduce(this thread's reduce_data, child's reduce_data)
+//   itt_sync_obj - forwarded to the flag waits through USE_ITT_BUILD_ARG
+static void __kmp_hierarchical_barrier_gather(
+    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
+    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
+  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
+  kmp_team_t *team = this_thr->th.th_team;
+  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
+  kmp_uint32 nproc = this_thr->th.th_team_nproc;
+  kmp_info_t **other_threads = team->t.t_threads;
+  kmp_uint64 new_state;
+
+  // The on-core (shared flag) variant is only used at nesting level 1.
+  int level = team->t.t_level;
+  if (other_threads[0]
+          ->th.th_teams_microtask) // are we inside the teams construct?
+    if (this_thr->th.th_teams_size.nteams > 1)
+      ++level; // level was not increased in teams construct for team_of_masters
+  if (level == 1)
+    thr_bar->use_oncore_barrier = 1;
+  else
+    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
+
+  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
+                "barrier type %d\n",
+                gtid, team->t.t_id, tid, bt));
+  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
+
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+  // Barrier imbalance - save arrive time to the thread
+  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
+    this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
+  }
+#endif
+
+  // Refresh level/parent/leaf information; the return value is not needed here.
+  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
+                                               team);
+
+  // Arrived value this barrier moves to. It is computed for every thread, not
+  // only for non-leaves, so that the master's store of new_state at the end of
+  // this function never reads an uninitialized variable when my_level is 0.
+  new_state =
+      (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
+
+  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
+    kmp_int32 child_tid;
+    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
+        thr_bar->use_oncore_barrier) {
+      if (thr_bar->leaf_kids) {
+        // First, wait for leaf children to check-in on my b_arrived flag
+        kmp_uint64 leaf_state =
+            KMP_MASTER_TID(tid)
+                ? thr_bar->b_arrived | thr_bar->leaf_state
+                : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
+        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
+                      "for leaf kids\n",
+                      gtid, team->t.t_id, tid));
+        kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
+        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
+        if (reduce) {
+          ANNOTATE_REDUCE_AFTER(reduce);
+          // Leaf children are the leaf_kids threads directly following tid.
+          for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
+               ++child_tid) {
+            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
+                           "T#%d(%d:%d)\n",
+                           gtid, team->t.t_id, tid,
+                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
+                           child_tid));
+            ANNOTATE_BARRIER_END(other_threads[child_tid]);
+            (*reduce)(this_thr->th.th_local.reduce_data,
+                      other_threads[child_tid]->th.th_local.reduce_data);
+          }
+          ANNOTATE_REDUCE_BEFORE(reduce);
+          ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
+        }
+        // clear leaf_state bits
+        KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
+      }
+      // Next, wait for higher level children on each child's b_arrived flag
+      for (kmp_uint32 d = 1; d < thr_bar->my_level;
+           ++d) { // gather lowest level threads first, but skip 0
+        // Children at level d are spaced skip_per_level[d] apart and end
+        // before the next subtree root at level d + 1 (clamped to nproc).
+        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
+                   skip = thr_bar->skip_per_level[d];
+        if (last > nproc)
+          last = nproc;
+        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
+          kmp_info_t *child_thr = other_threads[child_tid];
+          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
+          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
+                        "T#%d(%d:%d) "
+                        "arrived(%p) == %llu\n",
+                        gtid, team->t.t_id, tid,
+                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
+                        child_tid, &child_bar->b_arrived, new_state));
+          kmp_flag_64 flag(&child_bar->b_arrived, new_state);
+          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
+          ANNOTATE_BARRIER_END(child_thr);
+          if (reduce) {
+            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
+                           "T#%d(%d:%d)\n",
+                           gtid, team->t.t_id, tid,
+                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
+                           child_tid));
+            ANNOTATE_REDUCE_AFTER(reduce);
+            (*reduce)(this_thr->th.th_local.reduce_data,
+                      child_thr->th.th_local.reduce_data);
+            ANNOTATE_REDUCE_BEFORE(reduce);
+            ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
+          }
+        }
+      }
+    } else { // Blocktime is not infinite
+      // Every child, leaves included, is waited for on its own b_arrived flag.
+      for (kmp_uint32 d = 0; d < thr_bar->my_level;
+           ++d) { // Gather lowest level threads first
+        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
+                   skip = thr_bar->skip_per_level[d];
+        if (last > nproc)
+          last = nproc;
+        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
+          kmp_info_t *child_thr = other_threads[child_tid];
+          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
+          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
+                        "T#%d(%d:%d) "
+                        "arrived(%p) == %llu\n",
+                        gtid, team->t.t_id, tid,
+                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
+                        child_tid, &child_bar->b_arrived, new_state));
+          kmp_flag_64 flag(&child_bar->b_arrived, new_state);
+          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
+          ANNOTATE_BARRIER_END(child_thr);
+          if (reduce) {
+            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
+                           "T#%d(%d:%d)\n",
+                           gtid, team->t.t_id, tid,
+                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
+                           child_tid));
+            ANNOTATE_REDUCE_AFTER(reduce);
+            (*reduce)(this_thr->th.th_local.reduce_data,
+                      child_thr->th.th_local.reduce_data);
+            ANNOTATE_REDUCE_BEFORE(reduce);
+            ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
+          }
+        }
+      }
+    }
+  }
+  // All subordinates are gathered; now release parent if not master thread
+
+  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
+    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
+                  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
+                  gtid, team->t.t_id, tid,
+                  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
+                  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
+                  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
+    /* Mark arrival to parent: After performing this write, a worker thread may
+       not assume that the team is valid any more - it could be deallocated by
+       the master thread at any time. */
+    if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
+        !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
+      // flag; release it
+      ANNOTATE_BARRIER_BEGIN(this_thr);
+      kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
+      flag.release();
+    } else {
+      // Leaf does special release on "offset" bits of parent's b_arrived flag
+      thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
+      kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
+      flag.set_waiter(other_threads[thr_bar->parent_tid]);
+      flag.release();
+    }
+  } else { // Master thread needs to update the team's b_arrived value
+    team->t.t_bar[bt].b_arrived = new_state;
+    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
+                  "arrived(%p) = %llu\n",
+                  gtid, team->t.t_id, tid, team->t.t_id,
+                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
+  }
+  // Is the team access below unsafe or just technically invalid?
+  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
+                "barrier type %d\n",
+                gtid, team->t.t_id, tid, bt));
+}
+
+// Release phase of the hierarchical barrier for one thread.
+//
+// A worker first waits until its parent releases it, either on its own b_go
+// flag or on its byte ("offset") of the parent's b_go flag. Every thread then
+// refreshes its hierarchical barrier state, optionally propagates ICVs, and
+// finally releases the children it owns in the hierarchy.
+//
+// Parameters:
+//   bt             - barrier type; indexes th_bar[]
+//   this_thr       - descriptor of the calling thread
+//   gtid           - global thread id of the calling thread
+//   tid            - id of the thread within its team; workers re-read it from
+//                    gtid after being released
+//   propagate_icvs - when non-zero (and KMP_BARRIER_ICV_PUSH is enabled), ICVs
+//                    are copied down the hierarchy during the release
+//   itt_sync_obj   - forwarded to the flag waits through USE_ITT_BUILD_ARG
+//
+// Workers return early, without releasing children, when bt is
+// bs_forkjoin_barrier and __kmp_global.g.g_done is set.
+static void __kmp_hierarchical_barrier_release(
+    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
+    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
+  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
+  kmp_team_t *team;
+  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
+  kmp_uint32 nproc;
+  bool team_change = false; // indicates on-core barrier shouldn't be used
+
+  if (KMP_MASTER_TID(tid)) {
+    team = __kmp_threads[gtid]->th.th_team;
+    KMP_DEBUG_ASSERT(team != NULL);
+    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
+                  "entered barrier type %d\n",
+                  gtid, team->t.t_id, tid, bt));
+  } else { // Worker threads
+    // Wait for parent thread to release me
+    if (!thr_bar->use_oncore_barrier ||
+        __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
+        thr_bar->team == NULL) {
+      // Use traditional method of waiting on my own b_go flag
+      thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
+      kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
+      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
+      ANNOTATE_BARRIER_END(this_thr);
+      TCW_8(thr_bar->b_go,
+            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
+    } else { // Thread barrier data is initialized, this is a leaf, blocktime is
+      // infinite, not nested
+      // Wait on my "offset" bits on parent's b_go flag
+      thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
+      kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
+                           thr_bar->offset, bt,
+                           this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
+      flag.wait(this_thr, TRUE);
+      if (thr_bar->wait_flag ==
+          KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
+        TCW_8(thr_bar->b_go,
+              KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
+      } else { // Reset my bits on parent's b_go flag
+        // The parent's 64-bit b_go is addressed as bytes; clear only mine.
+        (RCAST(volatile char *,
+               &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
+      }
+    }
+    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
+    // Early exit for reaping threads releasing forkjoin barrier
+    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
+      return;
+    // The worker thread may now assume that the team is valid.
+    team = __kmp_threads[gtid]->th.th_team;
+    KMP_DEBUG_ASSERT(team != NULL);
+    tid = __kmp_tid_from_gtid(gtid);
+
+    KA_TRACE(
+        20,
+        ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
+         gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
+    KMP_MB(); // Flush all pending memory write invalidates.
+  }
+
+  // Decide whether the on-core variant applies: only at nesting level 1, with
+  // the level adjusted when running inside a teams construct.
+  nproc = this_thr->th.th_team_nproc;
+  int level = team->t.t_level;
+  if (team->t.t_threads[0]
+          ->th.th_teams_microtask) { // are we inside the teams construct?
+    if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
+        this_thr->th.th_teams_level == level)
+      ++level; // level was not increased in teams construct for team_of_workers
+    if (this_thr->th.th_teams_size.nteams > 1)
+      ++level; // level was not increased in teams construct for team_of_masters
+  }
+  if (level == 1)
+    thr_bar->use_oncore_barrier = 1;
+  else
+    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
+
+  // If the team size has increased, we still communicate with old leaves via
+  // oncore barrier.
+  // Snapshot the leaf information before it is re-initialized below.
+  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
+  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
+  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
+                                                       tid, team);
+  // But if the entire team changes, we won't use oncore barrier at all
+  if (team_change)
+    old_leaf_kids = 0;
+
+#if KMP_BARRIER_ICV_PUSH
+  if (propagate_icvs) {
+    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
+                             FALSE);
+    if (KMP_MASTER_TID(
+            tid)) { // master already has copy in final destination; copy
+      // it into th_fixed_icvs, from where it is handed to the children
+      copy_icvs(&thr_bar->th_fixed_icvs,
+                &team->t.t_implicit_task_taskdata[tid].td_icvs);
+    } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
+               thr_bar->use_oncore_barrier) { // optimization for inf blocktime
+      if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
+        // leaves (on-core children) pull parent's fixed ICVs directly to local
+        // ICV store
+        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
+                  &thr_bar->parent_bar->th_fixed_icvs);
+      // non-leaves will get ICVs piggybacked with b_go via NGO store
+    } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
+      if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
+        // access
+        copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
+      else // leaves copy parent's fixed ICVs directly to local ICV store
+        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
+                  &thr_bar->parent_bar->th_fixed_icvs);
+    }
+  }
+#endif // KMP_BARRIER_ICV_PUSH
+
+  // Now, release my children
+  if (thr_bar->my_level) { // not a leaf
+    kmp_int32 child_tid;
+    kmp_uint32 last;
+    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
+        thr_bar->use_oncore_barrier) {
+      if (KMP_MASTER_TID(tid)) { // do a flat release
+        // Set local b_go to bump children via NGO store of the cache line
+        // containing ICVs and b_go.
+        thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
+        // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
+        // the cache line
+        ngo_load(&thr_bar->th_fixed_icvs);
+        // This loops over all the threads skipping only the leaf nodes in the
+        // hierarchy
+        for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
+             child_tid += thr_bar->skip_per_level[1]) {
+          kmp_bstate_t *child_bar =
+              &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
+          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
+                        "releasing T#%d(%d:%d)"
+                        " go(%p): %u => %u\n",
+                        gtid, team->t.t_id, tid,
+                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
+                        child_tid, &child_bar->b_go, child_bar->b_go,
+                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
+          // Use ngo store (if available) to both store ICVs and release child
+          // via child's b_go
+          ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
+        }
+        ngo_sync();
+      }
+      TCW_8(thr_bar->b_go,
+            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
+      // Now, release leaf children
+      if (thr_bar->leaf_kids) { // if there are any
+        // We test team_change on the off-chance that the level 1 team changed.
+        if (team_change ||
+            old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
+          if (old_leaf_kids) { // release old leaf kids
+            // Old leaves are released through their bytes of my b_go flag.
+            thr_bar->b_go |= old_leaf_state;
+          }
+          // Release new leaf kids
+          last = tid + thr_bar->skip_per_level[1];
+          if (last > nproc)
+            last = nproc;
+          for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
+               ++child_tid) { // skip_per_level[0]=1
+            kmp_info_t *child_thr = team->t.t_threads[child_tid];
+            kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
+            KA_TRACE(
+                20,
+                ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
+                 " T#%d(%d:%d) go(%p): %u => %u\n",
+                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
+                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
+                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
+            // Release child using child's b_go flag
+            ANNOTATE_BARRIER_BEGIN(child_thr);
+            kmp_flag_64 flag(&child_bar->b_go, child_thr);
+            flag.release();
+          }
+        } else { // Release all children at once with leaf_state bits on my own
+          // b_go flag
+          thr_bar->b_go |= thr_bar->leaf_state;
+        }
+      }
+    } else { // Blocktime is not infinite; do a simple hierarchical release
+      for (int d = thr_bar->my_level - 1; d >= 0;
+           --d) { // Release highest level threads first
+        // Children at level d are spaced skip_per_level[d] apart and end
+        // before the next subtree root at level d + 1 (clamped to nproc).
+        last = tid + thr_bar->skip_per_level[d + 1];
+        kmp_uint32 skip = thr_bar->skip_per_level[d];
+        if (last > nproc)
+          last = nproc;
+        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
+          kmp_info_t *child_thr = team->t.t_threads[child_tid];
+          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
+          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
+                        "releasing T#%d(%d:%d) go(%p): %u => %u\n",
+                        gtid, team->t.t_id, tid,
+                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
+                        child_tid, &child_bar->b_go, child_bar->b_go,
+                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
+          // Release child using child's b_go flag
+          ANNOTATE_BARRIER_BEGIN(child_thr);
+          kmp_flag_64 flag(&child_bar->b_go, child_thr);
+          flag.release();
+        }
+      }
+    }
+#if KMP_BARRIER_ICV_PUSH
+    if (propagate_icvs && !KMP_MASTER_TID(tid))
+      // non-leaves copy ICVs from fixed ICVs to local dest
+      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
+                &thr_bar->th_fixed_icvs);
+#endif // KMP_BARRIER_ICV_PUSH
+  }
+  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
+                "barrier type %d\n",
+                gtid, team->t.t_id, tid, bt));
+}
+
+// End of Barrier Algorithms
+
+// type traits for cancellable value
+// if cancellable is true, then is_cancellable is a normal boolean variable
+// if cancellable is false, then is_cancellable is a compile time constant
+// Primary template; both possible arguments are explicitly specialized below.
+template <bool cancellable> struct is_cancellable {};
+
+// Cancellable flavour: a thin wrapper around a run-time boolean.
+template <> struct is_cancellable<true> {
+  bool value;
+
+  is_cancellable() : value(false) {}
+  is_cancellable(bool initial) : value(initial) {}
+
+  operator bool() const { return this->value; }
+
+  is_cancellable &operator=(bool updated) {
+    this->value = updated;
+    return *this;
+  }
+};
+
+// Non-cancellable flavour: stateless. Assignments are discarded and the
+// conversion to bool is the compile-time constant false.
+template <> struct is_cancellable<false> {
+  constexpr operator bool() const { return false; }
+
+  is_cancellable &operator=(bool) { return *this; }
+};
+
+// Internal function to do a barrier.
+/* If is_split is true, do a split barrier, otherwise, do a plain barrier
+   If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
+   barrier
+   When cancellable = false,
+     Returns 0 if master thread, 1 if worker thread.
+   When cancellable = true
+     Returns 0 if not cancelled, 1 if cancelled.  */
+template <bool cancellable = false>
+static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
+                                  size_t reduce_size, void *reduce_data,
+                                  void (*reduce)(void *, void *)) {
+  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
+  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
+  int tid = __kmp_tid_from_gtid(gtid);
+  kmp_info_t *this_thr = __kmp_threads[gtid];
+  kmp_team_t *team = this_thr->th.th_team;
+  int status = 0;
+  is_cancellable<cancellable> cancelled;
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  ompt_data_t *my_task_data;
+  ompt_data_t *my_parallel_data;
+  void *return_address;
+  ompt_sync_region_t barrier_kind;
+#endif
+
+  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
+                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
+
+  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+#if OMPT_OPTIONAL
+    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
+    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
+    return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
+    barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
+    if (ompt_enabled.ompt_callback_sync_region) {
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
+          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
+          return_address);
+    }
+    if (ompt_enabled.ompt_callback_sync_region_wait) {
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
+          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
+          return_address);
+    }
+#endif
+    // It is OK to report the barrier state after the barrier begin callback.
+    // According to the OMPT specification, a compliant implementation may
+    // even delay reporting this state until the barrier begins to wait.
+    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
+  }
+#endif
+
+  if (!team->t.t_serialized) {
+#if USE_ITT_BUILD
+    // This value will be used in itt notify events below.
+    void *itt_sync_obj = NULL;
+#if USE_ITT_NOTIFY
+    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
+      itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
+#endif
+#endif /* USE_ITT_BUILD */
+    if (__kmp_tasking_mode == tskm_extra_barrier) {
+      __kmp_tasking_barrier(team, this_thr, gtid);
+      KA_TRACE(15,
+               ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
+                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
+    }
+
+    /* Copy the blocktime info to the thread, where __kmp_wait_template() can
+       access it when the team struct is not guaranteed to exist. */
+    // See note about the corresponding code in __kmp_join_barrier() being
+    // performance-critical.
+    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
+#if KMP_USE_MONITOR
+      this_thr->th.th_team_bt_intervals =
+          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
+      this_thr->th.th_team_bt_set =
+          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
+#else
+      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
+#endif
+    }
+
+#if USE_ITT_BUILD
+    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
+      __kmp_itt_barrier_starting(gtid, itt_sync_obj);
+#endif /* USE_ITT_BUILD */
+#if USE_DEBUGGER
+    // Let the debugger know: the thread arrived to the barrier and waiting.
+    if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
+      team->t.t_bar[bt].b_master_arrived += 1;
+    } else {
+      this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
+    } // if
+#endif /* USE_DEBUGGER */
+    if (reduce != NULL) {
+      // KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
+      this_thr->th.th_local.reduce_data = reduce_data;
+    }
+
+    if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
+      // use 0 to only setup the current team if nthreads > 1
+      __kmp_task_team_setup(this_thr, team, 0);
+
+    if (cancellable) {
+      cancelled = __kmp_linear_barrier_gather_cancellable(
+          bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
+    } else {
+      switch (__kmp_barrier_gather_pattern[bt]) {
+      case bp_hyper_bar: {
+        // don't set branch bits to 0; use linear
+        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
+        __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
+                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
+        break;
+      }
+      case bp_hierarchical_bar: {
+        __kmp_hierarchical_barrier_gather(
+            bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
+        break;
+      }
+      case bp_tree_bar: {
+        // don't set branch bits to 0; use linear
+        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
+        __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
+                                  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
+        break;
+      }
+      default: {
+        __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
+                                    reduce USE_ITT_BUILD_ARG(itt_sync_obj));
+      }
+      }
+    }
+
+    KMP_MB();
+
+    if (KMP_MASTER_TID(tid)) {
+      status = 0;
+      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
+        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
+      }
+#if USE_DEBUGGER
+      // Let the debugger know: All threads are arrived and starting leaving the
+      // barrier.
+      team->t.t_bar[bt].b_team_arrived += 1;
+#endif
+
+      if (__kmp_omp_cancellation) {
+        kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
+        // Reset cancellation flag for worksharing constructs
+        if (cancel_request == cancel_loop ||
+            cancel_request == cancel_sections) {
+          KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
+        }
+      }
+#if USE_ITT_BUILD
+      /* TODO: In case of split reduction barrier, master thread may send
+         acquired event early, before the final summation into the shared
+         variable is done (final summation can be a long operation for array
+         reductions).  */
+      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
+        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
+#endif /* USE_ITT_BUILD */
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+      // Barrier - report frame end (only if active_level == 1)
+      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
+          __kmp_forkjoin_frames_mode &&
+          this_thr->th.th_teams_microtask == NULL &&
+          team->t.t_active_level == 1) {
+        ident_t *loc = __kmp_threads[gtid]->th.th_ident;
+        kmp_uint64 cur_time = __itt_get_timestamp();
+        kmp_info_t **other_threads = team->t.t_threads;
+        int nproc = this_thr->th.th_team_nproc;
+        int i;
+        switch (__kmp_forkjoin_frames_mode) {
+        case 1:
+          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
+                                 loc, nproc);
+          this_thr->th.th_frame_time = cur_time;
+          break;
+        case 2: // AC 2015-01-19: currently does not work for hierarchical (to
+          // be fixed)
+          __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
+                                 1, loc, nproc);
+          break;
+        case 3:
+          if (__itt_metadata_add_ptr) {
+            // Initialize with master's wait time
+            kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
+            // Set arrive time to zero to be able to check it in
+            // __kmp_invoke_task(); the same is done inside the loop below
+            this_thr->th.th_bar_arrive_time = 0;
+            for (i = 1; i < nproc; ++i) {
+              delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
+              other_threads[i]->th.th_bar_arrive_time = 0;
+            }
+            __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
+                                         cur_time, delta,
+                                         (kmp_uint64)(reduce != NULL));
+          }
+          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
+                                 loc, nproc);
+          this_thr->th.th_frame_time = cur_time;
+          break;
+        }
+      }
+#endif /* USE_ITT_BUILD */
+    } else {
+      status = 1;
+#if USE_ITT_BUILD
+      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
+        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
+#endif /* USE_ITT_BUILD */
+    }
+    if ((status == 1 || !is_split) && !cancelled) {
+      if (cancellable) {
+        cancelled = __kmp_linear_barrier_release_cancellable(
+            bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
+      } else {
+        switch (__kmp_barrier_release_pattern[bt]) {
+        case bp_hyper_bar: {
+          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
+          __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
+                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
+          break;
+        }
+        case bp_hierarchical_bar: {
+          __kmp_hierarchical_barrier_release(
+              bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
+          break;
+        }
+        case bp_tree_bar: {
+          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
+          __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
+                                     FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
+          break;
+        }
+        default: {
+          __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
+                                       FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
+        }
+        }
+      }
+      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
+        __kmp_task_team_sync(this_thr, team);
+      }
+    }
+
+#if USE_ITT_BUILD
+    /* GEH: TODO: Move this under if-condition above and also include in
+       __kmp_end_split_barrier(). This will more accurately represent the actual
+       release time of the threads for split barriers.  */
+    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
+      __kmp_itt_barrier_finished(gtid, itt_sync_obj);
+#endif /* USE_ITT_BUILD */
+  } else { // Team is serialized.
+    status = 0;
+    if (__kmp_tasking_mode != tskm_immediate_exec) {
+      if (this_thr->th.th_task_team != NULL) {
+#if USE_ITT_NOTIFY
+        void *itt_sync_obj = NULL;
+        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
+          itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
+          __kmp_itt_barrier_starting(gtid, itt_sync_obj);
+        }
+#endif
+
+        KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
+                         TRUE);
+        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
+        __kmp_task_team_setup(this_thr, team, 0);
+
+#if USE_ITT_BUILD
+        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
+          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
+#endif /* USE_ITT_BUILD */
+      }
+    }
+  }
+  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
+                gtid, __kmp_team_from_gtid(gtid)->t.t_id,
+                __kmp_tid_from_gtid(gtid), status));
+
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+#if OMPT_OPTIONAL
+    if (ompt_enabled.ompt_callback_sync_region_wait) {
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
+          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
+          return_address);
+    }
+    if (ompt_enabled.ompt_callback_sync_region) {
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
+          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
+          return_address);
+    }
+#endif
+    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+  }
+#endif
+  ANNOTATE_BARRIER_END(&team->t.t_bar);
+
+  if (cancellable)
+    return (int)cancelled;
+  return status;
+}
+
+// Forwards to __kmp_barrier_template<>; returns 0 if master, 1 if worker.
+int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
+                  size_t reduce_size, void *reduce_data,
+                  void (*reduce)(void *, void *)) {
+  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
+                                  reduce);
+}
+
+#if defined(KMP_GOMP_COMPAT)
+// Cancellable plain barrier used by the GOMP compatibility layer.
+// With OMP cancellation disabled this is an ordinary plain barrier and the
+// result is always FALSE; otherwise the cancellable template is run and its
+// result (non-zero when the barrier was cancelled) is returned.
+int __kmp_barrier_gomp_cancel(int gtid) {
+  if (!__kmp_omp_cancellation) {
+    // Cancellation is off: run the regular barrier, nothing can be cancelled.
+    __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+    return FALSE;
+  }
+
+  const int was_cancelled = __kmp_barrier_template<true>(
+      bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+  if (was_cancelled && !KMP_MASTER_TID(__kmp_tid_from_gtid(gtid))) {
+    // A cancelled worker reverts its private b_arrived flag by one state bump;
+    // the master does not need to revert anything.
+    kmp_info_t *worker = __kmp_threads[gtid];
+    worker->th.th_bar[bs_plain_barrier].bb.b_arrived -= KMP_BARRIER_STATE_BUMP;
+  }
+  return was_cancelled;
+}
+#endif
+
+// Finishes a split barrier. In a non-serialized team the master thread runs
+// the release routine chosen by __kmp_barrier_release_pattern[bt] and then,
+// unless tasking is in immediate-exec mode, calls __kmp_task_team_sync().
+// Other threads and serialized teams skip the release work entirely.
+void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
+  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
+  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
+  int tid = __kmp_tid_from_gtid(gtid);
+  kmp_info_t *this_thr = __kmp_threads[gtid];
+  kmp_team_t *team = this_thr->th.th_team;
+
+  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
+  if (!team->t.t_serialized && KMP_MASTER_GTID(gtid)) {
+    // Pick the release routine that matches the configured pattern.
+    switch (__kmp_barrier_release_pattern[bt]) {
+    case bp_hyper_bar:
+      KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
+      __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
+                                  FALSE USE_ITT_BUILD_ARG(NULL));
+      break;
+    case bp_hierarchical_bar:
+      __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
+                                         FALSE USE_ITT_BUILD_ARG(NULL));
+      break;
+    case bp_tree_bar:
+      KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
+      __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
+                                 FALSE USE_ITT_BUILD_ARG(NULL));
+      break;
+    default: // every other pattern value falls back to the linear release
+      __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
+                                   FALSE USE_ITT_BUILD_ARG(NULL));
+      break;
+    }
+    // No task-team sync in tskm_immediate_exec tasking mode.
+    if (__kmp_tasking_mode != tskm_immediate_exec)
+      __kmp_task_team_sync(this_thr, team);
+  }
+  ANNOTATE_BARRIER_END(&team->t.t_bar);
+}
+
+// Join barrier: every team thread runs the gather phase configured for
+// bs_forkjoin_barrier; no release routine is called from this function.
+void __kmp_join_barrier(int gtid) {
+  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
+  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
+  kmp_info_t *this_thr = __kmp_threads[gtid];
+  kmp_team_t *team;
+  kmp_uint nproc;
+  kmp_info_t *master_thread;
+  int tid;
+#ifdef KMP_DEBUG
+  int team_id;
+#endif /* KMP_DEBUG */
+#if USE_ITT_BUILD
+  void *itt_sync_obj = NULL;
+#if USE_ITT_NOTIFY
+  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
+    // Get object created at fork_barrier
+    itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
+#endif /* USE_ITT_NOTIFY */
+#endif /* USE_ITT_BUILD */
+  KMP_MB();
+
+  // Get current info
+  team = this_thr->th.th_team;
+  nproc = this_thr->th.th_team_nproc;
+  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
+  tid = __kmp_tid_from_gtid(gtid);
+#ifdef KMP_DEBUG
+  team_id = team->t.t_id;
+#endif /* KMP_DEBUG */
+  master_thread = this_thr->th.th_team_master;
+#ifdef KMP_DEBUG
+  if (master_thread != team->t.t_threads[0]) {
+    __kmp_print_structure(); // dump state before the assertion below fires
+  }
+#endif /* KMP_DEBUG */
+  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
+  KMP_MB();
+
+  // Verify state
+  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
+  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
+  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
+  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
+  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
+                gtid, team_id, tid));
+
+  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+#if OMPT_OPTIONAL
+    ompt_data_t *my_task_data;
+    ompt_data_t *my_parallel_data;
+    void *codeptr = NULL;
+    int ds_tid = this_thr->th.th_info.ds.ds_tid;
+    if (KMP_MASTER_TID(ds_tid) &&
+        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
+         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
+      codeptr = team->t.ompt_team_info.master_return_address;
+    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
+    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
+    if (ompt_enabled.ompt_callback_sync_region) {
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
+          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
+          my_task_data, codeptr);
+    }
+    if (ompt_enabled.ompt_callback_sync_region_wait) {
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
+          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
+          my_task_data, codeptr);
+    }
+    if (!KMP_MASTER_TID(ds_tid))
+      this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
+#endif /* OMPT_OPTIONAL */
+    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
+  }
+#endif /* OMPT_SUPPORT */
+
+  if (__kmp_tasking_mode == tskm_extra_barrier) {
+    __kmp_tasking_barrier(team, this_thr, gtid);
+    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
+                  team_id, tid));
+  }
+#ifdef KMP_DEBUG
+  if (__kmp_tasking_mode != tskm_immediate_exec) {
+    KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
+                  "%p, th_task_team = %p\n",
+                  __kmp_gtid_from_thread(this_thr), team_id,
+                  team->t.t_task_team[this_thr->th.th_task_state],
+                  this_thr->th.th_task_team));
+    KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
+                     team->t.t_task_team[this_thr->th.th_task_state]);
+  }
+#endif /* KMP_DEBUG */
+
+  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
+     access it when the team struct is not guaranteed to exist. Doing these
+     loads causes a cache miss that slows down EPCC parallel by 2x. As a
+     workaround, we do not perform the copy if blocktime=infinite, since the
+     values are not used by __kmp_wait_template() in that case. */
+  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
+#if KMP_USE_MONITOR
+    this_thr->th.th_team_bt_intervals =
+        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
+    this_thr->th.th_team_bt_set =
+        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
+#else
+    this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
+#endif
+  }
+
+#if USE_ITT_BUILD
+  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
+    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
+#endif /* USE_ITT_BUILD */
+
+  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
+  case bp_hyper_bar: {
+    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
+    __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
+                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
+    break;
+  }
+  case bp_hierarchical_bar: {
+    __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
+                                      NULL USE_ITT_BUILD_ARG(itt_sync_obj));
+    break;
+  }
+  case bp_tree_bar: {
+    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
+    __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
+                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
+    break;
+  }
+  default: {
+    __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
+                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
+  }
+  }
+
+  /* From this point on, the team data structure may be deallocated at any time
+     by the master thread - it is unsafe to reference it in any of the worker
+     threads. Any per-team data items that need to be referenced before the
+     end of the barrier should be moved to the kmp_task_team_t structs.  */
+  if (KMP_MASTER_TID(tid)) {
+    if (__kmp_tasking_mode != tskm_immediate_exec) {
+      __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
+    }
+    if (__kmp_display_affinity) {
+      KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
+    }
+#if KMP_STATS_ENABLED
+    // Master flags the workers as waiting for the next parallel region. Also
+    // wake them up so they switch their timers to idle.
+    for (int i = 0; i < team->t.t_nproc; ++i) {
+      kmp_info_t *team_thread = team->t.t_threads[i];
+      if (team_thread == this_thr)
+        continue;
+      team_thread->th.th_stats->setIdleFlag();
+      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
+          team_thread->th.th_sleep_loc != NULL)
+        __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
+                                  team_thread->th.th_sleep_loc);
+    }
+#endif /* KMP_STATS_ENABLED */
+#if USE_ITT_BUILD
+    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
+      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
+#endif /* USE_ITT_BUILD */
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+    // Join barrier - report frame end
+    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
+        __kmp_forkjoin_frames_mode && this_thr->th.th_teams_microtask == NULL &&
+        team->t.t_active_level == 1) {
+      kmp_uint64 cur_time = __itt_get_timestamp();
+      ident_t *loc = team->t.t_ident;
+      kmp_info_t **other_threads = team->t.t_threads;
+      int nproc = this_thr->th.th_team_nproc; // shadows outer kmp_uint nproc
+      int i;
+      switch (__kmp_forkjoin_frames_mode) {
+      case 1:
+        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
+                               loc, nproc);
+        break;
+      case 2:
+        __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
+                               loc, nproc);
+        break;
+      case 3:
+        if (__itt_metadata_add_ptr) {
+          // Initialize with master's wait time
+          kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
+          // Set arrive time to zero to be able to check it in
+          // __kmp_invoke_task(); the same is done inside the loop below
+          this_thr->th.th_bar_arrive_time = 0;
+          for (i = 1; i < nproc; ++i) {
+            delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
+            other_threads[i]->th.th_bar_arrive_time = 0;
+          }
+          __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
+                                       cur_time, delta, 0);
+        }
+        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
+                               loc, nproc);
+        this_thr->th.th_frame_time = cur_time;
+        break;
+      }
+    }
+#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
+  }
+#if USE_ITT_BUILD
+  else {
+    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
+      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
+  }
+#endif /* USE_ITT_BUILD */
+
+#ifdef KMP_DEBUG
+  if (KMP_MASTER_TID(tid)) {
+    KA_TRACE(
+        15,
+        ("__kmp_join_barrier: T#%d(%d:%d) says all %u team threads arrived\n",
+         gtid, team_id, tid, nproc));
+  }
+#endif /* KMP_DEBUG */
+
+  // TODO now, mark worker threads as done so they may be disbanded
+  KMP_MB(); // Flush all pending memory write invalidates.
+  KA_TRACE(10,
+           ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
+
+  ANNOTATE_BARRIER_END(&team->t.t_bar);
+}
+
+// Fork barrier: runs the release phase configured for bs_forkjoin_barrier.
+// team comes from th_team up front only when tid == 0; otherwise it is NULL.
+// TODO release worker threads' fork barriers as we are ready instead of all at
+// once
+void __kmp_fork_barrier(int gtid, int tid) {
+  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
+  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
+  kmp_info_t *this_thr = __kmp_threads[gtid];
+  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
+#if USE_ITT_BUILD
+  void *itt_sync_obj = NULL;
+#endif /* USE_ITT_BUILD */
+  if (team)
+    ANNOTATE_BARRIER_END(&team->t.t_bar);
+
+  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
+                (team != NULL) ? team->t.t_id : -1, tid));
+
+  // th_team pointer only valid for master thread here
+  if (KMP_MASTER_TID(tid)) {
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
+      // Create itt barrier object
+      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
+      __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
+    }
+#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
+
+#ifdef KMP_DEBUG
+    kmp_info_t **other_threads = team->t.t_threads;
+    int i;
+
+    // Verify state
+    KMP_MB();
+    for (i = 1; i < team->t.t_nproc; ++i) {
+      KA_TRACE(500,
+               ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
+                "== %u.\n",
+                gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
+                team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
+                other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
+      KMP_DEBUG_ASSERT(
+          (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
+           ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
+      KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
+    }
+#endif /* KMP_DEBUG */
+
+    if (__kmp_tasking_mode != tskm_immediate_exec) {
+      // 0 indicates setup current task team if nthreads > 1
+      __kmp_task_team_setup(this_thr, team, 0);
+    }
+
+    /* The master thread may have changed its blocktime between the join barrier
+       and the fork barrier. Copy the blocktime info to the thread, where
+       __kmp_wait_template() can access it when the team struct is not
+       guaranteed to exist. See the note on the corresponding code in
+       __kmp_join_barrier() about it being performance-critical. */
+    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
+#if KMP_USE_MONITOR
+      this_thr->th.th_team_bt_intervals =
+          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
+      this_thr->th.th_team_bt_set =
+          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
+#else
+      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
+#endif
+    }
+  } // master
+
+  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
+  case bp_hyper_bar: {
+    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
+    __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
+                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
+    break;
+  }
+  case bp_hierarchical_bar: {
+    __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
+                                       TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
+    break;
+  }
+  case bp_tree_bar: {
+    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
+    __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
+                               TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
+    break;
+  }
+  default: {
+    __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
+                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
+  }
+  }
+
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled &&
+      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
+    int ds_tid = this_thr->th.th_info.ds.ds_tid;
+    ompt_data_t *task_data = team ? OMPT_CUR_TASK_DATA(this_thr)
+                                  : &(this_thr->th.ompt_thread_info.task_data);
+    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
+#if OMPT_OPTIONAL
+    void *codeptr = NULL;
+    // team is NULL unless tid == 0 was passed in, so guard the dereference.
+    if (KMP_MASTER_TID(ds_tid) &&
+        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
+         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
+      codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
+    if (ompt_enabled.ompt_callback_sync_region_wait) {
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
+          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
+          codeptr);
+    }
+    if (ompt_enabled.ompt_callback_sync_region) {
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
+          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
+          codeptr);
+    }
+#endif /* OMPT_OPTIONAL */
+    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
+      // TODO: Can this be ompt_task_initial?
+      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+          ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit);
+    }
+  }
+#endif /* OMPT_SUPPORT */
+
+  // Early exit for reaping threads releasing forkjoin barrier
+  if (TCR_4(__kmp_global.g.g_done)) {
+    this_thr->th.th_task_team = NULL;
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
+      if (!KMP_MASTER_TID(tid)) {
+        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
+        if (itt_sync_obj)
+          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
+      }
+    }
+#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
+    KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
+    return;
+  }
+
+  /* We can now assume that a valid team structure has been allocated by the
+     master and propagated to all worker threads. The current thread, however,
+     may not be part of the team, so we can't blindly assume that the team
+     pointer is non-null.  */
+  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
+  KMP_DEBUG_ASSERT(team != NULL);
+  tid = __kmp_tid_from_gtid(gtid);
+
+#if KMP_BARRIER_ICV_PULL
+  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
+     __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
+     implicit task has this data before this function is called. We cannot
+     modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
+     struct, because it is not always the case that the threads arrays have
+     been allocated when __kmp_fork_call() is executed. */
+  {
+    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
+    if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
+      // Copy the initial ICVs from the master's thread struct to the implicit
+      // task for this tid.
+      KA_TRACE(10,
+               ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
+      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
+                               tid, FALSE);
+      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
+                &team->t.t_threads[0]
+                     ->th.th_bar[bs_forkjoin_barrier]
+                     .bb.th_fixed_icvs);
+    }
+  }
+#endif // KMP_BARRIER_ICV_PULL
+
+  if (__kmp_tasking_mode != tskm_immediate_exec) {
+    __kmp_task_team_sync(this_thr, team);
+  }
+
+#if KMP_AFFINITY_SUPPORTED
+  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
+  if (proc_bind == proc_bind_intel) {
+    // Call dynamic affinity settings
+    if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
+      __kmp_balanced_affinity(this_thr, team->t.t_nproc);
+    }
+  } else if (proc_bind != proc_bind_false) {
+    if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
+      KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
+                     __kmp_gtid_from_thread(this_thr),
+                     this_thr->th.th_current_place));
+    } else {
+      __kmp_affinity_set_place(gtid);
+    }
+  }
+#endif // KMP_AFFINITY_SUPPORTED
+  // Perform the display affinity functionality
+  if (__kmp_display_affinity) {
+    if (team->t.t_display_affinity
+#if KMP_AFFINITY_SUPPORTED
+        || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
+#endif
+            ) {
+      // NULL means use the affinity-format-var ICV
+      __kmp_aux_display_affinity(gtid, NULL);
+      this_thr->th.th_prev_num_threads = team->t.t_nproc;
+      this_thr->th.th_prev_level = team->t.t_level;
+    }
+  }
+  if (!KMP_MASTER_TID(tid))
+    KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
+
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
+    if (!KMP_MASTER_TID(tid)) {
+      // Get correct barrier object
+      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
+      __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
+    } // (prepare called inside barrier_release)
+  }
+#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
+  ANNOTATE_BARRIER_END(&team->t.t_bar);
+  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
+                team->t.t_id, tid));
+}
+
+// Makes new_icvs available to the team's threads ahead of the fork barrier,
+// using whichever strategy (PULL, PUSH or linear copy) was compiled in.
+// loc is only consumed by the linear strategy (__kmp_init_implicit_task).
+void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
+                          kmp_internal_control_t *new_icvs, ident_t *loc) {
+  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
+
+  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
+  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
+
+// The master's own ICVs were set up on its implicit taskdata in
+// __kmp_reinitialize_team; __kmp_fork_call() assumes they are already there.
+#if KMP_BARRIER_ICV_PULL
+  // PULL: stash the ICVs in the master's th_fixed_icvs (which remains
+  // untouched); workers make their own copies from there after the barrier.
+  kmp_info_t *master = team->t.t_threads[0];
+  KMP_DEBUG_ASSERT(master); // The threads arrays should exist by now
+  copy_icvs(&master->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
+            new_icvs);
+  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
+                master, team));
+#elif KMP_BARRIER_ICV_PUSH
+  // PUSH: the fork barrier itself propagates the ICVs, so nothing needs to be
+  // done here apart from tracing.
+  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
+                team->t.t_threads[0], team));
+#else
+  // LINEAR: hand each non-master thread its own copy of the ICVs. This takes
+  // O(nthreads) time.
+  ngo_load(new_icvs);
+  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should exist
+  // TODO: GEH - pass in better source location info since usually NULL here
+  for (int tid = 1; tid < new_nproc; ++tid) { // tid 0 (master) is skipped
+    kmp_info_t *worker = team->t.t_threads[tid];
+    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
+                  tid, worker, team));
+    __kmp_init_implicit_task(loc, worker, team, tid, FALSE);
+    ngo_store_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, new_icvs);
+    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
+                  tid, worker, team));
+  }
+  ngo_sync();
+#endif // KMP_BARRIER_ICV_PULL
+}
diff --git a/final/runtime/src/kmp_cancel.cpp b/final/runtime/src/kmp_cancel.cpp
new file mode 100644
index 0000000..d129049
--- /dev/null
+++ b/final/runtime/src/kmp_cancel.cpp
@@ -0,0 +1,331 @@
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+#include "kmp_i18n.h"
+#include "kmp_io.h"
+#include "kmp_str.h"
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
+/*!
+@ingroup CANCELLATION
+@param loc_ref location of the original task directive (not read here)
+@param gtid Global thread ID of encountering thread
+@param cncl_kind Cancellation kind (parallel, for, sections, taskgroup)
+
+@return returns true (1) if the cancellation request has been activated and the
+execution thread needs to proceed to the end of the canceled region; 0 if not.
+
+Request cancellation of the binding OpenMP region.
+*/
+kmp_int32 __kmpc_cancel(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 cncl_kind) {
+  kmp_info_t *this_thr = __kmp_threads[gtid];
+
+  KC_TRACE(10, ("__kmpc_cancel: T#%d request %d OMP_CANCELLATION=%d\n", gtid,
+                cncl_kind, __kmp_omp_cancellation));
+  // Sanity checks: a real request kind, issued by the thread that owns gtid.
+  KMP_DEBUG_ASSERT(cncl_kind != cancel_noreq);
+  KMP_DEBUG_ASSERT(cncl_kind == cancel_parallel || cncl_kind == cancel_loop ||
+                   cncl_kind == cancel_sections ||
+                   cncl_kind == cancel_taskgroup);
+  KMP_DEBUG_ASSERT(__kmp_get_gtid() == gtid);
+
+  if (__kmp_omp_cancellation) {
+    switch (cncl_kind) {
+    case cancel_parallel:
+    case cancel_loop:
+    case cancel_sections:
+      // cancellation requests for parallel and worksharing constructs
+      // are handled through the team structure
+      {
+        kmp_team_t *this_team = this_thr->th.th_team;
+        KMP_DEBUG_ASSERT(this_team);
+        kmp_int32 old = cancel_noreq; // value the compare-exchange expects
+        this_team->t.t_cancel_request.compare_exchange_strong(old, cncl_kind);
+        if (old == cancel_noreq || old == cncl_kind) {
+// old is still cancel_noreq (our request was installed) or it now holds the
+// same kind (an identical request was already pending) -> cancel
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+          if (ompt_enabled.ompt_callback_cancel) {
+            ompt_data_t *task_data;
+            __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL,
+                                          NULL);
+            ompt_cancel_flag_t type = ompt_cancel_parallel;
+            if (cncl_kind == cancel_parallel)
+              type = ompt_cancel_parallel;
+            else if (cncl_kind == cancel_loop)
+              type = ompt_cancel_loop;
+            else if (cncl_kind == cancel_sections)
+              type = ompt_cancel_sections;
+            ompt_callbacks.ompt_callback(ompt_callback_cancel)(
+                task_data, type | ompt_cancel_activated,
+                OMPT_GET_RETURN_ADDRESS(0));
+          }
+#endif // OMPT_SUPPORT && OMPT_OPTIONAL
+          return 1 /* true */;
+        }
+        break; // a request of a different kind is already pending
+      }
+    case cancel_taskgroup:
+      // cancellation requests for a task group
+      // are handled through the taskgroup structure
+      {
+        kmp_taskdata_t *task;
+        kmp_taskgroup_t *taskgroup;
+
+        task = this_thr->th.th_current_task;
+        KMP_DEBUG_ASSERT(task);
+
+        taskgroup = task->td_taskgroup;
+        if (taskgroup) {
+          kmp_int32 old = cancel_noreq; // value the compare-exchange expects
+          taskgroup->cancel_request.compare_exchange_strong(old, cncl_kind);
+          if (old == cancel_noreq || old == cncl_kind) {
+// we do not have a cancellation request in this taskgroup or we do
+// have one that matches the current request -> cancel
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+            if (ompt_enabled.ompt_callback_cancel) {
+              ompt_data_t *task_data;
+              __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL,
+                                            NULL);
+              ompt_callbacks.ompt_callback(ompt_callback_cancel)(
+                  task_data, ompt_cancel_taskgroup | ompt_cancel_activated,
+                  OMPT_GET_RETURN_ADDRESS(0));
+            }
+#endif
+            return 1 /* true */;
+          }
+        } else {
+          // TODO: what needs to happen here?
+          // the specification disallows cancellation w/o taskgroups
+          // so we might do anything here, let's abort for now
+          KMP_ASSERT(0 /* false */);
+        }
+      }
+      break;
+    default:
+      KMP_ASSERT(0 /* false */);
+    }
+  }
+  // ICV OMP_CANCELLATION=false, so we ignored this cancel request.
+  // NOTE(review): the mismatch `break` above also lands here - confirm assert.
+  KMP_DEBUG_ASSERT(!__kmp_omp_cancellation);
+  return 0 /* false */;
+}
+
+/*!
+@ingroup CANCELLATION
+@param loc_ref location of the original task directive (not read here)
+@param gtid Global thread ID of encountering thread
+@param cncl_kind Cancellation kind (parallel, for, sections, taskgroup)
+
+@return returns true if a matching cancellation request has been flagged in the
+RTL and the encountering thread has to cancel.
+
+Cancellation point for the encountering thread.
+*/
+kmp_int32 __kmpc_cancellationpoint(ident_t *loc_ref, kmp_int32 gtid,
+                                   kmp_int32 cncl_kind) {
+  kmp_info_t *this_thr = __kmp_threads[gtid];
+
+  KC_TRACE(10,
+           ("__kmpc_cancellationpoint: T#%d request %d OMP_CANCELLATION=%d\n",
+            gtid, cncl_kind, __kmp_omp_cancellation));
+  // Sanity checks: a real request kind, issued by the thread that owns gtid.
+  KMP_DEBUG_ASSERT(cncl_kind != cancel_noreq);
+  KMP_DEBUG_ASSERT(cncl_kind == cancel_parallel || cncl_kind == cancel_loop ||
+                   cncl_kind == cancel_sections ||
+                   cncl_kind == cancel_taskgroup);
+  KMP_DEBUG_ASSERT(__kmp_get_gtid() == gtid);
+
+  if (__kmp_omp_cancellation) {
+    switch (cncl_kind) {
+    case cancel_parallel:
+    case cancel_loop:
+    case cancel_sections:
+      // cancellation requests for parallel and worksharing constructs
+      // are handled through the team structure
+      {
+        kmp_team_t *this_team = this_thr->th.th_team;
+        KMP_DEBUG_ASSERT(this_team);
+        if (this_team->t.t_cancel_request) {
+          if (cncl_kind == this_team->t.t_cancel_request) {
+// the request in the team structure matches the type of
+// cancellation point so we can cancel
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+            if (ompt_enabled.ompt_callback_cancel) {
+              ompt_data_t *task_data;
+              __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL,
+                                            NULL);
+              ompt_cancel_flag_t type = ompt_cancel_parallel;
+              if (cncl_kind == cancel_parallel)
+                type = ompt_cancel_parallel;
+              else if (cncl_kind == cancel_loop)
+                type = ompt_cancel_loop;
+              else if (cncl_kind == cancel_sections)
+                type = ompt_cancel_sections;
+              ompt_callbacks.ompt_callback(ompt_callback_cancel)(
+                  task_data, type | ompt_cancel_detected,
+                  OMPT_GET_RETURN_ADDRESS(0));
+            }
+#endif
+            return 1 /* true */;
+          }
+          KMP_ASSERT(0 /* false */); // pending request is of another kind
+        } else {
+          // we do not have a cancellation request pending, so we just
+          // ignore this cancellation point
+          return 0;
+        }
+        break; // reached only if the KMP_ASSERT(0) above returns
+      }
+    case cancel_taskgroup:
+      // cancellation requests for a task group
+      // are handled through the taskgroup structure
+      {
+        kmp_taskdata_t *task;
+        kmp_taskgroup_t *taskgroup;
+
+        task = this_thr->th.th_current_task;
+        KMP_DEBUG_ASSERT(task);
+
+        taskgroup = task->td_taskgroup;
+        if (taskgroup) {
+// return the current status of cancellation for the taskgroup
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+          if (ompt_enabled.ompt_callback_cancel &&
+              !!taskgroup->cancel_request) {
+            ompt_data_t *task_data;
+            __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL,
+                                          NULL);
+            ompt_callbacks.ompt_callback(ompt_callback_cancel)(
+                task_data, ompt_cancel_taskgroup | ompt_cancel_detected,
+                OMPT_GET_RETURN_ADDRESS(0));
+          }
+#endif
+          return !!taskgroup->cancel_request; // normalized to 0 or 1
+        } else {
+          // if a cancellation point is encountered by a task that does not
+          // belong to a taskgroup, it is OK to ignore it
+          return 0 /* false */;
+        }
+      }
+    default:
+      KMP_ASSERT(0 /* false */);
+    }
+  }
+
+  // ICV OMP_CANCELLATION=false, so we ignore the cancellation point
+  KMP_DEBUG_ASSERT(!__kmp_omp_cancellation);
+  return 0 /* false */;
+}
+
+/*!
+@ingroup CANCELLATION
+@param loc source location information; forwarded to __kmpc_barrier()
+@param gtid Global thread ID of encountering thread
+
+@return returns true if a matching cancellation request has been flagged in the
+RTL and the encountering thread has to cancel.
+
+Barrier with cancellation point to send threads from the barrier to the
+end of the parallel region.  Needs a special code pattern as documented
+in the design document for the cancellation feature.
+*/
+kmp_int32 __kmpc_cancel_barrier(ident_t *loc, kmp_int32 gtid) {
+  int ret = 0 /* false */;
+  kmp_info_t *this_thr = __kmp_threads[gtid];
+  kmp_team_t *this_team = this_thr->th.th_team;
+
+  KMP_DEBUG_ASSERT(__kmp_get_gtid() == gtid);
+
+  // call into the standard barrier
+  __kmpc_barrier(loc, gtid);
+
+  // if cancellation is active, check cancellation flag
+  if (__kmp_omp_cancellation) {
+    // depending on which construct to cancel, check the flag and
+    // reset the flag
+    switch (KMP_ATOMIC_LD_RLX(&(this_team->t.t_cancel_request))) {
+    case cancel_parallel:
+      ret = 1;
+      // ensure that threads have checked the flag, when
+      // leaving the above barrier
+      __kmpc_barrier(loc, gtid);
+      this_team->t.t_cancel_request = cancel_noreq;
+      // the next barrier is the fork/join barrier, which
+      // synchronizes the threads leaving here
+      break;
+    case cancel_loop:
+    case cancel_sections:
+      ret = 1;
+      // ensure that threads have checked the flag, when
+      // leaving the above barrier
+      __kmpc_barrier(loc, gtid);
+      this_team->t.t_cancel_request = cancel_noreq;
+      // synchronize the threads again to make sure we do not have any run-away
+      // threads that cause a race on the cancellation flag
+      __kmpc_barrier(loc, gtid);
+      break;
+    case cancel_taskgroup:
+      // this case should not occur
+      KMP_ASSERT(0 /* false */);
+      break;
+    case cancel_noreq:
+      // no request pending: ret stays 0 and no extra barrier is executed
+      break;
+    default:
+      KMP_ASSERT(0 /* false */);
+    }
+  }
+  // ret is 1 only for a pending parallel, loop or sections request.
+  return ret;
+}
+
+/*!
+@ingroup CANCELLATION
+@param cancel_kind Cancellation kind to query (parallel, for, sections,
+taskgroup)
+
+@return For parallel/for/sections: true if the calling thread's team has a
+pending request of exactly that kind. For taskgroup: true if the current task
+belongs to a taskgroup with a pending request. Otherwise (including
+OMP_CANCELLATION disabled or an unrecognized kind) false.
+
+Query function to query the current status of cancellation requests.
+Can be used to implement the following pattern:
+
+if (kmp_get_cancellation_status(kmp_cancel_parallel)) {
+    perform_cleanup();
+    #pragma omp cancellation point parallel
+}
+*/
+int __kmp_get_cancellation_status(int cancel_kind) {
+  // With cancellation switched off the answer is always false.
+  if (!__kmp_omp_cancellation)
+    return 0 /* false */;
+
+  kmp_info_t *caller = __kmp_entry_thread();
+  // Parallel and worksharing requests are looked up on the team.
+  if (cancel_kind == cancel_parallel || cancel_kind == cancel_loop ||
+      cancel_kind == cancel_sections) {
+    kmp_team_t *team = caller->th.th_team;
+    return team->t.t_cancel_request == cancel_kind;
+  }
+
+  // Taskgroup requests are looked up on the current task's taskgroup, if any.
+  if (cancel_kind == cancel_taskgroup) {
+    kmp_taskgroup_t *group = caller->th.th_current_task->td_taskgroup;
+    return group && group->cancel_request;
+  }
+
+  return 0 /* false */;
+}
diff --git a/final/runtime/src/kmp_config.h.cmake b/final/runtime/src/kmp_config.h.cmake
new file mode 100644
index 0000000..e3a1a8d
--- /dev/null
+++ b/final/runtime/src/kmp_config.h.cmake
@@ -0,0 +1,111 @@
+/*
+ * kmp_config.h -- Feature macros
+ */
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef KMP_CONFIG_H
+#define KMP_CONFIG_H
+
+#include "kmp_platform.h"
+
+// Template directives used below: "cmakedefine01 MACRO" becomes a definition
+// of MACRO as 0 or 1; "cmakedefine MACRO 1" defines MACRO as 1 or omits it.
+#cmakedefine01 DEBUG_BUILD
+#cmakedefine01 RELWITHDEBINFO_BUILD
+#cmakedefine01 LIBOMP_USE_ITT_NOTIFY
+#define USE_ITT_NOTIFY LIBOMP_USE_ITT_NOTIFY // alias of the option above
+#if ! LIBOMP_USE_ITT_NOTIFY
+# define INTEL_NO_ITTNOTIFY_API // defined only when the option is 0
+#endif
+#cmakedefine01 LIBOMP_USE_VERSION_SYMBOLS
+#if LIBOMP_USE_VERSION_SYMBOLS
+# define KMP_USE_VERSION_SYMBOLS // defined only when the option is nonzero
+#endif
+#cmakedefine01 LIBOMP_HAVE_WEAK_ATTRIBUTE
+#define KMP_HAVE_WEAK_ATTRIBUTE LIBOMP_HAVE_WEAK_ATTRIBUTE
+#cmakedefine01 LIBOMP_HAVE_PSAPI
+#define KMP_HAVE_PSAPI LIBOMP_HAVE_PSAPI
+#cmakedefine01 LIBOMP_STATS
+#define KMP_STATS_ENABLED LIBOMP_STATS
+#cmakedefine01 LIBOMP_HAVE_X86INTRIN_H
+#define KMP_HAVE_X86INTRIN_H LIBOMP_HAVE_X86INTRIN_H
+#cmakedefine01 LIBOMP_HAVE___BUILTIN_READCYCLECOUNTER
+#define KMP_HAVE___BUILTIN_READCYCLECOUNTER LIBOMP_HAVE___BUILTIN_READCYCLECOUNTER
+#cmakedefine01 LIBOMP_HAVE___RDTSC
+#define KMP_HAVE___RDTSC LIBOMP_HAVE___RDTSC
+#cmakedefine01 LIBOMP_USE_DEBUGGER
+#define USE_DEBUGGER LIBOMP_USE_DEBUGGER
+#cmakedefine01 LIBOMP_OMPT_DEBUG
+#define OMPT_DEBUG LIBOMP_OMPT_DEBUG
+#cmakedefine01 LIBOMP_OMPT_SUPPORT
+#define OMPT_SUPPORT LIBOMP_OMPT_SUPPORT
+#cmakedefine01 LIBOMP_OMPT_OPTIONAL
+#define OMPT_OPTIONAL LIBOMP_OMPT_OPTIONAL
+#cmakedefine01 LIBOMP_USE_ADAPTIVE_LOCKS
+#define KMP_USE_ADAPTIVE_LOCKS LIBOMP_USE_ADAPTIVE_LOCKS
+#define KMP_DEBUG_ADAPTIVE_LOCKS 0 // fixed literal, not a build option
+#cmakedefine01 LIBOMP_USE_INTERNODE_ALIGNMENT
+#define KMP_USE_INTERNODE_ALIGNMENT LIBOMP_USE_INTERNODE_ALIGNMENT
+#cmakedefine01 LIBOMP_ENABLE_ASSERTIONS
+#define KMP_USE_ASSERT LIBOMP_ENABLE_ASSERTIONS
+#cmakedefine01 LIBOMP_USE_HIER_SCHED
+#define KMP_USE_HIER_SCHED LIBOMP_USE_HIER_SCHED
+#cmakedefine01 STUBS_LIBRARY
+#cmakedefine01 LIBOMP_USE_HWLOC
+#define KMP_USE_HWLOC LIBOMP_USE_HWLOC
+#cmakedefine01 LIBOMP_ENABLE_SHARED
+#define KMP_DYNAMIC_LIB LIBOMP_ENABLE_SHARED
+#define KMP_ARCH_STR "@LIBOMP_LEGAL_ARCH@"
+#define KMP_LIBRARY_FILE "@LIBOMP_LIB_FILE@"
+#define KMP_VERSION_MAJOR @LIBOMP_VERSION_MAJOR@
+#define KMP_VERSION_MINOR @LIBOMP_VERSION_MINOR@
+#cmakedefine01 LIBOMP_TSAN_SUPPORT
+#if LIBOMP_TSAN_SUPPORT
+#define TSAN_SUPPORT // defined only when the option is nonzero
+#endif
+#cmakedefine01 MSVC
+#define KMP_MSVC_COMPAT MSVC
+
+// Cache line size in CACHE_LINE: 128 when KMP_ARCH_PPC64 is set, else 64
+#if KMP_ARCH_PPC64
+# define CACHE_LINE 128
+#else
+# define CACHE_LINE 64
+#endif
+// BUILD_I8 is defined only when the target is not a 32-bit architecture.
+#if ! KMP_32_BIT_ARCH
+# define BUILD_I8 1
+#endif
+// Fixed feature switches, followed by platform- and build-dependent ones.
+#define KMP_NESTED_HOT_TEAMS 1
+#define KMP_ADJUST_BLOCKTIME 1
+#define BUILD_PARALLEL_ORDERED 1
+#define KMP_ASM_INTRINS 1
+#define USE_ITT_BUILD LIBOMP_USE_ITT_NOTIFY
+#define INTEL_ITTNOTIFY_PREFIX __kmp_itt_
+#if ! KMP_MIC
+# define USE_LOAD_BALANCE 1
+#endif
+#if ! (KMP_OS_WINDOWS || KMP_OS_DARWIN)
+# define KMP_TDATA_GTID 1
+#endif
+#if STUBS_LIBRARY
+# define KMP_STUB 1
+#endif
+#if DEBUG_BUILD || RELWITHDEBINFO_BUILD
+# define KMP_DEBUG 1
+#endif
+// Windows gets KMP_WIN_CDECL; other systems get BUILD_TV and KMP_GOMP_COMPAT.
+#if KMP_OS_WINDOWS
+# define KMP_WIN_CDECL
+#else
+# define BUILD_TV
+# define KMP_GOMP_COMPAT
+#endif
+
+#endif // KMP_CONFIG_H
diff --git a/final/runtime/src/kmp_csupport.cpp b/final/runtime/src/kmp_csupport.cpp
new file mode 100644
index 0000000..c778c97
--- /dev/null
+++ b/final/runtime/src/kmp_csupport.cpp
@@ -0,0 +1,4187 @@
+/*
+ * kmp_csupport.cpp -- kfront linkage support for OpenMP.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define __KMP_IMP
+#include "omp.h" /* extern "C" declarations of user-visible routines */
+#include "kmp.h"
+#include "kmp_error.h"
+#include "kmp_i18n.h"
+#include "kmp_itt.h"
+#include "kmp_lock.h"
+#include "kmp_stats.h"
+
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
+#define MAX_MESSAGE 512
+
+// flags will be used in future, e.g. to implement openmp_strict library
+// restrictions
+
+/*!
+ * @ingroup STARTUP_SHUTDOWN
+ * @param loc   in   source location information
+ * @param flags in   for future use (currently ignored)
+ *
+ * Initialize the runtime library. This call is optional; if it is not made then
+ * it will be implicitly called by attempts to use other library functions.
+ */
+void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
+  // Two opt-in paths; when neither applies this entry point does nothing.
+  char *bind_request = getenv("KMP_INITIAL_THREAD_BIND");
+  if (bind_request != NULL && __kmp_str_match_true(bind_request)) {
+    __kmp_middle_initialize();
+    KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
+    return;
+  }
+  if (__kmp_ignore_mppbeg() != FALSE)
+    return; // the default: __kmp_ignore_mppbeg() normally reports TRUE
+  __kmp_internal_begin();
+  KC_TRACE(10, ("__kmpc_begin: called\n"));
+}
+
+/*!
+ * @ingroup STARTUP_SHUTDOWN
+ * @param loc source location information
+ *
+ * Shutdown the runtime library. This is also optional, and even if called will
+ * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
+ * zero.
+ */
+void __kmpc_end(ident_t *loc) {
+  // __kmp_ignore_mppend() yields TRUE by default, which turns this call into
+  // a no-op. Setting the KMP_IGNORE_MPPEND environment variable to 0 makes it
+  // yield FALSE; only then is this root unregistered (which may shut the
+  // library down).
+  const bool honor_end_request = (__kmp_ignore_mppend() == FALSE);
+  if (honor_end_request) {
+    KC_TRACE(10, ("__kmpc_end: called\n"));
+    KA_TRACE(30, ("__kmpc_end\n"));
+    __kmp_internal_end_thread(-1);
+  }
+#if KMP_OS_WINDOWS && OMPT_SUPPORT
+  // On Windows the normal process exit does not let the worker threads of the
+  // last parallel region finish reporting their events. Shutting the library
+  // down here works around that, at least when __kmpc_end() is placed
+  // properly.
+  if (ompt_enabled.enabled) {
+    __kmp_internal_end_library(__kmp_gtid_get_specific());
+  }
+#endif
+}
+
+/*!
+@ingroup THREAD_STATES
+@param loc Source location information.
+@return The global thread index of the active thread.
+
+This function can be called in any context.
+
+If the runtime has only been entered at the outermost level from a
+single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
+that which would be returned by omp_get_thread_num() in the outermost
+active parallel construct. (Or zero if there is no active parallel
+construct, since the master thread is necessarily thread zero).
+
+If multiple non-OpenMP threads all enter an OpenMP construct then this
+will be a unique thread identifier among all the threads created by
+the OpenMP runtime (but the value cannot be defined in terms of
+OpenMP thread ids returned by omp_get_thread_num()).
+*/
+kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
+  // Obtain the caller's global thread id from __kmp_entry_gtid(), trace it,
+  // and hand the same value back.
+  const kmp_int32 global_id = __kmp_entry_gtid();
+  KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", global_id));
+  return global_id;
+}
+
+/*!
+@ingroup THREAD_STATES
+@param loc Source location information.
+@return The number of threads under control of the OpenMP<sup>*</sup> runtime
+
+This function can be called in any context.
+It returns the total number of threads under the control of the OpenMP runtime.
+That is not a number that can be determined by any OpenMP standard calls, since
+the library may be called from more than one non-OpenMP thread, and this
+reflects the total over all such calls. Similarly the runtime maintains
+underlying threads even when they are not active (since the cost of creating
+and destroying OS threads is high), this call counts all such threads even if
+they are not waiting for work.
+*/
+kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
+  KC_TRACE(10,
+           ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
+  const kmp_int32 total_threads = TCR_4(__kmp_all_nth); // read through TCR_4
+  return total_threads;
+}
+
+/*!
+@ingroup THREAD_STATES
+@param loc Source location information.
+@return Calling thread's number within its innermost active parallel construct.
+*/
+kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
+  KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
+  const kmp_int32 caller_gtid = __kmp_entry_gtid(); // caller's global id
+  return __kmp_tid_from_gtid(caller_gtid);          // tid derived from it
+}
+
+/*!
+@ingroup THREAD_STATES
+@param loc Source location information.
+@return The number of threads in the innermost active parallel construct.
+*/
+kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
+  KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
+  kmp_info_t *caller = __kmp_entry_thread();
+  return caller->th.th_team->t.t_nproc; // t_nproc of the caller's current team
+}
+
+/*!
+ * @ingroup DEPRECATED
+ * @param loc location description
+ *
+ * This function need not be called. It always returns TRUE unless KMP_DEBUG is
+ * defined; then the __kmp_par_range filter is applied to loc->psource.
+ */
+kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
+#ifndef KMP_DEBUG
+  return TRUE;
+#else
+  // psource is scanned as "<prefix>;<path>;<routine>;<line>..." by locating
+  // its ';' separators. A missing or malformed string never blocks forking.
+  const char *semi2;
+  const char *semi3;
+  int line_no;
+  if (__kmp_par_range == 0) {
+    return TRUE; // filtering is disabled
+  }
+  if (loc == NULL || loc->psource == NULL) {
+    return TRUE;
+  }
+  semi2 = strchr(loc->psource, ';');
+  if (semi2 == NULL) {
+    return TRUE;
+  }
+  semi2 = strchr(semi2 + 1, ';');
+  if (semi2 == NULL) {
+    return TRUE;
+  }
+  if (__kmp_par_range_filename[0]) {
+    // Compare against the last '/'-separated component of the path field.
+    const char *name = semi2 - 1;
+    while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
+      name--;
+    }
+    if ((*name == '/') || (*name == ';')) {
+      name++;
+    }
+    if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
+      return __kmp_par_range < 0;
+    }
+  }
+  semi3 = strchr(semi2 + 1, ';');
+  if (semi3 == NULL) {
+    return TRUE; // no third separator, hence no line field to parse
+  }
+  if (__kmp_par_range_routine[0]) {
+    if (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1)) {
+      return __kmp_par_range < 0;
+    }
+  }
+  if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
+    if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
+      return __kmp_par_range > 0;
+    }
+    return __kmp_par_range < 0;
+  }
+  return TRUE;
+#endif /* KMP_DEBUG */
+}
+
+/*!
+@ingroup THREAD_STATES
+@param loc Source location information.
+@return 1 if the caller is executing inside an active parallel region, else 0.
+*/
+kmp_int32 __kmpc_in_parallel(ident_t *loc) {
+  kmp_info_t *caller = __kmp_entry_thread();
+  return caller->th.th_root->r.r_active; // r_active of the caller's root
+}
+
+/*!
+@ingroup PARALLEL
+@param loc source location information
+@param global_tid global thread number
+@param num_threads number of threads requested for this parallel construct
+
+Set the number of threads to be used by the next fork spawned by this thread.
+This call is only required if the parallel construct has a `num_threads` clause.
+*/
+void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
+                             kmp_int32 num_threads) {
+  KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
+                global_tid, num_threads));
+  // Thin exported wrapper: all work is done by __kmp_push_num_threads().
+  __kmp_push_num_threads(loc, global_tid, num_threads);
+}
+
+void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
+  KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
+  // Intentionally does nothing beyond tracing; both arguments are unused.
+  /* the num_threads are automatically popped */
+}
+
+void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
+                           kmp_int32 proc_bind) {
+  KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
+                proc_bind));
+  // Forward to the runtime; proc_bind is cast to kmp_proc_bind_t unvalidated.
+  __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
+}
+
+/*!
+@ingroup PARALLEL
+@param loc  source location information
+@param argc  total number of arguments in the ellipsis
+@param microtask  pointer to callback routine consisting of outlined parallel
+construct
+@param ...  pointers to shared variables that aren't global
+
+Do the actual fork and call the microtask in the relevant number of threads.
+*/
+void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
+  int gtid = __kmp_entry_gtid();
+  // gtid is reused below for the OMPT bookkeeping and the fork/join pair.
+#if (KMP_STATS_ENABLED)
+  // Account the fork as OMP_parallel_overhead (exchanging the current timer if
+  // we were in a serial region) and count the region as nested or non-nested.
+  stats_state_e previous_state = KMP_GET_THREAD_STATE();
+  if (previous_state == stats_state_e::SERIAL_REGION) {
+    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
+  } else {
+    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
+  }
+  int inParallel = __kmpc_in_parallel(loc);
+  if (inParallel) {
+    KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
+  } else {
+    KMP_COUNT_BLOCK(OMP_PARALLEL);
+  }
+#endif
+
+  // maybe to save thr_state is enough here
+  {
+    va_list ap;
+    va_start(ap, microtask);
+    // ap now refers to the variadic arguments that follow microtask.
+#if OMPT_SUPPORT
+    ompt_frame_t *ompt_frame;
+    if (ompt_enabled.enabled) {
+      kmp_info_t *master_th = __kmp_threads[gtid];
+      kmp_team_t *parent_team = master_th->th.th_team;
+      ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
+      if (lwt) // serialized team info exists: use its task frame
+        ompt_frame = &(lwt->ompt_task_info.frame);
+      else { // otherwise use this thread's implicit task frame
+        int tid = __kmp_tid_from_gtid(gtid);
+        ompt_frame = &(
+            parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
+      }
+      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+      OMPT_STORE_RETURN_ADDRESS(gtid);
+    }
+#endif
+
+#if INCLUDE_SSC_MARKS
+    SSC_MARK_FORKING();
+#endif
+    __kmp_fork_call(loc, gtid, fork_context_intel, argc,
+                    VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
+                    VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
+/* TODO: revert workaround for Intel(R) 64 tracker #96 */
+#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
+                    &ap // the va_list is passed by address on these targets
+#else
+                    ap
+#endif
+                    );
+#if INCLUDE_SSC_MARKS
+    SSC_MARK_JOINING();
+#endif
+    __kmp_join_call(loc, gtid
+#if OMPT_SUPPORT
+                    ,
+                    fork_context_intel
+#endif
+                    );
+    // Fork and join have both returned; release the argument list.
+    va_end(ap);
+  }
+
+#if KMP_STATS_ENABLED
+  if (previous_state == stats_state_e::SERIAL_REGION) {
+    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
+  } else {
+    KMP_POP_PARTITIONED_TIMER();
+  }
+#endif // KMP_STATS_ENABLED
+}
+
+/*!
+@ingroup PARALLEL
+@param loc source location information
+@param global_tid global thread number
+@param num_teams number of teams requested for the teams construct
+@param num_threads number of threads per team requested for the teams construct
+
+Set the number of teams to be used by the teams construct.
+This call is only required if the teams construct has a `num_teams` clause
+or a `thread_limit` clause (or both).
+*/
+void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
+                           kmp_int32 num_teams, kmp_int32 num_threads) {
+  KA_TRACE(20,
+           ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
+            global_tid, num_teams, num_threads));
+  // Thin exported wrapper: all work is done by __kmp_push_num_teams().
+  __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
+}
+
+/*!
+@ingroup PARALLEL
+@param loc  source location information
+@param argc  total number of arguments in the ellipsis
+@param microtask  pointer to callback routine consisting of outlined teams
+construct
+@param ...  pointers to shared variables that aren't global
+
+Do the actual fork and call the microtask in the relevant number of threads.
+*/
+void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
+                       ...) {
+  int gtid = __kmp_entry_gtid();
+  kmp_info_t *this_thr = __kmp_threads[gtid];
+  va_list ap;
+  va_start(ap, microtask);
+  // ap now refers to the variadic arguments that follow microtask.
+#if KMP_STATS_ENABLED
+  KMP_COUNT_BLOCK(OMP_TEAMS);
+  stats_state_e previous_state = KMP_GET_THREAD_STATE();
+  if (previous_state == stats_state_e::SERIAL_REGION) {
+    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
+  } else {
+    KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
+  }
+#endif
+
+  // remember teams entry point and nesting level
+  this_thr->th.th_teams_microtask = microtask;
+  this_thr->th.th_teams_level =
+      this_thr->th.th_team->t.t_level; // AC: can be >0 on host
+
+#if OMPT_SUPPORT
+  kmp_team_t *parent_team = this_thr->th.th_team;
+  int tid = __kmp_tid_from_gtid(gtid);
+  if (ompt_enabled.enabled) {
+    parent_team->t.t_implicit_task_taskdata[tid]
+        .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+  }
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+
+  // check if __kmpc_push_num_teams called, set default number of teams
+  // otherwise
+  if (this_thr->th.th_teams_size.nteams == 0) {
+    __kmp_push_num_teams(loc, gtid, 0, 0);
+  }
+  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
+  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
+  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
+  // The wrapped task is __kmp_teams_master; microtask itself was stored above.
+  __kmp_fork_call(loc, gtid, fork_context_intel, argc,
+                  VOLATILE_CAST(microtask_t)
+                      __kmp_teams_master, // "wrapped" task
+                  VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
+#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
+                  &ap
+#else
+                  ap
+#endif
+                  );
+  __kmp_join_call(loc, gtid
+#if OMPT_SUPPORT
+                  ,
+                  fork_context_intel
+#endif
+                  );
+
+  // Pop current CG root off list
+  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
+  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
+  this_thr->th.th_cg_roots = tmp->up;
+  KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
+                 " to node %p. cg_nthreads was %d\n",
+                 this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
+  KMP_DEBUG_ASSERT(tmp->cg_nthreads);
+  int i = tmp->cg_nthreads--;
+  if (i == 1) { // check if we are the last thread in CG (not always the case)
+    __kmp_free(tmp);
+  }
+  // Restore current task's thread_limit from CG root
+  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
+  this_thr->th.th_current_task->td_icvs.thread_limit =
+      this_thr->th.th_cg_roots->cg_thread_limit;
+  // Clear the per-thread teams state that was set up before the fork.
+  this_thr->th.th_teams_microtask = NULL;
+  this_thr->th.th_teams_level = 0;
+  *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L; // assumes 8-byte struct
+  va_end(ap);
+#if KMP_STATS_ENABLED
+  if (previous_state == stats_state_e::SERIAL_REGION) {
+    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
+  } else {
+    KMP_POP_PARTITIONED_TIMER();
+  }
+#endif // KMP_STATS_ENABLED
+}
+
+// Exported forwarder to __kmp_invoke_task_func(gtid); returns its result.
+// Historical note: the __kmpc_ prefix was misapplied and this probably should
+// never have been exported. No generated OpenMP code is believed to call it,
+// but it has been exported for so long that removing it is considered risky.
+int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
+
+/*!
+@ingroup PARALLEL
+@param loc  source location information
+@param global_tid  global thread number
+
+Enter a serialized parallel construct. This interface is used to handle a
+conditional parallel region, like this,
+@code
+#pragma omp parallel if (condition)
+@endcode
+when the condition is false.
+*/
+void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
+// Thin wrapper: with OMPT support it first invokes OMPT_STORE_RETURN_ADDRESS
+// for global_tid, then forwards to __kmp_serialized_parallel(), which lives in
+// kmp_runtime.cpp so that it can share static functions with kmp_fork_call.
+#if OMPT_SUPPORT
+  OMPT_STORE_RETURN_ADDRESS(global_tid);
+#endif
+  __kmp_serialized_parallel(loc, global_tid);
+}
+
+/*!
+@ingroup PARALLEL
+@param loc  source location information
+@param global_tid  global thread number
+
+Leave a serialized parallel construct, undoing one level of serialization.
+*/
+void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
+  kmp_internal_control_t *top; // top of the serial team's control stack
+  kmp_info_t *this_thr; // descriptor of the calling thread
+  kmp_team_t *serial_team; // the calling thread's serial team
+
+  KC_TRACE(10,
+           ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));
+
+  /* skip all this code for autopar serialized loops since it results in
+     unacceptable overhead */
+  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
+    return;
+
+  // Not autopar code: make sure the parallel runtime is initialized
+  if (!TCR_4(__kmp_init_parallel))
+    __kmp_parallel_initialize();
+
+  __kmp_resume_if_soft_paused();
+
+  this_thr = __kmp_threads[global_tid];
+  serial_team = this_thr->th.th_serial_team;
+
+  kmp_task_team_t *task_team = this_thr->th.th_task_team;
+  // we need to wait for the proxy tasks before finishing the thread
+  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
+    __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));
+
+  KMP_MB();
+  KMP_DEBUG_ASSERT(serial_team);
+  KMP_ASSERT(serial_team->t.t_serialized);
+  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
+  KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
+  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
+  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
+
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled &&
+      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
+    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
+    if (ompt_enabled.ompt_callback_implicit_task) {
+      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+          ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
+          OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
+    }
+
+    // reset/clear the task id only after unlinking the task
+    ompt_data_t *parent_task_data;
+    __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);
+
+    if (ompt_enabled.ompt_callback_parallel_end) {
+      ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
+          &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
+          ompt_parallel_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid));
+    }
+    __ompt_lw_taskteam_unlink(this_thr);
+    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
+  }
+#endif
+
+  /* If necessary, pop the internal control stack values and replace the team
+   * values (only when the top entry belongs to the current nesting level) */
+  top = serial_team->t.t_control_stack_top;
+  if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
+    copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
+    serial_team->t.t_control_stack_top = top->next;
+    __kmp_free(top);
+  }
+
+  // if( serial_team -> t.t_serialized > 1 )
+  serial_team->t.t_level--;
+
+  /* pop dispatch buffers stack: unlink the head buffer and free it */
+  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
+  {
+    dispatch_private_info_t *disp_buffer =
+        serial_team->t.t_dispatch->th_disp_buffer;
+    serial_team->t.t_dispatch->th_disp_buffer =
+        serial_team->t.t_dispatch->th_disp_buffer->next;
+    __kmp_free(disp_buffer);
+  }
+  this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
+
+  --serial_team->t.t_serialized;
+  if (serial_team->t.t_serialized == 0) {
+
+/* last serialized level was left: return to the parallel section */
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+    if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
+      __kmp_clear_x87_fpu_status_word();
+      __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
+      __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
+    }
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+    this_thr->th.th_team = serial_team->t.t_parent;
+    this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;
+
+    /* restore values cached in the thread (taken from the parent team) */
+    this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
+    this_thr->th.th_team_master =
+        serial_team->t.t_parent->t.t_threads[0]; /* JPH */
+    this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;
+
+    /* TODO the below shouldn't need to be adjusted for serialized teams */
+    this_thr->th.th_dispatch =
+        &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];
+
+    __kmp_pop_current_task_from_thread(this_thr);
+
+    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
+    this_thr->th.th_current_task->td_flags.executing = 1;
+
+    if (__kmp_tasking_mode != tskm_immediate_exec) {
+      // Copy the task team from the new child / old parent team to the thread.
+      this_thr->th.th_task_team =
+          this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
+      KA_TRACE(20,
+               ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
+                "team %p\n",
+                global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
+    }
+  } else {
+    if (__kmp_tasking_mode != tskm_immediate_exec) {
+      KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
+                    "depth of serial team %p to %d\n",
+                    global_tid, serial_team, serial_team->t.t_serialized));
+    }
+  }
+
+  if (__kmp_env_consistency_check)
+    __kmp_pop_parallel(global_tid, NULL);
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled)
+    this_thr->th.ompt_thread_info.state =
+        ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
+                                           : ompt_state_work_parallel);
+#endif
+}
+
+/*!
+@ingroup SYNCHRONIZATION
+@param loc  source location information.
+
+Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
+depending on the memory ordering convention obeyed by the compiler
+even that may not be necessary).
+*/
+void __kmpc_flush(ident_t *loc) {
+  KC_TRACE(10, ("__kmpc_flush: called\n"));
+
+  /* need explicit __mf() here since use volatile instead in library */
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+#if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
+#if KMP_MIC
+// fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
+// We shouldn't need it, though, since the ABI rules require that
+// * If the compiler generates NGO stores it also generates the fence
+// * If users hand-code NGO stores they should insert the fence
+// therefore no incomplete unordered stores should be visible.
+#else
+  // C74404
+  // This is to address non-temporal store instructions (sfence needed).
+  // The clflush instruction is addressed as well (mfence needed).
+  // Probably the non-temporal load movntdqa instruction should also be
+  // addressed.
+  // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
+  if (!__kmp_cpuinfo.initialized) {
+    __kmp_query_cpuid(&__kmp_cpuinfo);
+  }
+  if (!__kmp_cpuinfo.sse2) {
+    // CPU cannot execute SSE2 instructions.
+  } else {
+#if KMP_COMPILER_ICC
+    _mm_mfence();
+#elif KMP_COMPILER_MSVC
+    MemoryBarrier();
+#else
+    __sync_synchronize();
+#endif // KMP_COMPILER_ICC
+  }
+#endif // KMP_MIC
+#elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
+// Nothing more to do here beyond the KMP_MB() above.
+#elif KMP_ARCH_PPC64
+// Nothing needed here (we have a real MB above).
+#if KMP_OS_CNK
+  // The flushing thread needs to yield here; this prevents a
+  // busy-waiting thread from saturating the pipeline. flush is
+  // often used in loops like this:
+  // while (!flag) {
+  //   #pragma omp flush(flag)
+  // }
+  // and adding the yield here is good for at least a 10x speedup
+  // when running >2 threads per core (on the NAS LU benchmark).
+  __kmp_yield();
+#endif
+#else
+#error Unknown or unsupported architecture
+#endif
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.ompt_callback_flush) {
+    ompt_callbacks.ompt_callback(ompt_callback_flush)(
+        __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
+  }
+#endif
+}
+
+/* -------------------------------------------------------------------------- */
+/*!
+@ingroup SYNCHRONIZATION
+@param loc source location information
+@param global_tid global thread number.
+
+Execute a barrier (a bs_plain_barrier via __kmp_barrier).
+*/
+void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
+  KMP_COUNT_BLOCK(OMP_BARRIER);
+  KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));
+
+  if (!TCR_4(__kmp_init_parallel))
+    __kmp_parallel_initialize();
+
+  __kmp_resume_if_soft_paused();
+
+  if (__kmp_env_consistency_check) {
+    if (loc == 0) {
+      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
+    }
+    __kmp_check_barrier(global_tid, ct_barrier, loc);
+  }
+
+#if OMPT_SUPPORT
+  ompt_frame_t *ompt_frame; // assigned only when ompt_enabled.enabled is set
+  if (ompt_enabled.enabled) {
+    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+    if (ompt_frame->enter_frame.ptr == NULL)
+      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+    OMPT_STORE_RETURN_ADDRESS(global_tid);
+  }
+#endif
+  __kmp_threads[global_tid]->th.th_ident = loc;
+  // TODO: explicit barrier_wait_id:
+  //   this function is called when 'barrier' directive is present or
+  //   implicit barrier at the end of a worksharing construct.
+  // 1) better to add a per-thread barrier counter to a thread data structure
+  // 2) set to 0 when a new team is created
+  // 3) no sync is required
+
+  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    ompt_frame->enter_frame = ompt_data_none; // clear the frame after the wait
+  }
+#endif
+}
+
+/* The BARRIER for a MASTER section is always explicit (none is done here) */
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information.
+@param global_tid  global thread number .
+@return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
+*/
+kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
+  int status = 0; // becomes 1 only when KMP_MASTER_GTID(global_tid) holds
+
+  KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
+
+  if (!TCR_4(__kmp_init_parallel))
+    __kmp_parallel_initialize();
+
+  __kmp_resume_if_soft_paused();
+
+  if (KMP_MASTER_GTID(global_tid)) {
+    KMP_COUNT_BLOCK(OMP_MASTER);
+    KMP_PUSH_PARTITIONED_TIMER(OMP_master);
+    status = 1;
+  }
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (status) { // only the thread entering the region reports scope_begin
+    if (ompt_enabled.ompt_callback_master) {
+      kmp_info_t *this_thr = __kmp_threads[global_tid];
+      kmp_team_t *team = this_thr->th.th_team;
+
+      int tid = __kmp_tid_from_gtid(global_tid);
+      ompt_callbacks.ompt_callback(ompt_callback_master)(
+          ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
+          &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
+          OMPT_GET_RETURN_ADDRESS(0));
+    }
+  }
+#endif
+
+  if (__kmp_env_consistency_check) { // master pushes ct_master, others check
+#if KMP_USE_DYNAMIC_LOCK
+    if (status)
+      __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
+    else
+      __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
+#else
+    if (status)
+      __kmp_push_sync(global_tid, ct_master, loc, NULL);
+    else
+      __kmp_check_sync(global_tid, ct_master, loc, NULL);
+#endif
+  }
+
+  return status;
+}
+
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information.
+@param global_tid  global thread number .
+
+Mark the end of a <tt>master</tt> region. This should only be called by the
+thread that executes the <tt>master</tt> region.
+*/
+void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
+  KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
+
+  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
+  KMP_POP_PARTITIONED_TIMER(); // pairs with the push done in __kmpc_master
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  kmp_info_t *this_thr = __kmp_threads[global_tid];
+  kmp_team_t *team = this_thr->th.th_team;
+  if (ompt_enabled.ompt_callback_master) {
+    int tid = __kmp_tid_from_gtid(global_tid);
+    ompt_callbacks.ompt_callback(ompt_callback_master)(
+        ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
+        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
+        OMPT_GET_RETURN_ADDRESS(0));
+  }
+#endif
+
+  if (__kmp_env_consistency_check) {
+    if (global_tid < 0) // NOTE(review): checked only after global_tid was used
+      KMP_WARNING(ThreadIdentInvalid);
+
+    if (KMP_MASTER_GTID(global_tid))
+      __kmp_pop_sync(global_tid, ct_master, loc);
+  }
+}
+
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information.
+@param gtid  global thread number.
+
+Start execution of an <tt>ordered</tt> construct.
+*/
+void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
+  int cid = 0; // passed by address to the "deo" entry function below
+  kmp_info_t *th;
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+
+  KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
+
+  if (!TCR_4(__kmp_init_parallel))
+    __kmp_parallel_initialize();
+
+  __kmp_resume_if_soft_paused();
+
+#if USE_ITT_BUILD
+  __kmp_itt_ordered_prep(gtid);
+// TODO: ordered_wait_id
+#endif /* USE_ITT_BUILD */
+
+  th = __kmp_threads[gtid];
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  kmp_team_t *team;
+  ompt_wait_id_t lck; // address of team->t.t_ordered.dt.t_value, as wait id
+  void *codeptr_ra;
+  if (ompt_enabled.enabled) {
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+    team = __kmp_team_from_gtid(gtid);
+    lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
+    /* OMPT state update */
+    th->th.ompt_thread_info.wait_id = lck;
+    th->th.ompt_thread_info.state = ompt_state_wait_ordered;
+
+    /* OMPT event callback */
+    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
+    if (ompt_enabled.ompt_callback_mutex_acquire) {
+      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+          ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
+          codeptr_ra);
+    }
+  }
+#endif
+
+  if (th->th.th_dispatch->th_deo_fcn != 0) // use the dispatch hook if set
+    (*th->th.th_dispatch->th_deo_fcn)(&gtid, &cid, loc);
+  else
+    __kmp_parallel_deo(&gtid, &cid, loc);
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    /* OMPT state update */
+    th->th.ompt_thread_info.state = ompt_state_work_parallel;
+    th->th.ompt_thread_info.wait_id = 0;
+
+    /* OMPT event callback */
+    if (ompt_enabled.ompt_callback_mutex_acquired) {
+      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
+          ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
+    }
+  }
+#endif
+
+#if USE_ITT_BUILD
+  __kmp_itt_ordered_start(gtid);
+#endif /* USE_ITT_BUILD */
+}
+
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information.
+@param gtid  global thread number.
+
+End execution of an <tt>ordered</tt> construct.
+*/
+void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
+  int cid = 0; // passed by address to the "dxo" exit function below
+  kmp_info_t *th;
+
+  KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
+
+#if USE_ITT_BUILD
+  __kmp_itt_ordered_end(gtid);
+// TODO: ordered_wait_id
+#endif /* USE_ITT_BUILD */
+
+  th = __kmp_threads[gtid];
+
+  if (th->th.th_dispatch->th_dxo_fcn != 0) // use the dispatch hook if set
+    (*th->th.th_dispatch->th_dxo_fcn)(&gtid, &cid, loc);
+  else
+    __kmp_parallel_dxo(&gtid, &cid, loc);
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+  if (ompt_enabled.ompt_callback_mutex_released) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
+        ompt_mutex_ordered,
+        (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
+            ->t.t_ordered.dt.t_value,
+        OMPT_LOAD_RETURN_ADDRESS(gtid));
+  }
+#endif
+}
+
+#if KMP_USE_DYNAMIC_LOCK
+
+static __forceinline void
+__kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
+                          kmp_int32 gtid, kmp_indirect_locktag_t tag) {
+  // Allocate and initialize an indirect lock, then try to publish its pointer
+  // into *crit with a compare-and-store; the index (idx) is otherwise ignored.
+  void *idx;
+  kmp_indirect_lock_t **lck;
+  lck = (kmp_indirect_lock_t **)crit;
+  kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
+  KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
+  KMP_SET_I_LOCK_LOCATION(ilk, loc);
+  KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
+  KA_TRACE(20,
+           ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
+#if USE_ITT_BUILD
+  __kmp_itt_critical_creating(ilk->lock, loc);
+#endif
+  int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
+  if (status == 0) { // the store did not happen: *crit was already non-null
+#if USE_ITT_BUILD
+    __kmp_itt_critical_destroyed(ilk->lock);
+#endif
+    // We don't really need to destroy the unclaimed lock here since it will be
+    // cleaned up at program exit.
+    // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
+  }
+  KMP_DEBUG_ASSERT(*lck != NULL);
+}
+
+// Fast-path acquire tas lock: CAS free -> busy(gtid + 1), else spin + backoff
+#define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
+  {                                                                            \
+    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
+    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
+    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
+    if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
+        !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
+      kmp_uint32 spins;                                                        \
+      KMP_FSYNC_PREPARE(l);                                                    \
+      KMP_INIT_YIELD(spins);                                                   \
+      kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
+      do {                                                                     \
+        if (TCR_4(__kmp_nth) >                                                 \
+            (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
+          KMP_YIELD(TRUE);                                                     \
+        } else {                                                               \
+          KMP_YIELD_SPIN(spins);                                               \
+        }                                                                      \
+        __kmp_spin_backoff(&backoff);                                          \
+      } while (                                                                \
+          KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
+          !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
+    }                                                                          \
+    KMP_FSYNC_ACQUIRED(l);                                                     \
+  }
+
+// Fast-path test tas lock: one non-blocking attempt; rc is the CAS outcome
+#define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
+  {                                                                            \
+    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
+    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
+    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
+    rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
+         __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
+  }
+
+// Fast-path release tas lock: store KMP_LOCK_FREE(tas) into the poll word
+#define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
+  { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
+
+#if KMP_USE_FUTEX
+
+#include <sys/syscall.h>
+#include <unistd.h>
+#ifndef FUTEX_WAIT
+#define FUTEX_WAIT 0
+#endif
+#ifndef FUTEX_WAKE
+#define FUTEX_WAKE 1
+#endif
+
+// Fast-path acquire futex lock: CAS free -> busy; else OR in BUSY(1) and wait
+#define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
+  {                                                                            \
+    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
+    kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
+    KMP_MB();                                                                  \
+    KMP_FSYNC_PREPARE(ftx);                                                    \
+    kmp_int32 poll_val;                                                        \
+    while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
+                &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
+                KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
+      kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
+      if (!cond) {                                                             \
+        if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
+                                         poll_val |                            \
+                                             KMP_LOCK_BUSY(1, futex))) {       \
+          continue;                                                            \
+        }                                                                      \
+        poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
+      }                                                                        \
+      kmp_int32 rc;                                                            \
+      if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
+                        NULL, NULL, 0)) != 0) {                                \
+        continue;                                                              \
+      }                                                                        \
+      gtid_code |= 1;                                                          \
+    }                                                                          \
+    KMP_FSYNC_ACQUIRED(ftx);                                                   \
+  }
+
+// Fast-path test futex lock: single CAS attempt; rc = TRUE only on success
+#define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
+  {                                                                            \
+    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
+    if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex),     \
+                                    KMP_LOCK_BUSY(gtid + 1 << 1, futex))) {    \
+      KMP_FSYNC_ACQUIRED(ftx);                                                 \
+      rc = TRUE;                                                               \
+    } else {                                                                   \
+      rc = FALSE;                                                              \
+    }                                                                          \
+  }
+
+// Fast-path release futex lock: swap in free; FUTEX_WAKE if low bit was set
+#define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
+  {                                                                            \
+    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
+    KMP_MB();                                                                  \
+    KMP_FSYNC_RELEASING(ftx);                                                  \
+    kmp_int32 poll_val =                                                       \
+        KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
+    if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
+      syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
+              KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
+    }                                                                          \
+    KMP_MB();                                                                  \
+    KMP_YIELD_OVERSUB();                                                       \
+  }
+
+#endif // KMP_USE_FUTEX
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
+                                                      ident_t const *loc,
+                                                      kmp_int32 gtid) {
+  kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit; // crit holds a lock ptr
+
+  // Because of the double-check, the following load doesn't need to be volatile
+  kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
+
+  if (lck == NULL) {
+    void *idx;
+
+    // No lock installed yet: allocate & initialize one.
+    // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
+    lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
+    __kmp_init_user_lock_with_checks(lck);
+    __kmp_set_user_lock_location(lck, loc);
+#if USE_ITT_BUILD
+    __kmp_itt_critical_creating(lck);
+// __kmp_itt_critical_creating() should be called *before* the first usage
+// of underlying lock. It is the only place where we can guarantee it. There
+// are chances the lock will be destroyed with no usage, but it is not a
+// problem, because this is not real event seen by user but rather setting
+// name for object (lock). See more details in kmp_itt.h.
+#endif /* USE_ITT_BUILD */
+
+    // Use a cmpxchg instruction to slam the start of the critical section with
+    // the lock pointer.  If another thread beat us to it, deallocate the lock,
+    // and use the lock that the other thread allocated.
+    int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
+
+    if (status == 0) {
+// Deallocate the lock and reload the value.
+#if USE_ITT_BUILD
+      __kmp_itt_critical_destroyed(lck);
+// Let ITT know the lock is destroyed and the same memory location may be reused
+// for another purpose.
+#endif /* USE_ITT_BUILD */
+      __kmp_destroy_user_lock_with_checks(lck);
+      __kmp_user_lock_free(&idx, gtid, lck);
+      lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
+      KMP_DEBUG_ASSERT(lck != NULL);
+    }
+  }
+  return lck; // the lock pointer now stored in *crit
+}
+
+#endif // KMP_USE_DYNAMIC_LOCK
+
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information.
+@param global_tid  global thread number .
+@param crit identity of the critical section. This could be a pointer to a lock
+associated with the critical section, or some other suitably unique value.
+
+Enter code protected by a `critical` construct.
+This function blocks until the executing thread can enter the critical section.
+*/
+void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
+                     kmp_critical_name *crit) {
+#if KMP_USE_DYNAMIC_LOCK // delegate to the hinted entry point with no hint
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(global_tid);
+#endif // OMPT_SUPPORT && OMPT_OPTIONAL
+  __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
+#else
+  KMP_COUNT_BLOCK(OMP_CRITICAL);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  ompt_state_t prev_state = ompt_state_undefined;
+  ompt_thread_info_t ti;
+#endif
+  kmp_user_lock_p lck;
+
+  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
+
+  // TODO: add THR_OVHD_STATE
+
+  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
+  KMP_CHECK_USER_LOCK_INIT();
+
+  // Small tas/futex locks live directly in the storage that crit points to
+  if ((__kmp_user_lock_kind == lk_tas) &&
+      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
+    lck = (kmp_user_lock_p)crit;
+  }
+#if KMP_USE_FUTEX
+  else if ((__kmp_user_lock_kind == lk_futex) &&
+           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
+    lck = (kmp_user_lock_p)crit;
+  }
+#endif
+  else { // ticket, queuing or drdpa
+    lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
+  }
+
+  if (__kmp_env_consistency_check)
+    __kmp_push_sync(global_tid, ct_critical, loc, lck);
+
+// since the critical directive binds to all threads, not just the current
+// team we have to check this even if we are in a serialized team.
+// also, even if we are the uber thread, we still have to conduct the lock,
+// as we have to contend with sibling threads.
+
+#if USE_ITT_BUILD
+  __kmp_itt_critical_acquiring(lck);
+#endif /* USE_ITT_BUILD */
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(global_tid); // was 'gtid', which is not declared
+  void *codeptr_ra = NULL;
+  if (ompt_enabled.enabled) {
+    ti = __kmp_threads[global_tid]->th.ompt_thread_info;
+    /* OMPT state update; NOTE(review): 'ti' is a copy, not the thread's info */
+    prev_state = ti.state;
+    ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
+    ti.state = ompt_state_wait_critical;
+
+    /* OMPT event callback */
+    codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(global_tid);
+    if (ompt_enabled.ompt_callback_mutex_acquire) {
+      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+          ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
+          (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
+    }
+  }
+#endif
+  // Value of 'crit' should be good for using as a critical_id of the critical
+  // section directive.
+  __kmp_acquire_user_lock_with_checks(lck, global_tid);
+
+#if USE_ITT_BUILD
+  __kmp_itt_critical_acquired(lck);
+#endif /* USE_ITT_BUILD */
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    /* OMPT state update */
+    ti.state = prev_state;
+    ti.wait_id = 0;
+
+    /* OMPT event callback */
+    if (ompt_enabled.ompt_callback_mutex_acquired) {
+      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
+          ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
+    }
+  }
+#endif
+  KMP_POP_PARTITIONED_TIMER(); // ends the OMP_critical_wait timer pushed above
+
+  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
+  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
+#endif // KMP_USE_DYNAMIC_LOCK
+}
+
+#if KMP_USE_DYNAMIC_LOCK
+
+// Maps the hint bits to an internal lock sequence (default __kmp_user_lock_seq)
+static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
+#if KMP_USE_TSX
+#define KMP_TSX_LOCK(seq) lockseq_##seq
+#else
+#define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
+#endif // KMP_USE_TSX
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+#define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
+#else
+#define KMP_CPUINFO_RTM 0
+#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+  // Hints that do not require further logic
+  if (hint & kmp_lock_hint_hle)
+    return KMP_TSX_LOCK(hle);
+  if (hint & kmp_lock_hint_rtm)
+    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
+  if (hint & kmp_lock_hint_adaptive)
+    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
+
+  // Rule out conflicting hints first by returning the default lock
+  if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
+    return __kmp_user_lock_seq;
+  if ((hint & omp_lock_hint_speculative) &&
+      (hint & omp_lock_hint_nonspeculative))
+    return __kmp_user_lock_seq;
+
+  // Do not even consider speculation when it appears to be contended
+  if (hint & omp_lock_hint_contended)
+    return lockseq_queuing;
+
+  // Uncontended lock without speculation
+  if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
+    return lockseq_tas;
+
+  // HLE lock for speculation
+  if (hint & omp_lock_hint_speculative)
+    return KMP_TSX_LOCK(hle);
+
+  return __kmp_user_lock_seq; // no recognized hint bit: use the default
+}
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+#if KMP_USE_DYNAMIC_LOCK
+static kmp_mutex_impl_t
+__ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
+  if (user_lock) { // classify by the direct-lock tag extracted from user_lock
+    switch (KMP_EXTRACT_D_TAG(user_lock)) {
+    case 0: // tag 0: fall out of the switch to the indirect-lock lookup below
+      break;
+#if KMP_USE_FUTEX
+    case locktag_futex:
+      return kmp_mutex_impl_queuing;
+#endif
+    case locktag_tas:
+      return kmp_mutex_impl_spin;
+#if KMP_USE_TSX
+    case locktag_hle:
+      return kmp_mutex_impl_speculative;
+#endif
+    default:
+      return kmp_mutex_impl_none;
+    }
+    ilock = KMP_LOOKUP_I_LOCK(user_lock);
+  }
+  KMP_ASSERT(ilock); // either passed in by the caller or looked up above
+  switch (ilock->type) {
+#if KMP_USE_TSX
+  case locktag_adaptive:
+  case locktag_rtm:
+    return kmp_mutex_impl_speculative;
+#endif
+  case locktag_nested_tas:
+    return kmp_mutex_impl_spin;
+#if KMP_USE_FUTEX
+  case locktag_nested_futex:
+#endif
+  case locktag_ticket:
+  case locktag_queuing:
+  case locktag_drdpa:
+  case locktag_nested_ticket:
+  case locktag_nested_queuing:
+  case locktag_nested_drdpa:
+    return kmp_mutex_impl_queuing;
+  default:
+    return kmp_mutex_impl_none;
+  }
+}
+#else
+// Non-dynamic locks. NOTE(review): nested in #if KMP_USE_DYNAMIC_LOCK; dead?
+static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
+  switch (__kmp_user_lock_kind) { // classify by the global user lock kind
+  case lk_tas:
+    return kmp_mutex_impl_spin;
+#if KMP_USE_FUTEX
+  case lk_futex:
+#endif
+  case lk_ticket:
+  case lk_queuing:
+  case lk_drdpa:
+    return kmp_mutex_impl_queuing;
+#if KMP_USE_TSX
+  case lk_hle:
+  case lk_rtm:
+  case lk_adaptive:
+    return kmp_mutex_impl_speculative;
+#endif
+  default:
+    return kmp_mutex_impl_none;
+  }
+}
+#endif // KMP_USE_DYNAMIC_LOCK
+#endif // OMPT_SUPPORT && OMPT_OPTIONAL
+
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information.
+@param global_tid  global thread number.
+@param crit identity of the critical section. This could be a pointer to a lock
+associated with the critical section, or some other suitably unique value.
+@param hint the lock hint.
+
+Enter code protected by a `critical` construct with a hint. The hint value is
+used to suggest a lock implementation. This function blocks until the executing
+thread can enter the critical section unless the hint suggests use of
+speculative execution and the hardware supports it.
+
+If the first word of `crit` is zero the lock is initialized here from the lock
+sequence that __kmp_map_hint_to_lock() selects for `hint`. The inlined
+TAS/futex fast path is taken only when the tag stored in the lock word says the
+lock really is of that kind; the global default __kmp_user_lock_seq is not
+consulted because a hinted lock may be of a different kind.
+*/
+void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
+                               kmp_critical_name *crit, uint32_t hint) {
+  KMP_COUNT_BLOCK(OMP_CRITICAL);
+  kmp_user_lock_p lck;
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  ompt_state_t prev_state = ompt_state_undefined;
+  // NOTE(review): ti is a by-value copy of the thread's ompt_thread_info, so
+  // the state/wait_id assignments below only modify this local copy -- confirm
+  // whether the thread's own record was meant to be updated.
+  ompt_thread_info_t ti;
+  // This is the case, if called from __kmpc_critical:
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+#endif
+
+  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
+
+  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
+  // Check if it is initialized.
+  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
+  if (*lk == 0) {
+    kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
+    if (KMP_IS_D_LOCK(lckseq)) {
+      KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
+                                  KMP_GET_D_TAG(lckseq));
+    } else {
+      __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
+    }
+  }
+  // Branch for accessing the actual lock object and set operation. This
+  // branching is inevitable since this lock initialization does not follow the
+  // normal dispatch path (lock table is not used).
+  // The tag read from the lock word identifies how the lock was actually
+  // initialized; a non-zero tag means a direct lock.
+  const int locktag = KMP_EXTRACT_D_TAG(lk);
+  if (locktag != 0) {
+    lck = (kmp_user_lock_p)lk;
+    if (__kmp_env_consistency_check) {
+      __kmp_push_sync(global_tid, ct_critical, loc, lck,
+                      __kmp_map_hint_to_lock(hint));
+    }
+#if USE_ITT_BUILD
+    __kmp_itt_critical_acquiring(lck);
+#endif
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.enabled) {
+      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
+      /* OMPT state update */
+      prev_state = ti.state;
+      ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
+      ti.state = ompt_state_wait_critical;
+
+      /* OMPT event callback */
+      if (ompt_enabled.ompt_callback_mutex_acquire) {
+        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+            ompt_mutex_critical, (unsigned int)hint,
+            __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
+            codeptr);
+      }
+    }
+#endif
+    // Use the inlined fast path only for a lock whose own tag matches it;
+    // every other direct lock goes through the tag-dispatched set function.
+#if KMP_USE_INLINED_TAS
+    if (locktag == locktag_tas && !__kmp_env_consistency_check) {
+      KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
+    } else
+#elif KMP_USE_INLINED_FUTEX
+    if (locktag == locktag_futex && !__kmp_env_consistency_check) {
+      KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
+    } else
+#endif
+    {
+      KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
+    }
+  } else {
+    // Indirect lock: the critical name holds a pointer to the lock record.
+    kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
+    lck = ilk->lock;
+    if (__kmp_env_consistency_check) {
+      __kmp_push_sync(global_tid, ct_critical, loc, lck,
+                      __kmp_map_hint_to_lock(hint));
+    }
+#if USE_ITT_BUILD
+    __kmp_itt_critical_acquiring(lck);
+#endif
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.enabled) {
+      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
+      /* OMPT state update */
+      prev_state = ti.state;
+      ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
+      ti.state = ompt_state_wait_critical;
+
+      /* OMPT event callback */
+      if (ompt_enabled.ompt_callback_mutex_acquire) {
+        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+            ompt_mutex_critical, (unsigned int)hint,
+            __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
+            codeptr);
+      }
+    }
+#endif
+    KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
+  }
+  KMP_POP_PARTITIONED_TIMER();
+
+#if USE_ITT_BUILD
+  __kmp_itt_critical_acquired(lck);
+#endif /* USE_ITT_BUILD */
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    /* OMPT state update */
+    ti.state = prev_state;
+    ti.wait_id = 0;
+
+    /* OMPT event callback */
+    if (ompt_enabled.ompt_callback_mutex_acquired) {
+      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
+          ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
+    }
+  }
+#endif
+
+  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
+  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
+} // __kmpc_critical_with_hint
+
+#endif // KMP_USE_DYNAMIC_LOCK
+
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information.
+@param global_tid  global thread number.
+@param crit identity of the critical section. This could be a pointer to a lock
+associated with the critical section, or some other suitably unique value.
+
+Leave a critical section, releasing any lock that was held during its execution.
+
+With dynamic locks the release path is chosen from the tag stored in the lock
+word of `crit`, not from the global default lock sequence, so that a critical
+section entered with a hint is released with the matching lock implementation.
+*/
+void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
+                         kmp_critical_name *crit) {
+  kmp_user_lock_p lck;
+
+  KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
+
+#if KMP_USE_DYNAMIC_LOCK
+  // A non-zero tag in the lock word means the lock lives directly in `crit`;
+  // a zero tag means `crit` holds a pointer to an indirect lock.
+  const int locktag = KMP_EXTRACT_D_TAG(crit);
+  if (locktag != 0) {
+    lck = (kmp_user_lock_p)crit;
+    KMP_ASSERT(lck != NULL);
+    if (__kmp_env_consistency_check) {
+      __kmp_pop_sync(global_tid, ct_critical, loc);
+    }
+#if USE_ITT_BUILD
+    __kmp_itt_critical_releasing(lck);
+#endif
+    // Inlined release only for a lock whose own tag matches the fast path.
+#if KMP_USE_INLINED_TAS
+    if (locktag == locktag_tas && !__kmp_env_consistency_check) {
+      KMP_RELEASE_TAS_LOCK(lck, global_tid);
+    } else
+#elif KMP_USE_INLINED_FUTEX
+    if (locktag == locktag_futex && !__kmp_env_consistency_check) {
+      KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
+    } else
+#endif
+    {
+      KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
+    }
+  } else {
+    kmp_indirect_lock_t *ilk =
+        (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
+    KMP_ASSERT(ilk != NULL);
+    lck = ilk->lock;
+    if (__kmp_env_consistency_check) {
+      __kmp_pop_sync(global_tid, ct_critical, loc);
+    }
+#if USE_ITT_BUILD
+    __kmp_itt_critical_releasing(lck);
+#endif
+    KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
+  }
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+  // TAS/futex locks small enough to fit are stored in `crit` itself;
+  // otherwise `crit` holds a pointer to the lock.
+  if ((__kmp_user_lock_kind == lk_tas) &&
+      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
+    lck = (kmp_user_lock_p)crit;
+  }
+#if KMP_USE_FUTEX
+  else if ((__kmp_user_lock_kind == lk_futex) &&
+           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
+    lck = (kmp_user_lock_p)crit;
+  }
+#endif
+  else { // ticket, queuing or drdpa
+    lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
+  }
+
+  KMP_ASSERT(lck != NULL);
+
+  if (__kmp_env_consistency_check)
+    __kmp_pop_sync(global_tid, ct_critical, loc);
+
+#if USE_ITT_BUILD
+  __kmp_itt_critical_releasing(lck);
+#endif /* USE_ITT_BUILD */
+  // Value of 'crit' should be good for using as a critical_id of the critical
+  // section directive.
+  __kmp_release_user_lock_with_checks(lck, global_tid);
+
+#endif // KMP_USE_DYNAMIC_LOCK
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  /* OMPT release event triggers after lock is released; place here to trigger
+   * for all #if branches */
+  OMPT_STORE_RETURN_ADDRESS(global_tid);
+  if (ompt_enabled.ompt_callback_mutex_released) {
+    // Load from the same thread slot that was just stored to.
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
+        ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
+        OMPT_LOAD_RETURN_ADDRESS(global_tid));
+  }
+#endif
+
+  KMP_POP_PARTITIONED_TIMER();
+  KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
+}
+
+/*!
+@ingroup SYNCHRONIZATION
+@param loc source location information
+@param global_tid thread id.
+@return one if the thread should execute the master block, zero otherwise
+
+Begin a combined barrier and master construct; the barrier itself is executed
+inside this call. A zero status from __kmp_barrier() is reported to the caller
+as one, any other status as zero.
+*/
+kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
+  KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
+
+  if (!TCR_4(__kmp_init_parallel)) {
+    __kmp_parallel_initialize();
+  }
+
+  __kmp_resume_if_soft_paused();
+
+  if (__kmp_env_consistency_check) {
+    __kmp_check_barrier(global_tid, ct_barrier, loc);
+  }
+
+#if OMPT_SUPPORT
+  ompt_frame_t *ompt_frame;
+  if (ompt_enabled.enabled) {
+    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+    // Record this frame as the enter frame unless one is already set.
+    if (ompt_frame->enter_frame.ptr == NULL) {
+      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+    }
+    OMPT_STORE_RETURN_ADDRESS(global_tid);
+  }
+#endif
+#if USE_ITT_NOTIFY
+  __kmp_threads[global_tid]->th.th_ident = loc;
+#endif
+  const int barrier_status =
+      __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    ompt_frame->enter_frame = ompt_data_none;
+  }
+#endif
+
+  if (barrier_status != 0) {
+    return 0;
+  }
+  return 1;
+}
+
+/*!
+@ingroup SYNCHRONIZATION
+@param loc source location information
+@param global_tid thread id.
+
+Finish a combined barrier and master construct. Call this only once the
+<tt>master</tt> code has completed: the other threads are still waiting at the
+barrier, and this call (via __kmp_end_split_barrier) releases them.
+*/
+void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
+  KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
+
+  // loc is not used here; only the plain barrier of this thread is ended.
+  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
+}
+
+/*!
+@ingroup SYNCHRONIZATION
+@param loc source location information
+@param global_tid thread id.
+@return one if the thread should execute the master block, zero otherwise
+
+Start execution of a combined barrier and master(nowait) construct.
+The barrier is executed inside this function.
+There is no equivalent "end" function, since the consistency-check bookkeeping
+that __kmpc_end_master would otherwise perform is done inside this function.
+*/
+kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
+  KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
+
+  if (!TCR_4(__kmp_init_parallel)) {
+    __kmp_parallel_initialize();
+  }
+
+  __kmp_resume_if_soft_paused();
+
+  if (__kmp_env_consistency_check) {
+    if (loc == 0) {
+      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
+    }
+    __kmp_check_barrier(global_tid, ct_barrier, loc);
+  }
+
+#if OMPT_SUPPORT
+  ompt_frame_t *ompt_frame;
+  if (ompt_enabled.enabled) {
+    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+    // Record this frame as the enter frame unless one is already set.
+    if (ompt_frame->enter_frame.ptr == NULL) {
+      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+    }
+    OMPT_STORE_RETURN_ADDRESS(global_tid);
+  }
+#endif
+#if USE_ITT_NOTIFY
+  __kmp_threads[global_tid]->th.th_ident = loc;
+#endif
+  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    ompt_frame->enter_frame = ompt_data_none;
+  }
+#endif
+
+  const kmp_int32 run_master = __kmpc_master(loc, global_tid);
+
+  if (__kmp_env_consistency_check) {
+    /*  No __kmpc_end_master call follows, so its (stats) actions */
+    /*  are carried out here instead.                              */
+    if (global_tid < 0) {
+      KMP_WARNING(ThreadIdentInvalid);
+    }
+    /* Only the thread that did the push (see __kmpc_master()) */
+    /* may do the matching pop.                                 */
+    if (run_master) {
+      __kmp_pop_sync(global_tid, ct_master, loc);
+    }
+  }
+
+  return run_master;
+}
+
+/* The BARRIER for a SINGLE process section is always explicit   */
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information
+@param global_tid  global thread number
+@return One if this thread should execute the single construct, zero otherwise.
+
+Decide whether the calling thread executes a <tt>single</tt> construct.
+Neither of the two "single" calls contains an implicit barrier; the compiler
+should emit an explicit barrier when one is required.
+*/
+
+kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
+  const kmp_int32 do_single = __kmp_enter_single(global_tid, loc, TRUE);
+
+  if (do_single) {
+    // This thread will run the single block, so count and time it.
+    KMP_COUNT_BLOCK(OMP_SINGLE);
+    KMP_PUSH_PARTITIONED_TIMER(OMP_single);
+  }
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  kmp_info_t *thr = __kmp_threads[global_tid];
+  kmp_team_t *thr_team = thr->th.th_team;
+  int team_tid = __kmp_tid_from_gtid(global_tid);
+
+  // Work callbacks fire only when OMPT is enabled and a callback is registered.
+  if (ompt_enabled.enabled && ompt_enabled.ompt_callback_work) {
+    if (do_single) {
+      // Executor: only the begin event here; the end event is emitted by
+      // __kmpc_end_single.
+      ompt_callbacks.ompt_callback(ompt_callback_work)(
+          ompt_work_single_executor, ompt_scope_begin,
+          &(thr_team->t.ompt_team_info.parallel_data),
+          &(thr_team->t.t_implicit_task_taskdata[team_tid]
+                .ompt_task_info.task_data),
+          1, OMPT_GET_RETURN_ADDRESS(0));
+    } else {
+      // Other threads: begin and end are reported back to back.
+      ompt_callbacks.ompt_callback(ompt_callback_work)(
+          ompt_work_single_other, ompt_scope_begin,
+          &(thr_team->t.ompt_team_info.parallel_data),
+          &(thr_team->t.t_implicit_task_taskdata[team_tid]
+                .ompt_task_info.task_data),
+          1, OMPT_GET_RETURN_ADDRESS(0));
+      ompt_callbacks.ompt_callback(ompt_callback_work)(
+          ompt_work_single_other, ompt_scope_end,
+          &(thr_team->t.ompt_team_info.parallel_data),
+          &(thr_team->t.t_implicit_task_taskdata[team_tid]
+                .ompt_task_info.task_data),
+          1, OMPT_GET_RETURN_ADDRESS(0));
+    }
+  }
+#endif
+
+  return do_single;
+}
+
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information
+@param global_tid  global thread number
+
+Close a <tt>single</tt> construct. Only the thread that executed the code
+protected by the `single` construct should call this function.
+*/
+void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
+  __kmp_exit_single(global_tid);
+  KMP_POP_PARTITIONED_TIMER();
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  kmp_info_t *thr = __kmp_threads[global_tid];
+  kmp_team_t *thr_team = thr->th.th_team;
+  int team_tid = __kmp_tid_from_gtid(global_tid);
+
+  // Report the end of the executor's single region, if a tool asked for it.
+  if (ompt_enabled.ompt_callback_work) {
+    ompt_callbacks.ompt_callback(ompt_callback_work)(
+        ompt_work_single_executor, ompt_scope_end,
+        &(thr_team->t.ompt_team_info.parallel_data),
+        &(thr_team->t.t_implicit_task_taskdata[team_tid]
+              .ompt_task_info.task_data),
+        1, OMPT_GET_RETURN_ADDRESS(0));
+  }
+#endif
+}
+
+/*!
+@ingroup WORK_SHARING
+@param loc Source location
+@param global_tid Global thread id
+
+Mark the end of a statically scheduled loop.
+*/
+void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
+  KMP_POP_PARTITIONED_TIMER();
+  KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // Emit the scope-end work callback for the construct identified by loc.
+  if (ompt_enabled.ompt_callback_work) {
+    // Default to a loop when loc is NULL or carries no work-type flag.
+    ompt_work_t ompt_work_type = ompt_work_loop;
+    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
+    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
+    // Determine workshare type
+    if (loc != NULL) {
+      // The flags are tested in priority order: loop, sections, distribute.
+      if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
+        ompt_work_type = ompt_work_loop;
+      } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
+        ompt_work_type = ompt_work_sections;
+      } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
+        ompt_work_type = ompt_work_distribute;
+      } else {
+        // use default set above.
+        // a warning about this case is provided in __kmpc_for_static_init
+      }
+      KMP_DEBUG_ASSERT(ompt_work_type);
+    }
+    ompt_callbacks.ompt_callback(ompt_callback_work)(
+        ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
+        &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
+  }
+#endif
+  // Pop the ct_pdo workshare entry when consistency checking is on.
+  if (__kmp_env_consistency_check)
+    __kmp_pop_workshare(global_tid, ct_pdo, loc);
+}
+
+// User routines which take C-style arguments (call by value)
+// different from the Fortran equivalent routines
+
+// Forward arg, together with the calling thread's gtid, to
+// __kmp_set_num_threads.
+void ompc_set_num_threads(int arg) {
+  // !!!!! TODO: check the per-task binding
+  const int gtid = __kmp_entry_gtid();
+  __kmp_set_num_threads(arg, gtid);
+}
+
+// Set the calling thread's "dynamic" control to TRUE for any non-zero flag and
+// to FALSE otherwise, after saving the thread's internal controls.
+void ompc_set_dynamic(int flag) {
+  /* For the thread-private implementation of the internal controls */
+  kmp_info_t *this_thr = __kmp_entry_thread();
+
+  __kmp_save_internal_controls(this_thr);
+  set__dynamic(this_thr, flag ? TRUE : FALSE);
+}
+
+// Express "nested" through max-active-levels: a non-zero flag selects
+// __kmp_dflt_max_active_levels, zero selects 1. The thread's internal controls
+// are saved first.
+void ompc_set_nested(int flag) {
+  /* For the thread-private internal controls implementation */
+  kmp_info_t *this_thr = __kmp_entry_thread();
+
+  __kmp_save_internal_controls(this_thr);
+  set__max_active_levels(this_thr, flag ? __kmp_dflt_max_active_levels : 1);
+}
+
+// Forward max_active_levels, with the calling thread's gtid, to
+// __kmp_set_max_active_levels.
+void ompc_set_max_active_levels(int max_active_levels) {
+  /* TO DO */
+  /* we want per-task implementation of this internal control */
+
+  /* For the per-thread internal controls implementation */
+  const int gtid = __kmp_entry_gtid();
+  __kmp_set_max_active_levels(gtid, max_active_levels);
+}
+
+// Forward the schedule kind (cast to kmp_sched_t) and modifier, with the
+// calling thread's gtid, to __kmp_set_schedule.
+void ompc_set_schedule(omp_sched_t kind, int modifier) {
+  // !!!!! TODO: check the per-task binding
+  const int gtid = __kmp_entry_gtid();
+  __kmp_set_schedule(gtid, (kmp_sched_t)kind, modifier);
+}
+
+// Return what __kmp_get_ancestor_thread_num reports for the calling thread's
+// gtid at the given level.
+int ompc_get_ancestor_thread_num(int level) {
+  const int gtid = __kmp_entry_gtid();
+  return __kmp_get_ancestor_thread_num(gtid, level);
+}
+
+// Return what __kmp_get_team_size reports for the calling thread's gtid at the
+// given level.
+int ompc_get_team_size(int level) {
+  const int gtid = __kmp_entry_gtid();
+  return __kmp_get_team_size(gtid, level);
+}
+
+/* OpenMP 5.0 Affinity Format API */
+
+// Copy format (including its terminator) into __kmp_affinity_format, a buffer
+// of KMP_AFFINITY_FORMAT_SIZE bytes, running serial initialization first if
+// needed.
+void ompc_set_affinity_format(char const *format) {
+  if (!__kmp_init_serial)
+    __kmp_serial_initialize();
+
+  __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
+                         format, KMP_STRLEN(format) + 1);
+}
+
+// Copy the current affinity format into buffer (when buffer is non-null and
+// size is non-zero) and return the length of the format string, not counting
+// its terminator.
+size_t ompc_get_affinity_format(char *buffer, size_t size) {
+  if (!__kmp_init_serial)
+    __kmp_serial_initialize();
+
+  const size_t format_len = KMP_STRLEN(__kmp_affinity_format);
+  if (buffer != NULL && size != 0) {
+    __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
+                           format_len + 1);
+  }
+  return format_len;
+}
+
+// Run middle initialization if it has not happened yet, then hand the calling
+// thread's gtid and format to __kmp_aux_display_affinity.
+void ompc_display_affinity(char const *format) {
+  if (!TCR_4(__kmp_init_middle))
+    __kmp_middle_initialize();
+
+  const int gtid = __kmp_get_gtid();
+  __kmp_aux_display_affinity(gtid, format);
+}
+
+// Capture the calling thread's affinity string into a temporary string buffer,
+// copy it into buffer when buffer is non-null and buf_size is non-zero, and
+// return the value produced by __kmp_aux_capture_affinity.
+size_t ompc_capture_affinity(char *buffer, size_t buf_size,
+                             char const *format) {
+  if (!TCR_4(__kmp_init_middle))
+    __kmp_middle_initialize();
+
+  const int gtid = __kmp_get_gtid();
+
+  kmp_str_buf_t captured;
+  __kmp_str_buf_init(&captured);
+  const size_t required = __kmp_aux_capture_affinity(gtid, format, &captured);
+
+  if (buffer != NULL && buf_size != 0) {
+    __kmp_strncpy_truncate(buffer, buf_size, captured.str, captured.used + 1);
+  }
+
+  // The temporary buffer is always released before returning.
+  __kmp_str_buf_free(&captured);
+  return required;
+}
+
+// int-argument entry point: passes arg straight to __kmp_aux_set_stacksize.
+// NOTE(review): arg is not range-checked here; presumably negative values are
+// handled by the callee -- confirm.
+void kmpc_set_stacksize(int arg) {
+  // __kmp_aux_set_stacksize initializes the library if needed
+  __kmp_aux_set_stacksize(arg);
+}
+
+// size_t-argument entry point: passes arg straight to __kmp_aux_set_stacksize.
+void kmpc_set_stacksize_s(size_t arg) {
+  // __kmp_aux_set_stacksize initializes the library if needed
+  __kmp_aux_set_stacksize(arg);
+}
+
+// Resolve the calling thread (gtid, tid within its team, and thread
+// descriptor) and pass arg on to __kmp_aux_set_blocktime.
+void kmpc_set_blocktime(int arg) {
+  const int gtid = __kmp_entry_gtid();
+  const int tid = __kmp_tid_from_gtid(gtid);
+  kmp_info_t *this_thr = __kmp_thread_from_gtid(gtid);
+
+  __kmp_aux_set_blocktime(arg, this_thr, tid);
+}
+
+// Pass arg, cast to enum library_type, to __kmp_user_set_library. The value is
+// not validated here.
+void kmpc_set_library(int arg) {
+  // __kmp_user_set_library initializes the library if needed
+  __kmp_user_set_library((enum library_type)arg);
+}
+
+// Pass str and its length (KMP_STRLEN, so str must be a non-null terminated
+// string) to __kmp_aux_set_defaults.
+void kmpc_set_defaults(char const *str) {
+  // __kmp_aux_set_defaults initializes the library if needed
+  __kmp_aux_set_defaults(str, KMP_STRLEN(str));
+}
+
+// Set __kmp_dispatch_num_buffers to arg, but only before serial
+// initialization and only for positive values; otherwise do nothing.
+void kmpc_set_disp_num_buffers(int arg) {
+  // ignore after initialization because some teams have already
+  // allocated dispatch buffers
+  if (__kmp_init_serial != 0) {
+    return;
+  }
+  if (arg <= 0) {
+    return;
+  }
+  __kmp_dispatch_num_buffers = arg;
+}
+
+// Forward to __kmp_aux_set_affinity_mask_proc after ensuring middle
+// initialization; returns -1 in stub builds or when affinity is unsupported.
+int kmpc_set_affinity_mask_proc(int proc, void **mask) {
+#if !defined(KMP_STUB) && KMP_AFFINITY_SUPPORTED
+  if (!TCR_4(__kmp_init_middle))
+    __kmp_middle_initialize();
+  return __kmp_aux_set_affinity_mask_proc(proc, mask);
+#else
+  return -1;
+#endif
+}
+
+// Forward to __kmp_aux_unset_affinity_mask_proc after ensuring middle
+// initialization; returns -1 in stub builds or when affinity is unsupported.
+int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
+#if !defined(KMP_STUB) && KMP_AFFINITY_SUPPORTED
+  if (!TCR_4(__kmp_init_middle))
+    __kmp_middle_initialize();
+  return __kmp_aux_unset_affinity_mask_proc(proc, mask);
+#else
+  return -1;
+#endif
+}
+
+// Forward to __kmp_aux_get_affinity_mask_proc after ensuring middle
+// initialization; returns -1 in stub builds or when affinity is unsupported.
+int kmpc_get_affinity_mask_proc(int proc, void **mask) {
+#if !defined(KMP_STUB) && KMP_AFFINITY_SUPPORTED
+  if (!TCR_4(__kmp_init_middle))
+    __kmp_middle_initialize();
+  return __kmp_aux_get_affinity_mask_proc(proc, mask);
+#else
+  return -1;
+#endif
+}
+
+/* -------------------------------------------------------------------------- */
+/*!
+@ingroup THREADPRIVATE
+@param loc       source location information
+@param gtid      global thread number
+@param cpy_size  size of the cpy_data buffer
+@param cpy_data  pointer to data to be copied
+@param cpy_func  helper function to call for copying data
+@param didit     flag variable: 1=single thread; 0=not single thread
+
+__kmpc_copyprivate implements the interface for the private data broadcast
+needed for the copyprivate clause associated with a single region in an
+OpenMP<sup>*</sup> program (both C and Fortran).
+All threads participating in the parallel region call this routine.
+One of the threads (called the single thread) should have the <tt>didit</tt>
+variable set to 1 and all other threads should have that variable set to 0.
+All threads pass a pointer to a data buffer (cpy_data) that they have built.
+
+The OpenMP specification forbids the use of nowait on the single region when a
+copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
+barrier internally to avoid race conditions, so the code generation for the
+single region should avoid generating a barrier after the call to @ref
+__kmpc_copyprivate.
+
+The <tt>gtid</tt> parameter is the global thread id for the current thread.
+The <tt>loc</tt> parameter is a pointer to source location information.
+
+Internal implementation: The single thread will first copy its descriptor
+address (cpy_data) to a team-private location, then the other threads will each
+call the function pointed to by the parameter cpy_func, which carries out the
+copy by copying the data using the cpy_data buffer.
+
+The cpy_func routine used for the copy and the contents of the data area defined
+by cpy_data and cpy_size may be built in any fashion that will allow the copy
+to be done. For instance, the cpy_data buffer can hold the actual data to be
+copied or it may hold a list of pointers to the data. The cpy_func routine must
+interpret the cpy_data buffer appropriately.
+
+The interface to cpy_func is as follows:
+@code
+void cpy_func( void *destination, void *source )
+@endcode
+where void *destination is the cpy_data pointer for the thread being copied to
+and void *source is the cpy_data pointer for the thread being copied from.
+*/
+void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
+                        void *cpy_data, void (*cpy_func)(void *, void *),
+                        kmp_int32 didit) {
+  // Team-wide slot through which the single thread publishes its cpy_data.
+  void **data_ptr;
+
+  KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));
+
+  KMP_MB();
+
+  data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;
+
+  if (__kmp_env_consistency_check) {
+    if (loc == 0) {
+      KMP_WARNING(ConstructIdentInvalid);
+    }
+  }
+
+  // ToDo: Optimize the following two barriers into some kind of split barrier
+
+  // The single thread publishes its buffer before the first barrier.
+  if (didit)
+    *data_ptr = cpy_data;
+
+#if OMPT_SUPPORT
+  ompt_frame_t *ompt_frame;
+  if (ompt_enabled.enabled) {
+    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+    if (ompt_frame->enter_frame.ptr == NULL)
+      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+  }
+#endif
+/* This barrier is not a barrier region boundary */
+#if USE_ITT_NOTIFY
+  __kmp_threads[gtid]->th.th_ident = loc;
+#endif
+  // First barrier: after it, every thread can read the published pointer.
+  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+
+  // Every other thread copies from the published buffer into its own.
+  if (!didit)
+    (*cpy_func)(cpy_data, *data_ptr);
+
+// Consider next barrier a user-visible barrier for barrier region boundaries
+// Nesting checks are already handled by the single construct checks
+
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+  }
+#endif
+#if USE_ITT_NOTIFY
+  // TODO: check if it is needed (e.g. tasks can overwrite the location)
+  __kmp_threads[gtid]->th.th_ident = loc;
+#endif
+  // Second barrier: no thread returns until all copies have completed.
+  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    ompt_frame->enter_frame = ompt_data_none;
+  }
+#endif
+}
+
+/* -------------------------------------------------------------------------- */
+
+// Short aliases for the "*_with_checks" user-lock routines, used by the lock
+// entry points in this file (e.g. INIT_LOCK in __kmpc_init_lock when
+// KMP_USE_DYNAMIC_LOCK is off).
+#define INIT_LOCK __kmp_init_user_lock_with_checks
+#define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
+#define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
+#define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
+#define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
+#define ACQUIRE_NESTED_LOCK_TIMED                                              \
+  __kmp_acquire_nested_user_lock_with_checks_timed
+#define RELEASE_LOCK __kmp_release_user_lock_with_checks
+#define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
+#define TEST_LOCK __kmp_test_user_lock_with_checks
+#define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
+#define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
+#define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
+
+// TODO: Make check abort messages use location info & pass it into
+// with_checks routines
+
+#if KMP_USE_DYNAMIC_LOCK
+
+// internal lock initializer
+// Initializes *lock as a direct lock when seq is a direct-lock sequence
+// (KMP_IS_D_LOCK), otherwise as an indirect lock, and reports the creation to
+// ITT when USE_ITT_BUILD is set.
+static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
+                                                    kmp_dyna_lockseq_t seq) {
+  if (KMP_IS_D_LOCK(seq)) {
+    KMP_INIT_D_LOCK(lock, seq);
+#if USE_ITT_BUILD
+    // NOTE(review): loc is not forwarded here (NULL is passed), unlike the
+    // indirect branch below -- confirm this is intentional.
+    __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
+#endif
+  } else {
+    KMP_INIT_I_LOCK(lock, seq);
+#if USE_ITT_BUILD
+    // For indirect locks ITT is given the underlying lock object.
+    kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
+    __kmp_itt_lock_creating(ilk->lock, loc);
+#endif
+  }
+}
+
+// internal nest lock initializer
+// Maps seq to its nested counterpart and initializes *lock as an indirect
+// lock of that kind. Sequences without an explicit mapping below become
+// lockseq_nested_queuing.
+static __forceinline void
+__kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
+                               kmp_dyna_lockseq_t seq) {
+#if KMP_USE_TSX
+  // Don't have nested lock implementation for speculative locks
+  // so fall back to the default user lock sequence before mapping.
+  if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
+    seq = __kmp_user_lock_seq;
+#endif
+  switch (seq) {
+  case lockseq_tas:
+    seq = lockseq_nested_tas;
+    break;
+#if KMP_USE_FUTEX
+  case lockseq_futex:
+    seq = lockseq_nested_futex;
+    break;
+#endif
+  case lockseq_ticket:
+    seq = lockseq_nested_ticket;
+    break;
+  case lockseq_queuing:
+    seq = lockseq_nested_queuing;
+    break;
+  case lockseq_drdpa:
+    seq = lockseq_nested_drdpa;
+    break;
+  default:
+    seq = lockseq_nested_queuing;
+  }
+  // Nested locks are always initialized through the indirect-lock path.
+  KMP_INIT_I_LOCK(lock, seq);
+#if USE_ITT_BUILD
+  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
+  __kmp_itt_lock_creating(ilk->lock, loc);
+#endif
+}
+
+/* initialize the lock with a hint */
+// The hint is mapped to a lock sequence by __kmp_map_hint_to_lock and the lock
+// is initialized with it; a lock_init OMPT callback is raised if registered.
+// A NULL user_lock is fatal only when consistency checking is enabled.
+void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
+                                uintptr_t hint) {
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+  if (__kmp_env_consistency_check && user_lock == NULL) {
+    KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
+  }
+
+  __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // Use the return address loaded for this gtid if there is one (the case when
+  // called from omp_init_lock_with_hint); otherwise fall back to this
+  // function's own return address.
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_lock_init) {
+    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
+        ompt_mutex_lock, (omp_lock_hint_t)hint,
+        __ompt_get_mutex_impl_type(user_lock),
+        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
+  }
+#endif
+}
+
+/* initialize the nest lock with a hint */
+// The hint is mapped to a lock sequence by __kmp_map_hint_to_lock and the nest
+// lock is initialized with it; a lock_init OMPT callback is raised if
+// registered. A NULL user_lock is fatal only when consistency checking is
+// enabled.
+void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
+                                     void **user_lock, uintptr_t hint) {
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+  if (__kmp_env_consistency_check && user_lock == NULL) {
+    KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
+  }
+
+  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // Use the return address loaded for this gtid if there is one; otherwise
+  // fall back to this function's own return address.
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_lock_init) {
+    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
+        ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
+        __ompt_get_mutex_impl_type(user_lock),
+        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
+  }
+#endif
+}
+
+#endif // KMP_USE_DYNAMIC_LOCK
+
+/* initialize the lock */
+// With dynamic locks the lock is initialized from the default sequence
+// __kmp_user_lock_seq; otherwise it is either placed in user_lock itself or
+// allocated, depending on the lock kind and size. In both configurations a
+// lock_init OMPT callback is raised if registered.
+void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
+#if KMP_USE_DYNAMIC_LOCK
+
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+  if (__kmp_env_consistency_check && user_lock == NULL) {
+    KMP_FATAL(LockIsUninitialized, "omp_init_lock");
+  }
+  __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // Use the return address loaded for this gtid if there is one; otherwise
+  // fall back to this function's own return address.
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_lock_init) {
+    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
+        ompt_mutex_lock, omp_lock_hint_none,
+        __ompt_get_mutex_impl_type(user_lock),
+        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
+  }
+#endif
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+  static char const *const func = "omp_init_lock";
+  kmp_user_lock_p lck;
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+
+  if (__kmp_env_consistency_check) {
+    if (user_lock == NULL) {
+      KMP_FATAL(LockIsUninitialized, func);
+    }
+  }
+
+  KMP_CHECK_USER_LOCK_INIT();
+
+  // TAS/futex locks whose poll word fits in OMP_LOCK_T_SIZE are stored in
+  // user_lock itself; every other case allocates a lock object.
+  if ((__kmp_user_lock_kind == lk_tas) &&
+      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
+    lck = (kmp_user_lock_p)user_lock;
+  }
+#if KMP_USE_FUTEX
+  else if ((__kmp_user_lock_kind == lk_futex) &&
+           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
+    lck = (kmp_user_lock_p)user_lock;
+  }
+#endif
+  else {
+    lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
+  }
+  INIT_LOCK(lck);
+  __kmp_set_user_lock_location(lck, loc);
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // Use the return address loaded for this gtid if there is one; otherwise
+  // fall back to this function's own return address.
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_lock_init) {
+    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
+        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
+        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
+  }
+#endif
+
+#if USE_ITT_BUILD
+  __kmp_itt_lock_creating(lck);
+#endif /* USE_ITT_BUILD */
+
+#endif // KMP_USE_DYNAMIC_LOCK
+} // __kmpc_init_lock
+
+// Initialize the nestable lock held in user_lock. The routine identifies
+// itself as "omp_init_nest_lock" in its diagnostics.
+//   loc        source location; attached to the lock in the non-dynamic build
+//   gtid       global thread id of the calling thread
+//   user_lock  the user's lock storage; NULL is a fatal error when
+//              consistency checking is enabled
+void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
+#if KMP_USE_DYNAMIC_LOCK
+
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+  if (__kmp_env_consistency_check) {
+    if (user_lock == NULL) {
+      KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
+    }
+  }
+  // Initialize with __kmp_user_lock_seq as the lock sequence.
+  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // Code address handed to the tool: the value from OMPT_LOAD_RETURN_ADDRESS
+  // when it is non-NULL, otherwise OMPT_GET_RETURN_ADDRESS(0).
+  void *ret_addr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (ret_addr == NULL)
+    ret_addr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_lock_init) {
+    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
+        ompt_mutex_nest_lock, omp_lock_hint_none,
+        __ompt_get_mutex_impl_type(user_lock),
+        (ompt_wait_id_t)(uintptr_t)user_lock, ret_addr);
+  }
+#endif
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+  kmp_user_lock_p lck;
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+
+  if (__kmp_env_consistency_check && user_lock == NULL) {
+    KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
+  }
+
+  KMP_CHECK_USER_LOCK_INIT();
+
+  // The user's storage is used directly when the poll word plus the nesting
+  // depth counter of the selected lock kind fit in OMP_NEST_LOCK_T_SIZE;
+  // otherwise the lock object comes from __kmp_user_lock_allocate.
+  int in_place =
+      (__kmp_user_lock_kind == lk_tas) &&
+      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
+       OMP_NEST_LOCK_T_SIZE);
+#if KMP_USE_FUTEX
+  if (!in_place) {
+    in_place =
+        (__kmp_user_lock_kind == lk_futex) &&
+        (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
+         OMP_NEST_LOCK_T_SIZE);
+  }
+#endif
+  if (in_place)
+    lck = (kmp_user_lock_p)user_lock;
+  else
+    lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
+
+  INIT_NESTED_LOCK(lck);
+  __kmp_set_user_lock_location(lck, loc);
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // Code address handed to the tool: the value from OMPT_LOAD_RETURN_ADDRESS
+  // when it is non-NULL, otherwise OMPT_GET_RETURN_ADDRESS(0).
+  void *ret_addr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (ret_addr == NULL)
+    ret_addr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_lock_init) {
+    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
+        ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
+        (ompt_wait_id_t)(uintptr_t)user_lock, ret_addr);
+  }
+#endif
+
+#if USE_ITT_BUILD
+  __kmp_itt_lock_creating(lck);
+#endif /* USE_ITT_BUILD */
+
+#endif // KMP_USE_DYNAMIC_LOCK
+} // __kmpc_init_nest_lock
+
+// Destroy the simple (non-nestable) lock held in user_lock. Uses
+// "omp_destroy_lock" as the routine name when looking the lock up in the
+// non-dynamic build.
+//   loc        source location (not used by this routine)
+//   gtid       global thread id of the calling thread
+//   user_lock  the user's lock storage
+void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
+#if KMP_USE_DYNAMIC_LOCK
+
+#if USE_ITT_BUILD
+  // ITT is given the underlying lock object: a direct-lock tag of 0 means the
+  // lock is looked up as an indirect lock, otherwise the user's storage is the
+  // lock itself.
+  kmp_user_lock_p lck;
+  if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
+    lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
+  } else {
+    lck = (kmp_user_lock_p)user_lock;
+  }
+  __kmp_itt_lock_destroyed(lck);
+#endif
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // Code address handed to the tool: the value from OMPT_LOAD_RETURN_ADDRESS
+  // when it is non-NULL, otherwise OMPT_GET_RETURN_ADDRESS(0).
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_lock_destroy) {
+    // The callback identifies the lock by the user_lock address only, so no
+    // lookup of the underlying lock object is needed here.
+    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
+        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
+  }
+#endif
+  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
+#else
+  kmp_user_lock_p lck;
+
+  // Locks whose poll word fits in OMP_LOCK_T_SIZE live in the user's storage;
+  // any other lock is found through __kmp_lookup_user_lock.
+  if ((__kmp_user_lock_kind == lk_tas) &&
+      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
+    lck = (kmp_user_lock_p)user_lock;
+  }
+#if KMP_USE_FUTEX
+  else if ((__kmp_user_lock_kind == lk_futex) &&
+           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
+    lck = (kmp_user_lock_p)user_lock;
+  }
+#endif
+  else {
+    lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
+  }
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // Code address handed to the tool: the value from OMPT_LOAD_RETURN_ADDRESS
+  // when it is non-NULL, otherwise OMPT_GET_RETURN_ADDRESS(0).
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_lock_destroy) {
+    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
+        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
+  }
+#endif
+
+#if USE_ITT_BUILD
+  __kmp_itt_lock_destroyed(lck);
+#endif /* USE_ITT_BUILD */
+  DESTROY_LOCK(lck);
+
+  // Only locks that were not stored in place are handed to
+  // __kmp_user_lock_free; in-place locks need nothing further.
+  if ((__kmp_user_lock_kind == lk_tas) &&
+      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
+    ;
+  }
+#if KMP_USE_FUTEX
+  else if ((__kmp_user_lock_kind == lk_futex) &&
+           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
+    ;
+  }
+#endif
+  else {
+    __kmp_user_lock_free(user_lock, gtid, lck);
+  }
+#endif // KMP_USE_DYNAMIC_LOCK
+} // __kmpc_destroy_lock
+
+// Destroy the nestable lock held in user_lock. Uses "omp_destroy_nest_lock"
+// as the routine name when looking the lock up in the non-dynamic build.
+//   loc        source location (not used by this routine)
+//   gtid       global thread id of the calling thread
+//   user_lock  the user's lock storage
+void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
+#if KMP_USE_DYNAMIC_LOCK
+
+#if USE_ITT_BUILD
+  // ITT is told about the lock object behind the indirect-lock entry.
+  kmp_indirect_lock_t *indirect = KMP_LOOKUP_I_LOCK(user_lock);
+  __kmp_itt_lock_destroyed(indirect->lock);
+#endif
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // Code address handed to the tool: the value from OMPT_LOAD_RETURN_ADDRESS
+  // when it is non-NULL, otherwise OMPT_GET_RETURN_ADDRESS(0).
+  void *ret_addr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (ret_addr == NULL)
+    ret_addr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_lock_destroy) {
+    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
+        ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, ret_addr);
+  }
+#endif
+  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+  kmp_user_lock_p lck;
+
+  // A lock lives in the user's storage when the poll word plus the nesting
+  // depth counter of the selected lock kind fit in OMP_NEST_LOCK_T_SIZE; any
+  // other lock is found through __kmp_lookup_user_lock and must be freed
+  // after it is destroyed.
+  int in_place =
+      (__kmp_user_lock_kind == lk_tas) &&
+      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
+       OMP_NEST_LOCK_T_SIZE);
+#if KMP_USE_FUTEX
+  if (!in_place) {
+    in_place =
+        (__kmp_user_lock_kind == lk_futex) &&
+        (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
+         OMP_NEST_LOCK_T_SIZE);
+  }
+#endif
+  if (in_place)
+    lck = (kmp_user_lock_p)user_lock;
+  else
+    lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // Code address handed to the tool: the value from OMPT_LOAD_RETURN_ADDRESS
+  // when it is non-NULL, otherwise OMPT_GET_RETURN_ADDRESS(0).
+  void *ret_addr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (ret_addr == NULL)
+    ret_addr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_lock_destroy) {
+    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
+        ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, ret_addr);
+  }
+#endif
+
+#if USE_ITT_BUILD
+  __kmp_itt_lock_destroyed(lck);
+#endif /* USE_ITT_BUILD */
+
+  DESTROY_NESTED_LOCK(lck);
+
+  if (!in_place) {
+    __kmp_user_lock_free(user_lock, gtid, lck);
+  }
+#endif // KMP_USE_DYNAMIC_LOCK
+} // __kmpc_destroy_nest_lock
+
+// Acquire the simple (non-nestable) lock held in user_lock. Uses
+// "omp_set_lock" as the routine name when looking the lock up in the
+// non-dynamic build. ITT and OMPT are notified before and after the
+// acquisition.
+//   loc        source location (not used by this routine)
+//   gtid       global thread id of the calling thread
+//   user_lock  the user's lock storage
+void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
+  KMP_COUNT_BLOCK(OMP_set_lock);
+#if KMP_USE_DYNAMIC_LOCK
+  int tag = KMP_EXTRACT_D_TAG(user_lock);
+#if USE_ITT_BUILD
+  // The ITT hook is given user_lock as-is; per the original note it resolves
+  // the actual lock object itself.
+  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
+#endif
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // Code address handed to the tool: the value from OMPT_LOAD_RETURN_ADDRESS
+  // when it is non-NULL, otherwise OMPT_GET_RETURN_ADDRESS(0).
+  void *ret_addr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (ret_addr == NULL)
+    ret_addr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_mutex_acquire) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+        ompt_mutex_lock, omp_lock_hint_none,
+        __ompt_get_mutex_impl_type(user_lock),
+        (ompt_wait_id_t)(uintptr_t)user_lock, ret_addr);
+  }
+#endif
+  // With consistency checking off, an inlined TAS or futex acquire is used
+  // when the build enables one and the tag matches; everything else goes
+  // through the per-tag __kmp_direct_set table.
+#if KMP_USE_INLINED_TAS
+  if (tag == locktag_tas && !__kmp_env_consistency_check) {
+    KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
+  } else
+#elif KMP_USE_INLINED_FUTEX
+  if (tag == locktag_futex && !__kmp_env_consistency_check) {
+    KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
+  } else
+#endif
+  {
+    __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
+  }
+#if USE_ITT_BUILD
+  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
+#endif
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.ompt_callback_mutex_acquired) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
+        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, ret_addr);
+  }
+#endif
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+  kmp_user_lock_p lck;
+
+  // Locks whose poll word fits in OMP_LOCK_T_SIZE live in the user's storage;
+  // any other lock is found through __kmp_lookup_user_lock.
+  int in_place = (__kmp_user_lock_kind == lk_tas) &&
+                 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE);
+#if KMP_USE_FUTEX
+  if (!in_place) {
+    in_place = (__kmp_user_lock_kind == lk_futex) &&
+               (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE);
+  }
+#endif
+  if (in_place)
+    lck = (kmp_user_lock_p)user_lock;
+  else
+    lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
+
+#if USE_ITT_BUILD
+  __kmp_itt_lock_acquiring(lck);
+#endif /* USE_ITT_BUILD */
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // Code address handed to the tool: the value from OMPT_LOAD_RETURN_ADDRESS
+  // when it is non-NULL, otherwise OMPT_GET_RETURN_ADDRESS(0).
+  void *ret_addr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (ret_addr == NULL)
+    ret_addr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_mutex_acquire) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
+        (ompt_wait_id_t)(uintptr_t)lck, ret_addr);
+  }
+#endif
+
+  ACQUIRE_LOCK(lck, gtid);
+
+#if USE_ITT_BUILD
+  __kmp_itt_lock_acquired(lck);
+#endif /* USE_ITT_BUILD */
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.ompt_callback_mutex_acquired) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
+        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, ret_addr);
+  }
+#endif
+
+#endif // KMP_USE_DYNAMIC_LOCK
+}
+
+// Acquire the nestable lock held in user_lock. Uses "omp_set_nest_lock" as
+// the routine name when looking the lock up in the non-dynamic build. After
+// the acquisition OMPT receives either a mutex_acquired event (status is
+// KMP_LOCK_ACQUIRED_FIRST) or a nest_lock scope-begin event (any other
+// status).
+//   loc        source location (not used by this routine)
+//   gtid       global thread id of the calling thread
+//   user_lock  the user's lock storage
+void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
+#if KMP_USE_DYNAMIC_LOCK
+
+#if USE_ITT_BUILD
+  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
+#endif
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // Code address handed to the tool: the value from OMPT_LOAD_RETURN_ADDRESS
+  // when it is non-NULL, otherwise OMPT_GET_RETURN_ADDRESS(0).
+  void *ret_addr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (ret_addr == NULL)
+    ret_addr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.enabled && ompt_enabled.ompt_callback_mutex_acquire) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+        ompt_mutex_nest_lock, omp_lock_hint_none,
+        __ompt_get_mutex_impl_type(user_lock),
+        (ompt_wait_id_t)(uintptr_t)user_lock, ret_addr);
+  }
+#endif
+  int acquire_status =
+      KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
+  (void) acquire_status; // only read when OMPT support is compiled in
+#if USE_ITT_BUILD
+  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
+#endif
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    const int first_acquire = (acquire_status == KMP_LOCK_ACQUIRED_FIRST);
+    if (first_acquire && ompt_enabled.ompt_callback_mutex_acquired) {
+      // lock_first
+      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
+          ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
+          ret_addr);
+    } else if (!first_acquire && ompt_enabled.ompt_callback_nest_lock) {
+      // lock_next
+      ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
+          ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, ret_addr);
+    }
+  }
+#endif
+
+#else // KMP_USE_DYNAMIC_LOCK
+  int acquire_status;
+  kmp_user_lock_p lck;
+
+  // A lock lives in the user's storage when the poll word plus the nesting
+  // depth counter of the selected lock kind fit in OMP_NEST_LOCK_T_SIZE; any
+  // other lock is found through __kmp_lookup_user_lock.
+  int in_place =
+      (__kmp_user_lock_kind == lk_tas) &&
+      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
+       OMP_NEST_LOCK_T_SIZE);
+#if KMP_USE_FUTEX
+  if (!in_place) {
+    in_place =
+        (__kmp_user_lock_kind == lk_futex) &&
+        (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
+         OMP_NEST_LOCK_T_SIZE);
+  }
+#endif
+  if (in_place)
+    lck = (kmp_user_lock_p)user_lock;
+  else
+    lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
+
+#if USE_ITT_BUILD
+  __kmp_itt_lock_acquiring(lck);
+#endif /* USE_ITT_BUILD */
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // Code address handed to the tool: the value from OMPT_LOAD_RETURN_ADDRESS
+  // when it is non-NULL, otherwise OMPT_GET_RETURN_ADDRESS(0).
+  void *ret_addr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (ret_addr == NULL)
+    ret_addr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.enabled && ompt_enabled.ompt_callback_mutex_acquire) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+        ompt_mutex_nest_lock, omp_lock_hint_none,
+        __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
+        ret_addr);
+  }
+#endif
+
+  ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);
+
+#if USE_ITT_BUILD
+  __kmp_itt_lock_acquired(lck);
+#endif /* USE_ITT_BUILD */
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    const int first_acquire = (acquire_status == KMP_LOCK_ACQUIRED_FIRST);
+    if (first_acquire && ompt_enabled.ompt_callback_mutex_acquired) {
+      // lock_first
+      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
+          ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, ret_addr);
+    } else if (!first_acquire && ompt_enabled.ompt_callback_nest_lock) {
+      // lock_next
+      ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
+          ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, ret_addr);
+    }
+  }
+#endif
+
+#endif // KMP_USE_DYNAMIC_LOCK
+}
+
+// Release the simple (non-nestable) lock held in user_lock. Uses
+// "omp_unset_lock" as the routine name when looking the lock up in the
+// non-dynamic build. ITT is notified before the release and OMPT after it.
+//   loc        source location (not used by this routine)
+//   gtid       global thread id of the calling thread
+//   user_lock  the user's lock storage
+void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
+#if KMP_USE_DYNAMIC_LOCK
+
+  int tag = KMP_EXTRACT_D_TAG(user_lock);
+#if USE_ITT_BUILD
+  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
+#endif
+  // With consistency checking off, an inlined TAS or futex release is used
+  // when the build enables one and the tag matches; everything else goes
+  // through the per-tag __kmp_direct_unset table.
+#if KMP_USE_INLINED_TAS
+  if (tag == locktag_tas && !__kmp_env_consistency_check) {
+    KMP_RELEASE_TAS_LOCK(user_lock, gtid);
+  } else
+#elif KMP_USE_INLINED_FUTEX
+  if (tag == locktag_futex && !__kmp_env_consistency_check) {
+    KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
+  } else
+#endif
+  {
+    __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
+  }
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // Code address handed to the tool: the value from OMPT_LOAD_RETURN_ADDRESS
+  // when it is non-NULL, otherwise OMPT_GET_RETURN_ADDRESS(0).
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_mutex_released) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
+        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
+  }
+#endif
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+  kmp_user_lock_p lck;
+
+  /* Can't use serial interval since not block structured */
+  /* release the lock */
+
+  if ((__kmp_user_lock_kind == lk_tas) &&
+      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
+#if KMP_OS_LINUX &&                                                            \
+    (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+// "fast" path implemented to fix customer performance issue: the TAS poll
+// word in the user's storage is cleared directly and the routine returns.
+#if USE_ITT_BUILD
+    __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
+#endif /* USE_ITT_BUILD */
+    TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
+    KMP_MB();
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    // Code address handed to the tool: the value from
+    // OMPT_LOAD_RETURN_ADDRESS when it is non-NULL, otherwise
+    // OMPT_GET_RETURN_ADDRESS(0).
+    void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+    if (!codeptr)
+      codeptr = OMPT_GET_RETURN_ADDRESS(0);
+    if (ompt_enabled.ompt_callback_mutex_released) {
+      // lck is never assigned on this path; the lock is the user's storage,
+      // so user_lock is the wait id (the same value lck would hold for an
+      // in-place lock).
+      ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
+          ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
+    }
+#endif
+
+    return;
+#else
+    lck = (kmp_user_lock_p)user_lock;
+#endif
+  }
+#if KMP_USE_FUTEX
+  else if ((__kmp_user_lock_kind == lk_futex) &&
+           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
+    lck = (kmp_user_lock_p)user_lock;
+  }
+#endif
+  else {
+    lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
+  }
+
+#if USE_ITT_BUILD
+  __kmp_itt_lock_releasing(lck);
+#endif /* USE_ITT_BUILD */
+
+  RELEASE_LOCK(lck, gtid);
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // Code address handed to the tool: the value from OMPT_LOAD_RETURN_ADDRESS
+  // when it is non-NULL, otherwise OMPT_GET_RETURN_ADDRESS(0).
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_mutex_released) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
+        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
+  }
+#endif
+
+#endif // KMP_USE_DYNAMIC_LOCK
+}
+
+// Release one nesting level of the nestable lock held in user_lock. Uses
+// "omp_unset_nest_lock" as the routine name when looking the lock up in the
+// non-dynamic build. OMPT receives a mutex_released event when the release
+// status is KMP_LOCK_RELEASED and a nest_lock scope-end event otherwise.
+//   loc        source location (not used by this routine)
+//   gtid       global thread id of the calling thread
+//   user_lock  the user's lock storage
+void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
+#if KMP_USE_DYNAMIC_LOCK
+
+#if USE_ITT_BUILD
+  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
+#endif
+  int release_status =
+      KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
+  (void) release_status; // only read when OMPT support is compiled in
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // Code address handed to the tool: the value from OMPT_LOAD_RETURN_ADDRESS
+  // when it is non-NULL, otherwise OMPT_GET_RETURN_ADDRESS(0).
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.enabled) {
+    if (release_status == KMP_LOCK_RELEASED) {
+      if (ompt_enabled.ompt_callback_mutex_released) {
+        // release_lock_last
+        ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
+            ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
+            codeptr);
+      }
+    } else if (ompt_enabled.ompt_callback_nest_lock) {
+      // release_lock_prev
+      ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
+          ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
+    }
+  }
+#endif
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+  kmp_user_lock_p lck;
+
+  /* Can't use serial interval since not block structured */
+
+  if ((__kmp_user_lock_kind == lk_tas) &&
+      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
+       OMP_NEST_LOCK_T_SIZE)) {
+#if KMP_OS_LINUX &&                                                            \
+    (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+    // "fast" path implemented to fix customer performance issue: the nesting
+    // depth in the user's storage is decremented directly, and the poll word
+    // is cleared once the depth reaches zero.
+    kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
+#if USE_ITT_BUILD
+    __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
+#endif /* USE_ITT_BUILD */
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    int release_status = KMP_LOCK_STILL_HELD;
+#endif
+
+    if (--(tl->lk.depth_locked) == 0) {
+      TCW_4(tl->lk.poll, 0);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+      release_status = KMP_LOCK_RELEASED;
+#endif
+    }
+    KMP_MB();
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    // Code address handed to the tool: the value from
+    // OMPT_LOAD_RETURN_ADDRESS when it is non-NULL, otherwise
+    // OMPT_GET_RETURN_ADDRESS(0).
+    void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+    if (!codeptr)
+      codeptr = OMPT_GET_RETURN_ADDRESS(0);
+    // lck is never assigned on this path; the lock is the user's storage, so
+    // user_lock is the wait id (the same value lck would hold for an in-place
+    // lock).
+    if (ompt_enabled.enabled) {
+      if (release_status == KMP_LOCK_RELEASED) {
+        if (ompt_enabled.ompt_callback_mutex_released) {
+          // release_lock_last
+          ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
+              ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
+              codeptr);
+        }
+      } else if (ompt_enabled.ompt_callback_nest_lock) {
+        // release_lock_previous
+        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
+            ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
+      }
+    }
+#endif
+
+    return;
+#else
+    lck = (kmp_user_lock_p)user_lock;
+#endif
+  }
+#if KMP_USE_FUTEX
+  else if ((__kmp_user_lock_kind == lk_futex) &&
+           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
+            OMP_NEST_LOCK_T_SIZE)) {
+    lck = (kmp_user_lock_p)user_lock;
+  }
+#endif
+  else {
+    lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
+  }
+
+#if USE_ITT_BUILD
+  __kmp_itt_lock_releasing(lck);
+#endif /* USE_ITT_BUILD */
+
+  int release_status;
+  release_status = RELEASE_NESTED_LOCK(lck, gtid);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // Code address handed to the tool: the value from OMPT_LOAD_RETURN_ADDRESS
+  // when it is non-NULL, otherwise OMPT_GET_RETURN_ADDRESS(0).
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.enabled) {
+    if (release_status == KMP_LOCK_RELEASED) {
+      if (ompt_enabled.ompt_callback_mutex_released) {
+        // release_lock_last
+        ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
+            ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
+      }
+    } else if (ompt_enabled.ompt_callback_nest_lock) {
+      // release_lock_previous; same scope-endpoint value as the dynamic-lock
+      // branch above
+      ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
+          ompt_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
+    }
+  }
+#endif
+
+#endif // KMP_USE_DYNAMIC_LOCK
+}
+
+// Try to acquire the simple (non-nestable) lock held in user_lock without
+// blocking. Uses "omp_test_lock" as the routine name when looking the lock
+// up in the non-dynamic build.
+//   loc        source location (not used by this routine)
+//   gtid       global thread id of the calling thread
+//   user_lock  the user's lock storage
+// Returns FTN_TRUE when the test reports success, FTN_FALSE otherwise.
+int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
+  KMP_COUNT_BLOCK(OMP_test_lock);
+
+#if KMP_USE_DYNAMIC_LOCK
+  int rc;
+  int tag = KMP_EXTRACT_D_TAG(user_lock);
+#if USE_ITT_BUILD
+  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
+#endif
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // Code address handed to the tool: the value from OMPT_LOAD_RETURN_ADDRESS
+  // when it is non-NULL, otherwise OMPT_GET_RETURN_ADDRESS(0).
+  void *ret_addr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (ret_addr == NULL)
+    ret_addr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_mutex_acquire) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+        ompt_mutex_lock, omp_lock_hint_none,
+        __ompt_get_mutex_impl_type(user_lock),
+        (ompt_wait_id_t)(uintptr_t)user_lock, ret_addr);
+  }
+#endif
+  // With consistency checking off, an inlined TAS or futex test is used when
+  // the build enables one and the tag matches; everything else goes through
+  // the per-tag __kmp_direct_test table.
+#if KMP_USE_INLINED_TAS
+  if (tag == locktag_tas && !__kmp_env_consistency_check) {
+    KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
+  } else
+#elif KMP_USE_INLINED_FUTEX
+  if (tag == locktag_futex && !__kmp_env_consistency_check) {
+    KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
+  } else
+#endif
+  {
+    rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
+  }
+
+  if (!rc) {
+    // Not acquired: only ITT hears about the cancelled attempt.
+#if USE_ITT_BUILD
+    __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
+#endif
+    return FTN_FALSE;
+  }
+
+#if USE_ITT_BUILD
+  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
+#endif
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.ompt_callback_mutex_acquired) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
+        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, ret_addr);
+  }
+#endif
+  return FTN_TRUE;
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+  kmp_user_lock_p lck;
+  int rc;
+
+  // Locks whose poll word fits in OMP_LOCK_T_SIZE live in the user's storage;
+  // any other lock is found through __kmp_lookup_user_lock.
+  int in_place = (__kmp_user_lock_kind == lk_tas) &&
+                 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE);
+#if KMP_USE_FUTEX
+  if (!in_place) {
+    in_place = (__kmp_user_lock_kind == lk_futex) &&
+               (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE);
+  }
+#endif
+  if (in_place)
+    lck = (kmp_user_lock_p)user_lock;
+  else
+    lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
+
+#if USE_ITT_BUILD
+  __kmp_itt_lock_acquiring(lck);
+#endif /* USE_ITT_BUILD */
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // Code address handed to the tool: the value from OMPT_LOAD_RETURN_ADDRESS
+  // when it is non-NULL, otherwise OMPT_GET_RETURN_ADDRESS(0).
+  void *ret_addr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (ret_addr == NULL)
+    ret_addr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_mutex_acquire) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
+        (ompt_wait_id_t)(uintptr_t)lck, ret_addr);
+  }
+#endif
+
+  rc = TEST_LOCK(lck, gtid);
+#if USE_ITT_BUILD
+  if (rc) {
+    __kmp_itt_lock_acquired(lck);
+  } else {
+    __kmp_itt_lock_cancelled(lck);
+  }
+#endif /* USE_ITT_BUILD */
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
+        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, ret_addr);
+  }
+#endif
+
+  /* Can't use serial interval since not block structured */
+  if (rc) {
+    return FTN_TRUE;
+  }
+  return FTN_FALSE;
+
+#endif // KMP_USE_DYNAMIC_LOCK
+}
+
+// Try to acquire the nestable lock held in user_lock without blocking. Uses
+// "omp_test_nest_lock" as the routine name when looking the lock up in the
+// non-dynamic build.
+//   loc        source location (not used by this routine)
+//   gtid       global thread id of the calling thread
+//   user_lock  the user's lock storage
+// Returns the value produced by the underlying test operation; 0 is treated
+// as "not acquired", 1 as a first acquisition and any other non-zero value as
+// a nested re-acquisition when choosing the OMPT event.
+int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
+#if KMP_USE_DYNAMIC_LOCK
+  int rc;
+#if USE_ITT_BUILD
+  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
+#endif
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // Code address handed to the tool: the value from OMPT_LOAD_RETURN_ADDRESS
+  // when it is non-NULL, otherwise OMPT_GET_RETURN_ADDRESS(0).
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.ompt_callback_mutex_acquire) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+        ompt_mutex_nest_lock, omp_lock_hint_none,
+        __ompt_get_mutex_impl_type(user_lock),
+        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
+  }
+#endif
+  rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
+#if USE_ITT_BUILD
+  if (rc) {
+    __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
+  } else {
+    __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
+  }
+#endif
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled && rc) {
+    if (rc == 1) {
+      if (ompt_enabled.ompt_callback_mutex_acquired) {
+        // lock_first
+        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
+            ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
+            codeptr);
+      }
+    } else {
+      if (ompt_enabled.ompt_callback_nest_lock) {
+        // lock_next
+        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
+            ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
+      }
+    }
+  }
+#endif
+  return rc;
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+  kmp_user_lock_p lck;
+  int rc;
+
+  // A lock lives in the user's storage when the poll word plus the nesting
+  // depth counter of the selected lock kind fit in OMP_NEST_LOCK_T_SIZE; any
+  // other lock is found through __kmp_lookup_user_lock.
+  if ((__kmp_user_lock_kind == lk_tas) &&
+      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
+       OMP_NEST_LOCK_T_SIZE)) {
+    lck = (kmp_user_lock_p)user_lock;
+  }
+#if KMP_USE_FUTEX
+  else if ((__kmp_user_lock_kind == lk_futex) &&
+           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
+            OMP_NEST_LOCK_T_SIZE)) {
+    lck = (kmp_user_lock_p)user_lock;
+  }
+#endif
+  else {
+    lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
+  }
+
+#if USE_ITT_BUILD
+  __kmp_itt_lock_acquiring(lck);
+#endif /* USE_ITT_BUILD */
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // Code address handed to the tool: the value from OMPT_LOAD_RETURN_ADDRESS
+  // when it is non-NULL, otherwise OMPT_GET_RETURN_ADDRESS(0).
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+  if (!codeptr)
+    codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  if (ompt_enabled.enabled && ompt_enabled.ompt_callback_mutex_acquire) {
+    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
+        ompt_mutex_nest_lock, omp_lock_hint_none,
+        __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
+        codeptr);
+  }
+#endif
+
+  rc = TEST_NESTED_LOCK(lck, gtid);
+#if USE_ITT_BUILD
+  if (rc) {
+    __kmp_itt_lock_acquired(lck);
+  } else {
+    __kmp_itt_lock_cancelled(lck);
+  }
+#endif /* USE_ITT_BUILD */
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled && rc) {
+    if (rc == 1) {
+      if (ompt_enabled.ompt_callback_mutex_acquired) {
+        // lock_first
+        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
+            ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
+      }
+    } else {
+      if (ompt_enabled.ompt_callback_nest_lock) {
+        // lock_next; same scope-endpoint value as the dynamic-lock branch
+        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
+            ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
+      }
+    }
+  }
+#endif
+  return rc;
+
+/* Can't use serial interval since not block structured */
+
+#endif // KMP_USE_DYNAMIC_LOCK
+}
+
+// Interface to the fast, scalable reduction routines
+
+// The reduction method chosen for a thread is kept in that thread's
+// th_local.packed_reduction_method field for cross-function use; per the
+// original note it is read back in the __kmpc_end_reduce* functions.
+// Alternative considered: re-determine the method inside the
+// __kmpc_end_reduce* functions (needs a new prototype). AT: still undecided.
+#define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
+  ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))
+// Reads the packed_reduction_method field of thread gtid.
+#define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
+  (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)
+
+// For a description of the packed_reduction_method variable, see the macros
+// in kmp.h.
+
+// Enter the critical section that guards a reduce block: initialize the lock
+// stored in crit on first use, register the sync with the consistency
+// checker when that is enabled, then acquire the lock.
+//   loc         source location passed on to the consistency checker
+//   global_tid  global thread id of the calling thread
+//   crit        storage holding the critical-section lock (or a pointer to it)
+static __forceinline void
+__kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
+                                          kmp_critical_name *crit) {
+
+  // Note kept from the original author: this lock was visible to a customer
+  // and to the threading profile tool as a serial overhead span (although
+  // it's used for an internal purpose only).
+  //            why was it visible in previous implementation?
+  //            should we keep it visible in new reduce block?
+  kmp_user_lock_p lck;
+
+#if KMP_USE_DYNAMIC_LOCK
+
+  kmp_dyna_lock_t *lock_word = (kmp_dyna_lock_t *)crit;
+  // A zero word means the lock has not been initialized yet.
+  if (*lock_word == 0) {
+    if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
+      KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
+                                  KMP_GET_D_TAG(__kmp_user_lock_seq));
+    } else {
+      __kmp_init_indirect_csptr(crit, loc, global_tid,
+                                KMP_GET_I_TAG(__kmp_user_lock_seq));
+    }
+  }
+
+  // The lock table is not used for this lock, so direct and indirect locks
+  // have to be told apart here: a non-zero direct tag means the word itself
+  // is the lock, otherwise the word holds a pointer to an indirect lock.
+  const int is_direct = (KMP_EXTRACT_D_TAG(lock_word) != 0);
+  kmp_indirect_lock_t *indirect = NULL;
+  if (is_direct) {
+    lck = (kmp_user_lock_p)lock_word;
+  } else {
+    indirect = *((kmp_indirect_lock_t **)lock_word);
+    lck = indirect->lock;
+  }
+  KMP_DEBUG_ASSERT(lck != NULL);
+  if (__kmp_env_consistency_check) {
+    __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
+  }
+  if (is_direct) {
+    KMP_D_LOCK_FUNC(lock_word, set)(lock_word, global_tid);
+  } else {
+    KMP_I_LOCK_FUNC(indirect, set)(lck, global_tid);
+  }
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+  // We know that the fast reduction code is only emitted by Intel compilers
+  // with 32 byte critical sections. When the lock fits, crit itself is the
+  // lock; when it does not, the lock is reached through a pointer obtained
+  // from __kmp_get_critical_section_ptr.
+  lck = (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE)
+            ? (kmp_user_lock_p)crit
+            : __kmp_get_critical_section_ptr(crit, loc, global_tid);
+  KMP_DEBUG_ASSERT(lck != NULL);
+
+  if (__kmp_env_consistency_check) {
+    __kmp_push_sync(global_tid, ct_critical, loc, lck);
+  }
+
+  __kmp_acquire_user_lock_with_checks(lck, global_tid);
+
+#endif // KMP_USE_DYNAMIC_LOCK
+}
+
+// Leave the critical section that guards a reduce block: unregister the sync
+// from the consistency checker when that is enabled, then release the lock
+// stored in (or pointed to by) crit.
+//   loc         source location passed on to the consistency checker
+//   global_tid  global thread id of the calling thread
+//   crit        storage holding the critical-section lock (or a pointer to it)
+static __forceinline void
+__kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
+                                        kmp_critical_name *crit) {
+
+  kmp_user_lock_p lck;
+
+#if KMP_USE_DYNAMIC_LOCK
+
+  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
+    // Direct lock: crit itself is the lock word.
+    lck = (kmp_user_lock_p)crit;
+    if (__kmp_env_consistency_check)
+      __kmp_pop_sync(global_tid, ct_critical, loc);
+    KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
+  } else {
+    // Indirect lock: crit holds a pointer to the indirect lock entry.
+    kmp_indirect_lock_t *ilk =
+        (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
+    if (__kmp_env_consistency_check)
+      __kmp_pop_sync(global_tid, ct_critical, loc);
+    KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
+  }
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+  // We know that the fast reduction code is only emitted by Intel compilers
+  // with 32 byte critical sections. If there isn't enough space, then we have
+  // to use a pointer. The size test uses INTEL_CRITICAL_SIZE, the same limit
+  // as __kmp_enter_critical_section_reduce_block, so both routines always
+  // agree on whether crit holds the lock or a pointer to it.
+  if (__kmp_base_user_lock_size > INTEL_CRITICAL_SIZE) {
+    lck = *((kmp_user_lock_p *)crit);
+    KMP_ASSERT(lck != NULL);
+  } else {
+    lck = (kmp_user_lock_p)crit;
+  }
+
+  if (__kmp_env_consistency_check)
+    __kmp_pop_sync(global_tid, ct_critical, loc);
+
+  __kmp_release_user_lock_with_checks(lck, global_tid);
+
+#endif // KMP_USE_DYNAMIC_LOCK
+} // __kmp_end_critical_section_reduce_block
+
+// Temporarily re-homes a thread into its parent team when it performs a
+// reduction at the level of a teams construct.
+//
+// th         - thread descriptor to adjust
+// team_p     - receives the thread's current team whenever the thread is
+//              inside a teams construct (left untouched otherwise)
+// task_state - receives the thread's task state, only when a swap happens
+//
+// Returns 1 if the swap was performed (the caller must later undo it with
+// __kmp_restore_swapped_teams), 0 otherwise.
+static __forceinline int
+__kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
+                                     int *task_state) {
+  // Not inside a teams construct: nothing to do.
+  if (!th->th.th_teams_microtask)
+    return 0;
+
+  kmp_team_t *current = th->th.th_team;
+  *team_p = current;
+
+  // Only a reduction at the teams construct itself needs the swap.
+  if (current->t.t_level != th->th.th_teams_level)
+    return 0;
+
+  KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
+
+  // Make the thread look like a member of the parent team for the duration
+  // of the reduction.
+  kmp_team_t *parent = current->t.t_parent;
+  th->th.th_info.ds.ds_tid = current->t.t_master_tid;
+  th->th.th_team = parent;
+  th->th.th_team_nproc = parent->t.t_nproc;
+  th->th.th_task_team = parent->t.t_task_team[0];
+
+  // Hand the old task state back to the caller and reset it.
+  *task_state = th->th.th_task_state;
+  th->th.th_task_state = 0;
+
+  return 1;
+}
+
+// Reverts the changes made by __kmp_swap_teams_for_teams_reduction.
+//
+// th         - thread descriptor to restore
+// team       - the team the thread belonged to before the swap
+// task_state - the task state saved by the swap
+static __forceinline void
+__kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
+  // Team membership first ...
+  th->th.th_team = team;
+  th->th.th_team_nproc = team->t.t_nproc;
+  // ... the swap is only done for a thread whose tid was 0 ...
+  th->th.th_info.ds.ds_tid = 0;
+  // ... and finally the tasking state that was saved by the swap.
+  th->th.th_task_state = task_state;
+  th->th.th_task_team = team->t.t_task_team[task_state];
+}
+
+/* 2.a.i. Reduce Block without a terminating barrier */
+/*!
+@ingroup SYNCHRONIZATION
+@param loc source location information
+@param global_tid global thread number
+@param num_vars number of items (variables) to be reduced
+@param reduce_size size of data in bytes to be reduced
+@param reduce_data pointer to data to be reduced
+@param reduce_func callback function providing reduction operation on two
+operands and returning result of reduction in lhs_data
+@param lck pointer to the unique lock data structure
+@result 1 for the master thread, 0 for all other team threads, 2 for all team
+threads if atomic reduction needed. With the critical-section and empty
+reduction methods every calling thread receives 1.
+
+The nowait version is used for a reduce clause with the nowait argument.
+*/
+kmp_int32
+__kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
+                     size_t reduce_size, void *reduce_data,
+                     void (*reduce_func)(void *lhs_data, void *rhs_data),
+                     kmp_critical_name *lck) {
+
+  KMP_COUNT_BLOCK(REDUCE_nowait);
+  int retval = 0;
+  PACKED_REDUCTION_METHOD_T packed_reduction_method;
+  kmp_info_t *th;
+  kmp_team_t *team;
+  // task_state is only written and read when teams_swapped becomes non-zero
+  int teams_swapped = 0, task_state;
+  KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));
+
+  // why do we need this initialization here at all?
+  // Reduction clause can not be used as a stand-alone directive.
+
+  // do not call __kmp_serial_initialize(), it will be called by
+  // __kmp_parallel_initialize() if needed
+  // possible detection of false-positive race by the threadchecker ???
+  if (!TCR_4(__kmp_init_parallel))
+    __kmp_parallel_initialize();
+
+  __kmp_resume_if_soft_paused();
+
+// check correctness of reduce block nesting
+#if KMP_USE_DYNAMIC_LOCK
+  if (__kmp_env_consistency_check)
+    __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
+#else
+  if (__kmp_env_consistency_check)
+    __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
+#endif
+
+  // For a reduction at the teams construct the thread is temporarily moved to
+  // its parent team; this is undone right before returning.
+  th = __kmp_thread_from_gtid(global_tid);
+  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
+
+  // packed_reduction_method value will be reused by __kmp_end_reduce* function,
+  // the value should be kept in a variable
+  // the variable should be either a construct-specific or thread-specific
+  // property, not a team specific property
+  //     (a thread can reach the next reduce block on the next construct, reduce
+  //     method may differ on the next construct)
+  // an ident_t "loc" parameter could be used as a construct-specific property
+  // (what if loc == 0?)
+  //     (if both construct-specific and team-specific variables were shared,
+  //     then unnecessary extra syncs would be needed)
+  // a thread-specific variable is better regarding two issues above (next
+  // construct and extra syncs)
+  // a thread-specific "th_local.reduction_method" variable is used currently
+  // each thread executes 'determine' and 'set' lines (no need to execute by one
+  // thread, to avoid unnecessary extra syncs)
+
+  packed_reduction_method = __kmp_determine_reduction_method(
+      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
+  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
+
+  if (packed_reduction_method == critical_reduce_block) {
+
+    // the lock taken here is released by __kmpc_end_reduce_nowait()
+    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
+    retval = 1;
+
+  } else if (packed_reduction_method == empty_reduce_block) {
+
+    // usage: if team size == 1, no synchronization is required ( Intel
+    // platforms only )
+    retval = 1;
+
+  } else if (packed_reduction_method == atomic_reduce_block) {
+
+    retval = 2;
+
+    // all threads should do this pop here (because __kmpc_end_reduce_nowait()
+    // won't be called by the code gen)
+    //     (it's not quite good, because the checking block has been closed by
+    //     this 'pop',
+    //      but atomic operation has not been executed yet, will be executed
+    //      slightly later, literally on next instruction)
+    if (__kmp_env_consistency_check)
+      __kmp_pop_sync(global_tid, ct_reduce, loc);
+
+  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
+                                   tree_reduce_block)) {
+
+// AT: performance issue: a real barrier here
+// AT:     (if master goes slow, other threads are blocked here waiting for the
+// master to come and release them)
+// AT:     (it's not what a customer might expect specifying NOWAIT clause)
+// AT:     (specifying NOWAIT won't result in improvement of performance, it'll
+// be confusing to a customer)
+// AT: another implementation of *barrier_gather*nowait() (or some other design)
+// might go faster and be more in line with sense of NOWAIT
+// AT: TO DO: do epcc test and compare times
+
+// this barrier should be invisible to a customer and to the threading profile
+// tool (it's neither a terminating barrier nor customer's code, it's
+// used for an internal purpose)
+#if OMPT_SUPPORT
+    // JP: can this barrier potentially lead to task scheduling?
+    // JP: as long as there is a barrier in the implementation, OMPT should and
+    // will provide the barrier events
+    //         so we set-up the necessary frame/return addresses.
+    ompt_frame_t *ompt_frame;
+    if (ompt_enabled.enabled) {
+      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+      if (ompt_frame->enter_frame.ptr == NULL)
+        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+      OMPT_STORE_RETURN_ADDRESS(global_tid);
+    }
+#endif
+#if USE_ITT_NOTIFY
+    __kmp_threads[global_tid]->th.th_ident = loc;
+#endif
+    // the reduction data and callback are handed to the barrier itself
+    retval =
+        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
+                      global_tid, FALSE, reduce_size, reduce_data, reduce_func);
+    // translate the barrier result: zero becomes 1, anything else becomes 0
+    retval = (retval != 0) ? (0) : (1);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.enabled) {
+      ompt_frame->enter_frame = ompt_data_none;
+    }
+#endif
+
+    // all other workers except master should do this pop here
+    //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
+    if (__kmp_env_consistency_check) {
+      if (retval == 0) {
+        __kmp_pop_sync(global_tid, ct_reduce, loc);
+      }
+    }
+
+  } else {
+
+    // should never reach this block
+    KMP_ASSERT(0); // "unexpected method"
+  }
+  // undo the temporary move to the parent team, if one was made above
+  if (teams_swapped) {
+    __kmp_restore_swapped_teams(th, team, task_state);
+  }
+  KA_TRACE(
+      10,
+      ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
+       global_tid, packed_reduction_method, retval));
+
+  return retval;
+}
+
+/*!
+@ingroup SYNCHRONIZATION
+@param loc source location information
+@param global_tid global thread id.
+@param lck pointer to the unique lock data structure
+
+Finish the execution of a reduce nowait.
+*/
+void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
+                              kmp_critical_name *lck) {
+  KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
+
+  // Reduction method recorded for this thread by __kmpc_reduce_nowait().
+  PACKED_REDUCTION_METHOD_T packed_reduction_method =
+      __KMP_GET_REDUCTION_METHOD(global_tid);
+
+  if (packed_reduction_method == critical_reduce_block) {
+    // The critical-section method is the only one with work left to do:
+    // release the lock taken on entry.
+    __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
+  } else {
+    // Methods that need no action here:
+    //  - empty:  team size == 1, no synchronization is required ( on Intel
+    //            platforms only )
+    //  - atomic: neither master nor other workers should get here (code gen
+    //            does not generate this call for the atomic reduce block)
+    //  - tree:   only master gets here
+    bool known_method =
+        packed_reduction_method == empty_reduce_block ||
+        packed_reduction_method == atomic_reduce_block ||
+        TEST_REDUCTION_METHOD(packed_reduction_method, tree_reduce_block);
+    if (!known_method) {
+      // should never reach this block
+      KMP_ASSERT(0); // "unexpected method"
+    }
+  }
+
+  // Close the reduce block opened in the consistency checker.
+  if (__kmp_env_consistency_check)
+    __kmp_pop_sync(global_tid, ct_reduce, loc);
+
+  KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
+                global_tid, packed_reduction_method));
+}
+
+/* 2.a.ii. Reduce Block with a terminating barrier */
+
+/*!
+@ingroup SYNCHRONIZATION
+@param loc source location information
+@param global_tid global thread number
+@param num_vars number of items (variables) to be reduced
+@param reduce_size size of data in bytes to be reduced
+@param reduce_data pointer to data to be reduced
+@param reduce_func callback function providing reduction operation on two
+operands and returning result of reduction in lhs_data
+@param lck pointer to the unique lock data structure
+@result 1 for the master thread, 0 for all other team threads, 2 for all team
+threads if atomic reduction needed. With the critical-section and empty
+reduction methods every calling thread receives 1.
+
+A blocking reduce that includes an implicit barrier.
+*/
+kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
+                        size_t reduce_size, void *reduce_data,
+                        void (*reduce_func)(void *lhs_data, void *rhs_data),
+                        kmp_critical_name *lck) {
+  KMP_COUNT_BLOCK(REDUCE_wait);
+  int retval = 0;
+  PACKED_REDUCTION_METHOD_T packed_reduction_method;
+  kmp_info_t *th;
+  kmp_team_t *team;
+  // task_state is only written and read when teams_swapped becomes non-zero
+  int teams_swapped = 0, task_state;
+
+  KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
+
+  // why do we need this initialization here at all?
+  // Reduction clause can not be a stand-alone directive.
+
+  // do not call __kmp_serial_initialize(), it will be called by
+  // __kmp_parallel_initialize() if needed
+  // possible detection of false-positive race by the threadchecker ???
+  if (!TCR_4(__kmp_init_parallel))
+    __kmp_parallel_initialize();
+
+  __kmp_resume_if_soft_paused();
+
+// check correctness of reduce block nesting
+#if KMP_USE_DYNAMIC_LOCK
+  if (__kmp_env_consistency_check)
+    __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
+#else
+  if (__kmp_env_consistency_check)
+    __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
+#endif
+
+  // For a reduction at the teams construct the thread is temporarily moved to
+  // its parent team; this is undone right before returning.
+  th = __kmp_thread_from_gtid(global_tid);
+  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
+
+  // Pick the reduction method and remember it per thread so that
+  // __kmpc_end_reduce() can read it back.
+  packed_reduction_method = __kmp_determine_reduction_method(
+      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
+  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
+
+  if (packed_reduction_method == critical_reduce_block) {
+
+    // the lock taken here is released by __kmpc_end_reduce()
+    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
+    retval = 1;
+
+  } else if (packed_reduction_method == empty_reduce_block) {
+
+    // usage: if team size == 1, no synchronization is required ( Intel
+    // platforms only )
+    retval = 1;
+
+  } else if (packed_reduction_method == atomic_reduce_block) {
+
+    retval = 2;
+
+  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
+                                   tree_reduce_block)) {
+
+// case tree_reduce_block:
+// this barrier should be visible to a customer and to the threading profile
+// tool (it's a terminating barrier on constructs if NOWAIT not specified)
+#if OMPT_SUPPORT
+    ompt_frame_t *ompt_frame;
+    if (ompt_enabled.enabled) {
+      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+      if (ompt_frame->enter_frame.ptr == NULL)
+        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+      OMPT_STORE_RETURN_ADDRESS(global_tid);
+    }
+#endif
+#if USE_ITT_NOTIFY
+    __kmp_threads[global_tid]->th.th_ident =
+        loc; // needed for correct notification of frames
+#endif
+    // the reduction data and callback are handed to the barrier itself
+    retval =
+        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
+                      global_tid, TRUE, reduce_size, reduce_data, reduce_func);
+    // translate the barrier result: zero becomes 1, anything else becomes 0
+    retval = (retval != 0) ? (0) : (1);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.enabled) {
+      ompt_frame->enter_frame = ompt_data_none;
+    }
+#endif
+
+    // all other workers except master should do this pop here
+    // ( none of other workers except master will enter __kmpc_end_reduce() )
+    if (__kmp_env_consistency_check) {
+      if (retval == 0) { // 0: all other workers; 1: master
+        __kmp_pop_sync(global_tid, ct_reduce, loc);
+      }
+    }
+
+  } else {
+
+    // should never reach this block
+    KMP_ASSERT(0); // "unexpected method"
+  }
+  // undo the temporary move to the parent team, if one was made above
+  if (teams_swapped) {
+    __kmp_restore_swapped_teams(th, team, task_state);
+  }
+
+  KA_TRACE(10,
+           ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
+            global_tid, packed_reduction_method, retval));
+  return retval;
+}
+
+/*!
+@ingroup SYNCHRONIZATION
+@param loc source location information
+@param global_tid global thread id.
+@param lck pointer to the unique lock data structure
+
+Finish the execution of a blocking reduce.
+The <tt>lck</tt> pointer must be the same as that used in the corresponding
+start function.
+*/
+void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
+                       kmp_critical_name *lck) {
+  KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
+
+  // For a reduction at the teams construct, work on the parent team until the
+  // end of this routine.
+  kmp_info_t *th = __kmp_thread_from_gtid(global_tid);
+  kmp_team_t *team;
+  int task_state;
+  int teams_swapped =
+      __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
+
+  // Reduction method recorded for this thread by __kmpc_reduce().
+  PACKED_REDUCTION_METHOD_T packed_reduction_method =
+      __KMP_GET_REDUCTION_METHOD(global_tid);
+
+  // this barrier should be visible to a customer and to the threading profile
+  // tool (it's a terminating barrier on constructs if NOWAIT not specified)
+
+  const bool used_critical = (packed_reduction_method == critical_reduce_block);
+
+  if (used_critical || packed_reduction_method == empty_reduce_block ||
+      packed_reduction_method == atomic_reduce_block) {
+
+    // critical: leave the critical section first, then join the barrier
+    // empty:    team size==1, no synchronization is required (Intel platforms
+    //           only), but the barrier is still executed
+    // atomic:   only the barrier is needed
+    if (used_critical)
+      __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
+
+// TODO: implicit barrier: should be exposed
+#if OMPT_SUPPORT
+    ompt_frame_t *ompt_frame;
+    if (ompt_enabled.enabled) {
+      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+      if (ompt_frame->enter_frame.ptr == NULL)
+        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+      OMPT_STORE_RETURN_ADDRESS(global_tid);
+    }
+#endif
+#if USE_ITT_NOTIFY
+    __kmp_threads[global_tid]->th.th_ident = loc;
+#endif
+    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.enabled) {
+      ompt_frame->enter_frame = ompt_data_none;
+    }
+#endif
+
+  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
+                                   tree_reduce_block)) {
+
+    // only master executes here (master releases all other workers)
+    __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
+                            global_tid);
+
+  } else {
+
+    // should never reach this block
+    KMP_ASSERT(0); // "unexpected method"
+  }
+
+  // Put the thread back into its own team if it was moved above.
+  if (teams_swapped) {
+    __kmp_restore_swapped_teams(th, team, task_state);
+  }
+
+  // Close the reduce block opened in the consistency checker.
+  if (__kmp_env_consistency_check)
+    __kmp_pop_sync(global_tid, ct_reduce, loc);
+
+  KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
+                global_tid, packed_reduction_method));
+}
+
+#undef __KMP_GET_REDUCTION_METHOD
+#undef __KMP_SET_REDUCTION_METHOD
+
+/* end of interface to fast scalable reduce routines */
+
+// Returns the id of the task the calling thread is currently executing, or 0
+// when no non-negative global thread id is available for the caller.
+kmp_uint64 __kmpc_get_taskid() {
+  kmp_int32 const caller_gtid = __kmp_get_gtid();
+  if (caller_gtid < 0)
+    return 0;
+
+  kmp_info_t *self = __kmp_thread_from_gtid(caller_gtid);
+  return self->th.th_current_task->td_task_id;
+} // __kmpc_get_taskid
+
+// Returns the id of the parent of the task the calling thread is currently
+// executing. Yields 0 when the caller has no non-negative global thread id or
+// when the current task has no parent.
+kmp_uint64 __kmpc_get_parent_taskid() {
+  kmp_int32 const caller_gtid = __kmp_get_gtid();
+  if (caller_gtid < 0)
+    return 0;
+
+  kmp_info_t *self = __kmp_thread_from_gtid(caller_gtid);
+  kmp_taskdata_t *parent = self->th.th_current_task->td_parent;
+  if (parent == NULL)
+    return 0;
+  return parent->td_task_id;
+} // __kmpc_get_parent_taskid
+
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information.
+@param gtid  global thread number.
+@param num_dims  number of associated doacross loops.
+@param dims  info on loops bounds.
+
+Initialize doacross loop information.
+The compiler is expected to pass inclusive bounds,
+e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
+
+Nothing is set up when the team is serialized. Otherwise the routine fills a
+per-thread buffer (th_doacross_info) with the loop bounds and attaches the
+thread to a shared array of iteration flags, which is allocated by whichever
+thread gets there first.
+*/
+void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
+                          const struct kmp_dim *dims) {
+  int j, idx;
+  kmp_int64 last, trace_count; // trace_count holds the total trip count
+  kmp_info_t *th = __kmp_threads[gtid];
+  kmp_team_t *team = th->th.th_team;
+  kmp_uint32 *flags;
+  kmp_disp_t *pr_buf = th->th.th_dispatch;
+  dispatch_shared_info_t *sh_buf;
+
+  KA_TRACE(
+      20,
+      ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
+       gtid, num_dims, !team->t.t_serialized));
+  KMP_DEBUG_ASSERT(dims != NULL);
+  KMP_DEBUG_ASSERT(num_dims > 0);
+
+  if (team->t.t_serialized) {
+    KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
+    return; // no dependencies if team is serialized
+  }
+  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
+  idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
+  // the next loop
+  sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
+
+  // Save bounds info into allocated private buffer
+  // Layout of th_doacross_info (kmp_int64 elements):
+  //   [0]              number of dimensions
+  //   [1]              address of the shared doacross_num_done counter
+  //   [2], [3], [4]    lo, up, st of dims[0]
+  //   [4*j+1 .. 4*j+4] range length, lo, up, st of dims[j], for j >= 1
+  // which is 4 * num_dims + 1 elements in total.
+  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
+  pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
+      th, sizeof(kmp_int64) * (4 * num_dims + 1));
+  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
+  pr_buf->th_doacross_info[0] =
+      (kmp_int64)num_dims; // first element is number of dimensions
+  // Save also address of num_done in order to access it later without knowing
+  // the buffer index
+  pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
+  pr_buf->th_doacross_info[2] = dims[0].lo;
+  pr_buf->th_doacross_info[3] = dims[0].up;
+  pr_buf->th_doacross_info[4] = dims[0].st;
+  last = 5;
+  for (j = 1; j < num_dims; ++j) {
+    kmp_int64
+        range_length; // To keep ranges of all dimensions but the first dims[0]
+    if (dims[j].st == 1) { // most common case
+      // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
+      range_length = dims[j].up - dims[j].lo + 1;
+    } else {
+      if (dims[j].st > 0) {
+        KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
+        range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
+      } else { // negative increment
+        KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
+        range_length =
+            (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
+      }
+    }
+    pr_buf->th_doacross_info[last++] = range_length;
+    pr_buf->th_doacross_info[last++] = dims[j].lo;
+    pr_buf->th_doacross_info[last++] = dims[j].up;
+    pr_buf->th_doacross_info[last++] = dims[j].st;
+  }
+
+  // Compute total trip count.
+  // Start with range of dims[0] which we don't need to keep in the buffer.
+  if (dims[0].st == 1) { // most common case
+    trace_count = dims[0].up - dims[0].lo + 1;
+  } else if (dims[0].st > 0) {
+    KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
+    trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
+  } else { // negative increment
+    KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
+    trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
+  }
+  for (j = 1; j < num_dims; ++j) {
+    trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
+  }
+  KMP_DEBUG_ASSERT(trace_count > 0);
+
+  // Check if shared buffer is not occupied by other loop (idx -
+  // __kmp_dispatch_num_buffers)
+  if (idx != sh_buf->doacross_buf_idx) {
+    // Shared buffer is occupied, wait for it to be free
+    __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
+                 __kmp_eq_4, NULL);
+  }
+#if KMP_32_BIT_ARCH
+  // Check if we are the first thread. After the CAS the first thread gets 0,
+  // others get 1 if initialization is in progress, allocated pointer otherwise.
+  // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
+  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
+      (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
+#else
+  // Same handshake as the 32-bit variant, using a 64-bit compare-and-store.
+  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
+      (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
+#endif
+  if (flags == NULL) {
+    // we are the first thread, allocate the array of flags
+    size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
+    // NOTE(review): relies on __kmp_thread_calloc returning zeroed memory so
+    // that every iteration starts out as "not posted" - confirm
+    flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
+    KMP_MB();
+    // publishing the pointer replaces the placeholder value 1 stored by the CAS
+    sh_buf->doacross_flags = flags;
+  } else if (flags == (kmp_uint32 *)1) {
+#if KMP_32_BIT_ARCH
+    // initialization is still in progress, need to wait
+    while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
+#else
+    // initialization is still in progress, need to wait
+    while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
+#endif
+      KMP_YIELD(TRUE);
+    KMP_MB();
+  } else {
+    // the flags array had already been published when the CAS ran
+    KMP_MB();
+  }
+  KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
+  pr_buf->th_doacross_flags =
+      sh_buf->doacross_flags; // save private copy in order to not
+  // touch shared buffer on each iteration
+  KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
+}
+
+/*!
+@param loc  source location information (not used by this routine).
+@param gtid  global thread number.
+@param vec  iteration vector to wait for, one entry per doacross dimension.
+
+Wait until the iteration identified by <tt>vec</tt> has been posted.
+Returns immediately if the team is serialized or if any component of
+<tt>vec</tt> lies outside the bounds recorded by __kmpc_doacross_init().
+Otherwise the vector is converted to the sequential iteration number of the
+collapsed loop nest and the routine yields until the corresponding bit in the
+flags array is set.
+*/
+void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
+  kmp_int32 shft, num_dims, i;
+  kmp_uint32 flag;
+  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
+  kmp_info_t *th = __kmp_threads[gtid];
+  kmp_team_t *team = th->th.th_team;
+  kmp_disp_t *pr_buf;
+  kmp_int64 lo, up, st;
+
+  KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
+  if (team->t.t_serialized) {
+    KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
+    return; // no dependencies if team is serialized
+  }
+
+  // calculate sequential iteration number and check out-of-bounds condition
+  pr_buf = th->th.th_dispatch;
+  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
+  num_dims = pr_buf->th_doacross_info[0];
+  // bounds of the first dimension live at fixed slots 2..4
+  lo = pr_buf->th_doacross_info[2];
+  up = pr_buf->th_doacross_info[3];
+  st = pr_buf->th_doacross_info[4];
+  if (st == 1) { // most common case
+    if (vec[0] < lo || vec[0] > up) {
+      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
+                    "bounds [%lld,%lld]\n",
+                    gtid, vec[0], lo, up));
+      return;
+    }
+    iter_number = vec[0] - lo;
+  } else if (st > 0) {
+    if (vec[0] < lo || vec[0] > up) {
+      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
+                    "bounds [%lld,%lld]\n",
+                    gtid, vec[0], lo, up));
+      return;
+    }
+    iter_number = (kmp_uint64)(vec[0] - lo) / st;
+  } else { // negative increment
+    if (vec[0] > lo || vec[0] < up) {
+      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
+                    "bounds [%lld,%lld]\n",
+                    gtid, vec[0], lo, up));
+      return;
+    }
+    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
+  }
+  // remaining dimensions: slots 4*i+1 .. 4*i+4 hold range length, lo, up, st
+  for (i = 1; i < num_dims; ++i) {
+    kmp_int64 iter, ln;
+    kmp_int32 j = i * 4;
+    ln = pr_buf->th_doacross_info[j + 1];
+    lo = pr_buf->th_doacross_info[j + 2];
+    up = pr_buf->th_doacross_info[j + 3];
+    st = pr_buf->th_doacross_info[j + 4];
+    if (st == 1) {
+      if (vec[i] < lo || vec[i] > up) {
+        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
+                      "bounds [%lld,%lld]\n",
+                      gtid, vec[i], lo, up));
+        return;
+      }
+      iter = vec[i] - lo;
+    } else if (st > 0) {
+      if (vec[i] < lo || vec[i] > up) {
+        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
+                      "bounds [%lld,%lld]\n",
+                      gtid, vec[i], lo, up));
+        return;
+      }
+      iter = (kmp_uint64)(vec[i] - lo) / st;
+    } else { // st < 0
+      if (vec[i] > lo || vec[i] < up) {
+        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
+                      "bounds [%lld,%lld]\n",
+                      gtid, vec[i], lo, up));
+        return;
+      }
+      iter = (kmp_uint64)(lo - vec[i]) / (-st);
+    }
+    // fold this dimension into the linearized iteration number
+    iter_number = iter + ln * iter_number;
+  }
+  shft = iter_number % 32; // use 32-bit granularity
+  iter_number >>= 5; // divided by 32
+  // Build the mask from an unsigned one: shft can be 31, and shifting a
+  // signed 1 into the sign bit is not portable.
+  flag = 1U << shft;
+  // yield until the bit of the awaited iteration has been set
+  while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
+    KMP_YIELD(TRUE);
+  }
+  KMP_MB();
+  KA_TRACE(20,
+           ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
+            gtid, (iter_number << 5) + shft));
+}
+
+/*!
+@param loc  source location information (not used by this routine).
+@param gtid  global thread number.
+@param vec  iteration vector to post, one entry per doacross dimension.
+
+Mark the iteration identified by <tt>vec</tt> as completed by setting its bit
+in the flags array. Does nothing if the team is serialized. Unlike
+__kmpc_doacross_wait(), no out-of-bounds checks are made on <tt>vec</tt>.
+*/
+void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
+  kmp_int32 shft, num_dims, i;
+  kmp_uint32 flag;
+  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
+  kmp_info_t *th = __kmp_threads[gtid];
+  kmp_team_t *team = th->th.th_team;
+  kmp_disp_t *pr_buf;
+  kmp_int64 lo, st;
+
+  KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
+  if (team->t.t_serialized) {
+    KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
+    return; // no dependencies if team is serialized
+  }
+
+  // calculate sequential iteration number (same as in "wait" but no
+  // out-of-bounds checks)
+  pr_buf = th->th.th_dispatch;
+  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
+  num_dims = pr_buf->th_doacross_info[0];
+  // lower bound and stride of the first dimension live at slots 2 and 4
+  lo = pr_buf->th_doacross_info[2];
+  st = pr_buf->th_doacross_info[4];
+  if (st == 1) { // most common case
+    iter_number = vec[0] - lo;
+  } else if (st > 0) {
+    iter_number = (kmp_uint64)(vec[0] - lo) / st;
+  } else { // negative increment
+    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
+  }
+  // remaining dimensions: slot 4*i+1 is the range length, 4*i+2 the lower
+  // bound and 4*i+4 the stride
+  for (i = 1; i < num_dims; ++i) {
+    kmp_int64 iter, ln;
+    kmp_int32 j = i * 4;
+    ln = pr_buf->th_doacross_info[j + 1];
+    lo = pr_buf->th_doacross_info[j + 2];
+    st = pr_buf->th_doacross_info[j + 4];
+    if (st == 1) {
+      iter = vec[i] - lo;
+    } else if (st > 0) {
+      iter = (kmp_uint64)(vec[i] - lo) / st;
+    } else { // st < 0
+      iter = (kmp_uint64)(lo - vec[i]) / (-st);
+    }
+    // fold this dimension into the linearized iteration number
+    iter_number = iter + ln * iter_number;
+  }
+  shft = iter_number % 32; // use 32-bit granularity
+  iter_number >>= 5; // divided by 32
+  // Build the mask from an unsigned one: shft can be 31, and shifting a
+  // signed 1 into the sign bit is not portable.
+  flag = 1U << shft;
+  KMP_MB();
+  // skip the atomic OR when the bit is already set
+  if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
+    KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
+  KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
+                (iter_number << 5) + shft));
+}
+
+/*!
+@param loc  source location information (not used by this routine).
+@param gtid  global thread number.
+
+Finish a doacross loop for the calling thread. Does nothing if the team is
+serialized. Otherwise the shared "done" counter is incremented; the thread
+that brings it to the team size frees the shared flags array and releases the
+shared buffer for re-use. Every thread frees its private bounds buffer.
+*/
+void __kmpc_doacross_fini(ident_t *loc, int gtid) {
+  kmp_int32 num_done;
+  kmp_info_t *th = __kmp_threads[gtid];
+  kmp_team_t *team = th->th.th_team;
+  kmp_disp_t *pr_buf = th->th.th_dispatch;
+
+  KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
+  if (team->t.t_serialized) {
+    KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
+    return; // nothing to do
+  }
+  // slot 1 of the private buffer holds the address of the shared counter
+  // (stored there by __kmpc_doacross_init)
+  num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
+  if (num_done == th->th.th_team_nproc) {
+    // we are the last thread, need to free shared resources
+    // th_doacross_buf_idx was post-incremented at init, hence the "- 1"
+    int idx = pr_buf->th_doacross_buf_idx - 1;
+    dispatch_shared_info_t *sh_buf =
+        &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
+    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
+                     (kmp_int64)&sh_buf->doacross_num_done);
+    KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
+    KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
+    __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
+    // reset the shared state before advancing doacross_buf_idx, which is the
+    // value __kmpc_doacross_init waits on before re-using the buffer
+    sh_buf->doacross_flags = NULL;
+    sh_buf->doacross_num_done = 0;
+    sh_buf->doacross_buf_idx +=
+        __kmp_dispatch_num_buffers; // free buffer for future re-use
+  }
+  // free private resources (need to keep buffer index forever)
+  pr_buf->th_doacross_flags = NULL;
+  __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
+  pr_buf->th_doacross_info = NULL;
+  KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
+}
+
+/* omp_alloc/omp_free only defined for C/C++, not for Fortran */
+// Allocates size bytes from the given allocator on behalf of the calling
+// thread by forwarding to __kmpc_alloc.
+void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
+  const int caller_gtid = __kmp_entry_gtid();
+  void *block = __kmpc_alloc(caller_gtid, size, allocator);
+  return block;
+}
+
+// Returns ptr to the given allocator on behalf of the calling thread by
+// forwarding to __kmpc_free.
+void omp_free(void *ptr, omp_allocator_handle_t allocator) {
+  const int caller_gtid = __kmp_entry_gtid();
+  __kmpc_free(caller_gtid, ptr, allocator);
+}
+
+// Returns the current __kmp_target_offload setting, running the serial
+// initialization first if it has not happened yet.
+int __kmpc_get_target_offload(void) {
+  if (!__kmp_init_serial)
+    __kmp_serial_initialize();
+
+  return __kmp_target_offload;
+}
+
+// Forwards a pause request to __kmp_pause_resource and returns its result.
+// Returns 1 without forwarding when the runtime is not initialized, because
+// there is nothing to pause in that case.
+int __kmpc_pause_resource(kmp_pause_status_t level) {
+  return __kmp_init_serial ? __kmp_pause_resource(level) : 1;
+}
diff --git a/final/runtime/src/kmp_debug.cpp b/final/runtime/src/kmp_debug.cpp
new file mode 100644
index 0000000..6c397c5
--- /dev/null
+++ b/final/runtime/src/kmp_debug.cpp
@@ -0,0 +1,131 @@
+/*
+ * kmp_debug.cpp -- debug utilities for the Guide library
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+#include "kmp_debug.h" /* really necessary? */
+#include "kmp_i18n.h"
+#include "kmp_io.h"
+
+#ifdef KMP_DEBUG
+// printf-style helper: formats the variadic arguments and passes them to
+// __kmp_vprintf() with the kmp_out stream.
+void __kmp_debug_printf_stdout(char const *format, ...) {
+  va_list args;
+
+  va_start(args, format);
+  __kmp_vprintf(kmp_out, format, args);
+  va_end(args);
+}
+#endif
+
+// printf-style helper: formats the variadic arguments and passes them to
+// __kmp_vprintf() with the kmp_err stream.
+void __kmp_debug_printf(char const *format, ...) {
+  va_list args;
+
+  va_start(args, format);
+  __kmp_vprintf(kmp_err, format, args);
+  va_end(args);
+}
+
+#ifdef KMP_USE_ASSERT
+// Report a failed assertion and hand it to __kmp_fatal().
+//   msg  - text describing the failed assertion; printed only when KMP_DEBUG
+//          is defined.
+//   file - source file of the assertion, or NULL when unknown; any leading
+//          directory components are dropped before it is reported.
+//   line - source line of the assertion.
+// Returns 0 (reached only if __kmp_fatal() returns).
+int __kmp_debug_assert(char const *msg, char const *file, int line) {
+
+  if (file == NULL) {
+    file = KMP_I18N_STR(UnknownFile);
+  } else {
+    // Remove directories from path, leave only file name. File name is enough,
+    // there is no need in bothering developers and customers with full paths.
+    char const *slash = strrchr(file, '/');
+    if (slash != NULL) {
+      file = slash + 1;
+    }
+#if KMP_OS_WINDOWS
+    // Windows paths may use backslashes (alone or mixed with forward slashes),
+    // so strip whatever directory part is still left after the pass above.
+    char const *backslash = strrchr(file, '\\');
+    if (backslash != NULL) {
+      file = backslash + 1;
+    }
+#endif // KMP_OS_WINDOWS
+  }
+
+#ifdef KMP_DEBUG
+  // Debug builds print the message under the stdio lock before the optional
+  // break / stall / fault hooks selected at build time.
+  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
+  __kmp_debug_printf("Assertion failure at %s(%d): %s.\n", file, line, msg);
+  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
+#ifdef USE_ASSERT_BREAK
+#if KMP_OS_WINDOWS
+  DebugBreak();
+#endif
+#endif // USE_ASSERT_BREAK
+#ifdef USE_ASSERT_STALL
+  /*    __kmp_infinite_loop(); */
+  for (;;)
+    ;
+#endif // USE_ASSERT_STALL
+#ifdef USE_ASSERT_SEG
+  {
+    // Deliberate write through a null pointer.
+    int volatile *ZERO = (int *)0;
+    ++(*ZERO);
+  }
+#endif // USE_ASSERT_SEG
+#endif
+
+  __kmp_fatal(KMP_MSG(AssertionFailure, file, line), KMP_HNT(SubmitBugReport),
+              __kmp_msg_null);
+
+  return 0;
+
+} // __kmp_debug_assert
+
+#endif // KMP_USE_ASSERT
+
+/* Dump debugging buffer to stderr */
+void __kmp_dump_debug_buffer(void) {
+  // Nothing to dump when no debug buffer exists.
+  if (__kmp_debug_buffer == NULL)
+    return;
+
+  int n;
+  int const count = __kmp_debug_count;
+  char *const buf_end =
+      &__kmp_debug_buffer[__kmp_debug_buf_lines * __kmp_debug_buf_chars];
+  // The buffer is walked as a ring of fixed-size entries, starting at the
+  // entry selected by the current count and wrapping at buf_end.
+  char *entry = &__kmp_debug_buffer[(count % __kmp_debug_buf_lines) *
+                                    __kmp_debug_buf_chars];
+
+  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
+  __kmp_printf_no_lock("\nStart dump of debugging buffer (entry=%d):\n",
+                       count % __kmp_debug_buf_lines);
+
+  for (n = 0; n < __kmp_debug_buf_lines; n++) {
+
+    if (*entry != '\0') {
+      char *const last = entry + __kmp_debug_buf_chars - 1;
+      char *p = entry + 1;
+
+      /* Make sure the text ends in a newline: find the terminator and, if the
+       * character before it is not '\n', insert one and re-terminate. */
+      while (p < last) {
+        if (*p == '\0') {
+          if (p[-1] != '\n') {
+            p[0] = '\n';
+            p[1] = '\0';
+          }
+          break;
+        }
+        p++;
+      }
+      /* Terminator found only in the very last slot: there is no room to
+       * insert, so overwrite the preceding character with the newline. */
+      if (p == last && *p == '\0' && p[-1] != '\n') {
+        p[-1] = '\n';
+      }
+
+      __kmp_printf_no_lock("%4d: %.*s", n, __kmp_debug_buf_chars, entry);
+      *entry = '\0'; /* only let it print once! */
+    }
+
+    entry += __kmp_debug_buf_chars;
+    if (entry >= buf_end)
+      entry = __kmp_debug_buffer;
+  }
+
+  __kmp_printf_no_lock("End dump of debugging buffer (entry=%d).\n\n",
+                       (count + n - 1) % __kmp_debug_buf_lines);
+  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
+}
diff --git a/final/runtime/src/kmp_debug.h b/final/runtime/src/kmp_debug.h
new file mode 100644
index 0000000..08d52cc
--- /dev/null
+++ b/final/runtime/src/kmp_debug.h
@@ -0,0 +1,179 @@
+/*
+ * kmp_debug.h -- debug / assertion code for Assure library
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_DEBUG_H
+#define KMP_DEBUG_H
+
+#include <stdarg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// -----------------------------------------------------------------------------
+// Build-time assertion.
+
+// C++11 build-time check: static_assert fails compilation when expr is false.
+#define KMP_BUILD_ASSERT(expr) static_assert(expr, "Build condition error")
+
+// -----------------------------------------------------------------------------
+// Run-time assertions.
+
+extern void __kmp_dump_debug_buffer(void);
+// NOTE: when enabled, the macros below expand to a bare if-block (no do/while).
+#ifdef KMP_USE_ASSERT // assertions on: failures call __kmp_debug_assert()
+extern int __kmp_debug_assert(char const *expr, char const *file, int line);
+#ifdef KMP_DEBUG // debug build: report the condition text (or msg)
+#define KMP_ASSERT(cond)                                                       \
+  if (!(cond)) {                                                               \
+    __kmp_debug_assert(#cond, __FILE__, __LINE__);                             \
+  }
+#define KMP_ASSERT2(cond, msg)                                                 \
+  if (!(cond)) {                                                               \
+    __kmp_debug_assert((msg), __FILE__, __LINE__);                             \
+  }
+#define KMP_DEBUG_ASSERT(cond) KMP_ASSERT(cond)
+#define KMP_DEBUG_ASSERT2(cond, msg) KMP_ASSERT2(cond, msg)
+#define KMP_DEBUG_USE_VAR(x) /* Nothing (it is used!) */
+#else // !KMP_DEBUG
+// Do not expose condition in release build. Use "assertion failure".
+#define KMP_ASSERT(cond)                                                       \
+  if (!(cond)) {                                                               \
+    __kmp_debug_assert("assertion failure", __FILE__, __LINE__);               \
+  }
+#define KMP_ASSERT2(cond, msg) KMP_ASSERT(cond) // msg is dropped here
+#define KMP_DEBUG_ASSERT(cond) /* Nothing */
+#define KMP_DEBUG_ASSERT2(cond, msg) /* Nothing */
+#define KMP_DEBUG_USE_VAR(x) ((void)(x)) // x is only referenced, not used
+#endif // KMP_DEBUG
+#else // !KMP_USE_ASSERT: every assertion macro expands to nothing
+#define KMP_ASSERT(cond) /* Nothing */
+#define KMP_ASSERT2(cond, msg) /* Nothing */
+#define KMP_DEBUG_ASSERT(cond) /* Nothing */
+#define KMP_DEBUG_ASSERT2(cond, msg) /* Nothing */
+#define KMP_DEBUG_USE_VAR(x) ((void)(x))
+#endif // KMP_USE_ASSERT
+
+#ifdef KMP_DEBUG
+extern void __kmp_debug_printf_stdout(char const *format, ...);
+#endif
+extern void __kmp_debug_printf(char const *format, ...);
+
+#ifdef KMP_DEBUG
+
+extern int kmp_a_debug;
+extern int kmp_b_debug;
+extern int kmp_c_debug;
+extern int kmp_d_debug;
+extern int kmp_e_debug;
+extern int kmp_f_debug;
+extern int kmp_diag;
+// K[A-F]_TRACE(d, x): if kmp_[a-f]_debug >= d, call __kmp_debug_printf x.
+#define KA_TRACE(d, x)                                                         \
+  if (kmp_a_debug >= d) {                                                      \
+    __kmp_debug_printf x;                                                      \
+  }
+#define KB_TRACE(d, x)                                                         \
+  if (kmp_b_debug >= d) {                                                      \
+    __kmp_debug_printf x;                                                      \
+  }
+#define KC_TRACE(d, x)                                                         \
+  if (kmp_c_debug >= d) {                                                      \
+    __kmp_debug_printf x;                                                      \
+  }
+#define KD_TRACE(d, x)                                                         \
+  if (kmp_d_debug >= d) {                                                      \
+    __kmp_debug_printf x;                                                      \
+  }
+#define KE_TRACE(d, x)                                                         \
+  if (kmp_e_debug >= d) {                                                      \
+    __kmp_debug_printf x;                                                      \
+  }
+#define KF_TRACE(d, x)                                                         \
+  if (kmp_f_debug >= d) {                                                      \
+    __kmp_debug_printf x;                                                      \
+  }
+#define K_DIAG(d, x)                                                           \
+  {                                                                            \
+    if (kmp_diag == d) {                                                       \
+      __kmp_debug_printf_stdout x;                                             \
+    }                                                                          \
+  } // K_DIAG: exact match on kmp_diag; uses __kmp_debug_printf_stdout
+// K[A-F]_DUMP(d, x): if kmp_[a-f]_debug >= d, run (x) in __kmp_disable/enable.
+#define KA_DUMP(d, x)                                                          \
+  if (kmp_a_debug >= d) {                                                      \
+    int ks;                                                                    \
+    __kmp_disable(&ks);                                                        \
+    (x);                                                                       \
+    __kmp_enable(ks);                                                          \
+  }
+#define KB_DUMP(d, x)                                                          \
+  if (kmp_b_debug >= d) {                                                      \
+    int ks;                                                                    \
+    __kmp_disable(&ks);                                                        \
+    (x);                                                                       \
+    __kmp_enable(ks);                                                          \
+  }
+#define KC_DUMP(d, x)                                                          \
+  if (kmp_c_debug >= d) {                                                      \
+    int ks;                                                                    \
+    __kmp_disable(&ks);                                                        \
+    (x);                                                                       \
+    __kmp_enable(ks);                                                          \
+  }
+#define KD_DUMP(d, x)                                                          \
+  if (kmp_d_debug >= d) {                                                      \
+    int ks;                                                                    \
+    __kmp_disable(&ks);                                                        \
+    (x);                                                                       \
+    __kmp_enable(ks);                                                          \
+  }
+#define KE_DUMP(d, x)                                                          \
+  if (kmp_e_debug >= d) {                                                      \
+    int ks;                                                                    \
+    __kmp_disable(&ks);                                                        \
+    (x);                                                                       \
+    __kmp_enable(ks);                                                          \
+  }
+#define KF_DUMP(d, x)                                                          \
+  if (kmp_f_debug >= d) {                                                      \
+    int ks;                                                                    \
+    __kmp_disable(&ks);                                                        \
+    (x);                                                                       \
+    __kmp_enable(ks);                                                          \
+  }
+
+#else // !KMP_DEBUG
+// Non-debug build: trace/dump macros expand to nothing (K_DIAG to an empty {}).
+#define KA_TRACE(d, x) /* nothing to do */
+#define KB_TRACE(d, x) /* nothing to do */
+#define KC_TRACE(d, x) /* nothing to do */
+#define KD_TRACE(d, x) /* nothing to do */
+#define KE_TRACE(d, x) /* nothing to do */
+#define KF_TRACE(d, x) /* nothing to do */
+#define K_DIAG(d, x)                                                           \
+  {} /* nothing to do */
+
+#define KA_DUMP(d, x) /* nothing to do */
+#define KB_DUMP(d, x) /* nothing to do */
+#define KC_DUMP(d, x) /* nothing to do */
+#define KD_DUMP(d, x) /* nothing to do */
+#define KE_DUMP(d, x) /* nothing to do */
+#define KF_DUMP(d, x) /* nothing to do */
+
+#endif // KMP_DEBUG
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif /* KMP_DEBUG_H */
diff --git a/final/runtime/src/kmp_debugger.cpp b/final/runtime/src/kmp_debugger.cpp
new file mode 100644
index 0000000..490300f
--- /dev/null
+++ b/final/runtime/src/kmp_debugger.cpp
@@ -0,0 +1,286 @@
+#include "kmp_config.h"
+
+#if USE_DEBUGGER
+/*
+ * kmp_debugger.cpp -- debugger support.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+#include "kmp_lock.h"
+#include "kmp_omp.h"
+#include "kmp_str.h"
+
+// NOTE: All variable names are known to the debugger, do not change!
+
+#ifdef __cplusplus
+extern "C" {
+extern kmp_omp_struct_info_t __kmp_omp_debug_struct_info;
+} // extern "C"
+#endif // __cplusplus
+
+int __kmp_debugging = FALSE; // Boolean whether currently debugging OpenMP RTL.
+// Braced initializer {offset, size} describing `field` inside `structure`.
+#define offset_and_size_of(structure, field)                                   \
+  { offsetof(structure, field), sizeof(((structure *)NULL)->field) }
+// Placeholder {-1, -1} for an entry whose offset and size are not available.
+#define offset_and_size_not_available                                          \
+  { -1, -1 }
+// Braced initializer {address, size} describing the variable `var`.
+#define addr_and_size_of(var)                                                  \
+  { (kmp_uint64)(&var), sizeof(var) }
+// 1024-element kmp_int32 buffer; element 0 is preset to its size in bytes.
+#define nthr_buffer_size 1024
+static kmp_int32 kmp_omp_nthr_info_buffer[nthr_buffer_size] = {
+    nthr_buffer_size * sizeof(kmp_int32)};
+// Function-name strings; their addresses and sizes are published further down.
+/* TODO: Check punctuation for various platforms here */
+static char func_microtask[] = "__kmp_invoke_microtask";
+static char func_fork[] = "__kmpc_fork_call";
+static char func_fork_teams[] = "__kmpc_fork_teams";
+
+// Various info about runtime structures: addresses, field offsets, sizes, etc.
+kmp_omp_struct_info_t __kmp_omp_debug_struct_info = {
+
+    /* Change this only if you make a fundamental data structure change here */
+    KMP_OMP_VERSION,
+
+    /* sanity check.  Only should be checked if versions are identical
+     * This is also used for backward compatibility to get the runtime
+     * structure size if the runtime is older than the interface */
+    sizeof(kmp_omp_struct_info_t),
+
+    /* OpenMP RTL version info. */
+    addr_and_size_of(__kmp_version_major),
+    addr_and_size_of(__kmp_version_minor),
+    addr_and_size_of(__kmp_version_build),
+    addr_and_size_of(__kmp_openmp_version),
+    {(kmp_uint64)(__kmp_copyright) + KMP_VERSION_MAGIC_LEN,
+     0}, // Skip magic prefix.
+
+    /* Various globals. */
+    addr_and_size_of(__kmp_threads),
+    addr_and_size_of(__kmp_root),
+    addr_and_size_of(__kmp_threads_capacity),
+#if KMP_USE_MONITOR
+    addr_and_size_of(__kmp_monitor),
+#endif
+#if !KMP_USE_DYNAMIC_LOCK
+    addr_and_size_of(__kmp_user_lock_table),
+#endif
+    addr_and_size_of(func_microtask),
+    addr_and_size_of(func_fork),
+    addr_and_size_of(func_fork_teams),
+    addr_and_size_of(__kmp_team_counter),
+    addr_and_size_of(__kmp_task_counter),
+    addr_and_size_of(kmp_omp_nthr_info_buffer),
+    sizeof(void *), // pointer size in this build of the runtime
+    OMP_LOCK_T_SIZE < sizeof(void *), // 1 if OMP_LOCK_T_SIZE < pointer size
+    bs_last_barrier,
+    INITIAL_TASK_DEQUE_SIZE,
+
+    // thread structure information
+    sizeof(kmp_base_info_t),
+    offset_and_size_of(kmp_base_info_t, th_info),
+    offset_and_size_of(kmp_base_info_t, th_team),
+    offset_and_size_of(kmp_base_info_t, th_root),
+    offset_and_size_of(kmp_base_info_t, th_serial_team),
+    offset_and_size_of(kmp_base_info_t, th_ident),
+    offset_and_size_of(kmp_base_info_t, th_spin_here),
+    offset_and_size_of(kmp_base_info_t, th_next_waiting),
+    offset_and_size_of(kmp_base_info_t, th_task_team),
+    offset_and_size_of(kmp_base_info_t, th_current_task),
+    offset_and_size_of(kmp_base_info_t, th_task_state),
+    offset_and_size_of(kmp_base_info_t, th_bar),
+    offset_and_size_of(kmp_bstate_t, b_worker_arrived),
+
+    // teams information
+    offset_and_size_of(kmp_base_info_t, th_teams_microtask),
+    offset_and_size_of(kmp_base_info_t, th_teams_level),
+    offset_and_size_of(kmp_teams_size_t, nteams),
+    offset_and_size_of(kmp_teams_size_t, nth),
+
+    // kmp_desc structure (for info field above)
+    sizeof(kmp_desc_base_t),
+    offset_and_size_of(kmp_desc_base_t, ds_tid),
+    offset_and_size_of(kmp_desc_base_t, ds_gtid),
+// On Windows* OS, ds_thread contains a thread /handle/, which is not usable,
+// while thread /id/ is in ds_thread_id.
+#if KMP_OS_WINDOWS
+    offset_and_size_of(kmp_desc_base_t, ds_thread_id),
+#else
+    offset_and_size_of(kmp_desc_base_t, ds_thread),
+#endif
+
+    // team structure information
+    sizeof(kmp_base_team_t),
+    offset_and_size_of(kmp_base_team_t, t_master_tid),
+    offset_and_size_of(kmp_base_team_t, t_ident),
+    offset_and_size_of(kmp_base_team_t, t_parent),
+    offset_and_size_of(kmp_base_team_t, t_nproc),
+    offset_and_size_of(kmp_base_team_t, t_threads),
+    offset_and_size_of(kmp_base_team_t, t_serialized),
+    offset_and_size_of(kmp_base_team_t, t_id),
+    offset_and_size_of(kmp_base_team_t, t_pkfn),
+    offset_and_size_of(kmp_base_team_t, t_task_team),
+    offset_and_size_of(kmp_base_team_t, t_implicit_task_taskdata),
+    offset_and_size_of(kmp_base_team_t, t_cancel_request),
+    offset_and_size_of(kmp_base_team_t, t_bar),
+    offset_and_size_of(kmp_balign_team_t, b_master_arrived),
+    offset_and_size_of(kmp_balign_team_t, b_team_arrived),
+
+    // root structure information
+    sizeof(kmp_base_root_t),
+    offset_and_size_of(kmp_base_root_t, r_root_team),
+    offset_and_size_of(kmp_base_root_t, r_hot_team),
+    offset_and_size_of(kmp_base_root_t, r_uber_thread),
+    offset_and_size_not_available, // {-1, -1}: no data for this slot
+
+    // ident structure information
+    sizeof(ident_t),
+    offset_and_size_of(ident_t, psource),
+    offset_and_size_of(ident_t, flags),
+
+    // lock structure information
+    sizeof(kmp_base_queuing_lock_t),
+    offset_and_size_of(kmp_base_queuing_lock_t, initialized),
+    offset_and_size_of(kmp_base_queuing_lock_t, location),
+    offset_and_size_of(kmp_base_queuing_lock_t, tail_id),
+    offset_and_size_of(kmp_base_queuing_lock_t, head_id),
+    offset_and_size_of(kmp_base_queuing_lock_t, next_ticket),
+    offset_and_size_of(kmp_base_queuing_lock_t, now_serving),
+    offset_and_size_of(kmp_base_queuing_lock_t, owner_id),
+    offset_and_size_of(kmp_base_queuing_lock_t, depth_locked),
+    offset_and_size_of(kmp_base_queuing_lock_t, flags),
+
+#if !KMP_USE_DYNAMIC_LOCK
+    /* Lock table. */
+    sizeof(kmp_lock_table_t),
+    offset_and_size_of(kmp_lock_table_t, used),
+    offset_and_size_of(kmp_lock_table_t, allocated),
+    offset_and_size_of(kmp_lock_table_t, table),
+#endif
+
+    // Task team structure information.
+    sizeof(kmp_base_task_team_t),
+    offset_and_size_of(kmp_base_task_team_t, tt_threads_data),
+    offset_and_size_of(kmp_base_task_team_t, tt_found_tasks),
+    offset_and_size_of(kmp_base_task_team_t, tt_nproc),
+    offset_and_size_of(kmp_base_task_team_t, tt_unfinished_threads),
+    offset_and_size_of(kmp_base_task_team_t, tt_active),
+
+    // task_data_t.
+    sizeof(kmp_taskdata_t),
+    offset_and_size_of(kmp_taskdata_t, td_task_id),
+    offset_and_size_of(kmp_taskdata_t, td_flags),
+    offset_and_size_of(kmp_taskdata_t, td_team),
+    offset_and_size_of(kmp_taskdata_t, td_parent),
+    offset_and_size_of(kmp_taskdata_t, td_level),
+    offset_and_size_of(kmp_taskdata_t, td_ident),
+    offset_and_size_of(kmp_taskdata_t, td_allocated_child_tasks),
+    offset_and_size_of(kmp_taskdata_t, td_incomplete_child_tasks),
+
+    offset_and_size_of(kmp_taskdata_t, td_taskwait_ident),
+    offset_and_size_of(kmp_taskdata_t, td_taskwait_counter),
+    offset_and_size_of(kmp_taskdata_t, td_taskwait_thread),
+
+    offset_and_size_of(kmp_taskdata_t, td_taskgroup),
+    offset_and_size_of(kmp_taskgroup_t, count),
+    offset_and_size_of(kmp_taskgroup_t, cancel_request),
+
+    offset_and_size_of(kmp_taskdata_t, td_depnode),
+    offset_and_size_of(kmp_depnode_list_t, node),
+    offset_and_size_of(kmp_depnode_list_t, next),
+    offset_and_size_of(kmp_base_depnode_t, successors),
+    offset_and_size_of(kmp_base_depnode_t, task),
+    offset_and_size_of(kmp_base_depnode_t, npredecessors),
+    offset_and_size_of(kmp_base_depnode_t, nrefs),
+    offset_and_size_of(kmp_task_t, routine),
+
+    // thread_data_t.
+    sizeof(kmp_thread_data_t),
+    offset_and_size_of(kmp_base_thread_data_t, td_deque),
+    offset_and_size_of(kmp_base_thread_data_t, td_deque_size),
+    offset_and_size_of(kmp_base_thread_data_t, td_deque_head),
+    offset_and_size_of(kmp_base_thread_data_t, td_deque_tail),
+    offset_and_size_of(kmp_base_thread_data_t, td_deque_ntasks),
+    offset_and_size_of(kmp_base_thread_data_t, td_deque_last_stolen),
+
+    // The last field.
+    KMP_OMP_VERSION,
+
+}; // __kmp_omp_debug_struct_info
+
+#undef offset_and_size_of
+#undef addr_and_size_of
+
+/* Intel compiler on IA-32 architecture issues a warning "conversion
+  from "unsigned long long" to "char *" may lose significant bits"
+  when 64-bit value is assigned to 32-bit pointer. Use this function
+  to suppress the warning. */
+static inline void *__kmp_convert_to_ptr(kmp_uint64 addr) {
+  void *ptr;
+#if KMP_COMPILER_ICC
+#pragma warning(push)
+// 810: conversion from "unsigned long long" to "char *" may lose significant
+// bits
+#pragma warning(disable : 810)
+// 1195: conversion from integer to smaller pointer
+#pragma warning(disable : 1195)
+#endif // KMP_COMPILER_ICC
+  // The cast is the only statement that needs the warnings silenced.
+  ptr = (void *)addr;
+#if KMP_COMPILER_ICC
+#pragma warning(pop)
+#endif // KMP_COMPILER_ICC
+  return ptr;
+} // __kmp_convert_to_ptr
+
+// Decide whether source location `loc` is covered by `item`: the file name
+// must match, the function must match (or be unrestricted), and the line must
+// fall inside the item's range. Returns nonzero on a match, 0 otherwise.
+static int kmp_location_match(kmp_str_loc_t *loc, kmp_omp_nthr_item_t *item) {
+
+  char *item_file = (char *)__kmp_convert_to_ptr(item->file);
+  char *item_func = (char *)__kmp_convert_to_ptr(item->func);
+
+  int same_file = __kmp_str_fname_match(&loc->fname, item_file);
+
+  // If item->func is NULL, or the name is "*", any func name is allowed.
+  int same_func = 1;
+  if (item->func != 0 && strcmp(item_func, "*") != 0) {
+    same_func = (loc->func != NULL && strcmp(loc->func, item_func) == 0);
+  }
+
+  // if item->end <= 0, it means "end of file", i.e. no upper bound.
+  int in_range = 0;
+  if (item->begin <= loc->line) {
+    in_range = (item->end <= 0 || loc->line <= item->end);
+  }
+
+  return (same_file && same_func && in_range);
+
+} // kmp_location_match
+
+// Scan the item array published through __kmp_omp_debug_struct_info.nthr_info
+// for entries matching the source location in ident->psource. Returns the
+// num_threads of the last matching item, or 0 when the array is empty or
+// nothing matches.
+int __kmp_omp_num_threads(ident_t const *ident) {
+
+  kmp_omp_nthr_info_t *info = (kmp_omp_nthr_info_t *)__kmp_convert_to_ptr(
+      __kmp_omp_debug_struct_info.nthr_info.addr);
+
+  // No items recorded: nothing to look up.
+  if (info->num <= 0 || info->array == 0)
+    return 0;
+
+  int result = 0;
+  kmp_omp_nthr_item_t *items =
+      (kmp_omp_nthr_item_t *)__kmp_convert_to_ptr(info->array);
+  kmp_str_loc_t loc = __kmp_str_loc_init(ident->psource, 1);
+
+  // Deliberately no early exit: a later matching item overrides an earlier one.
+  for (int idx = 0; idx < info->num; ++idx) {
+    if (kmp_location_match(&loc, &items[idx]))
+      result = items[idx].num_threads;
+  }
+  __kmp_str_loc_free(&loc);
+
+  return result;
+
+} // __kmp_omp_num_threads
+#endif /* USE_DEBUGGER */
diff --git a/final/runtime/src/kmp_debugger.h b/final/runtime/src/kmp_debugger.h
new file mode 100644
index 0000000..7ec7428
--- /dev/null
+++ b/final/runtime/src/kmp_debugger.h
@@ -0,0 +1,48 @@
+#if USE_DEBUGGER
+/*
+ * kmp_debugger.h -- debugger support.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_DEBUGGER_H
+#define KMP_DEBUGGER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/* This external variable can be set by any debugger to flag to the runtime
+   that we are currently executing inside a debugger.  This will allow the
+   debugger to override the number of threads spawned in a parallel region by
+   using __kmp_omp_num_threads() (below).
+   * When __kmp_debugging is TRUE, each team and each task gets a unique integer
+   identifier that can be used by debugger to conveniently identify teams and
+   tasks.
+   * The debugger has access to __kmp_omp_debug_struct_info which contains
+   information about the OpenMP library's important internal structures.  This
+   access will allow the debugger to read detailed information from the typical
+   OpenMP constructs (teams, threads, tasking, etc.) during a debugging
+   session and offer detailed and useful information which the user can probe
+   about the OpenMP portion of their code. */
+extern int __kmp_debugging; /* Boolean whether currently debugging OpenMP RTL */
+// Return number of threads specified by the debugger for given parallel region.
+/* The ident field, which represents a source file location, is used to check if
+   the debugger has changed the number of threads for the parallel region at
+   source file location ident.  This way, specific parallel regions' number of
+   threads can be changed at the debugger's request. */
+int __kmp_omp_num_threads(ident_t const *ident);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // KMP_DEBUGGER_H
+
+#endif // USE_DEBUGGER
diff --git a/final/runtime/src/kmp_dispatch.cpp b/final/runtime/src/kmp_dispatch.cpp
new file mode 100644
index 0000000..161a2c6
--- /dev/null
+++ b/final/runtime/src/kmp_dispatch.cpp
@@ -0,0 +1,2619 @@
+/*
+ * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+/* Dynamic scheduling initialization and dispatch.
+ *
+ * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
+ *       it may change values between parallel regions.  __kmp_max_nth
+ *       is the largest value __kmp_nth may take, 1 is the smallest.
+ */
+
+#include "kmp.h"
+#include "kmp_error.h"
+#include "kmp_i18n.h"
+#include "kmp_itt.h"
+#include "kmp_stats.h"
+#include "kmp_str.h"
+#if KMP_USE_X87CONTROL
+#include <float.h>
+#endif
+#include "kmp_lock.h"
+#include "kmp_dispatch.h"
+#if KMP_USE_HIER_SCHED
+#include "kmp_dispatch_hier.h"
+#endif
+
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
+  // Entry hook installed by __kmp_dispatch_init for loops whose ordered flag
+  // is clear.  It only does consistency-check bookkeeping; cid_ref is unused.
+  KMP_DEBUG_ASSERT(gtid_ref);
+  if (!__kmp_env_consistency_check)
+    return;
+  kmp_info_t *thr = __kmp_threads[*gtid_ref];
+  if (!thr->th.th_root->r.r_active)
+    return;
+  if (thr->th.th_dispatch->th_dispatch_pr_current->pushed_ws == ct_none)
+    return;
+#if KMP_USE_DYNAMIC_LOCK
+  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
+#else
+  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
+#endif
+}
+
+void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
+  // Exit hook paired with __kmp_dispatch_deo_error; cid_ref is not used.
+  if (!__kmp_env_consistency_check)
+    return;
+  kmp_info_t *thr = __kmp_threads[*gtid_ref];
+  // Pop the ct_ordered_in_pdo record only when pushed_ws is not ct_none.
+  if (thr->th.th_dispatch->th_dispatch_pr_current->pushed_ws == ct_none)
+    return;
+  __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
+}
+
+// Classify a scheduling type as monotonic or nonmonotonic.
+// Returns SCHEDULE_NONMONOTONIC when SCHEDULE_HAS_NONMONOTONIC(schedule)
+// holds and SCHEDULE_MONOTONIC otherwise (explicit monotonic modifier or no
+// modifier).  use_hier is accepted for the callers' benefit but is not
+// consulted by this implementation.
+static inline int __kmp_get_monotonicity(enum sched_type schedule,
+                                         bool use_hier = false) {
+  // Monotonic is the fallback, so only the nonmonotonic test is needed;
+  // SCHEDULE_HAS_MONOTONIC would select the same value as the fallback.
+  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
+    return SCHEDULE_NONMONOTONIC;
+  return SCHEDULE_MONOTONIC;
+}
+
+// Fill in the private dispatch buffer pr for a loop described by lb (lower
+// bound), ub (upper bound) and st (stride), scheduled as (schedule, chunk).
+// The schedule is resolved to a concrete algorithm, the trip count is computed
+// and per-algorithm state (pr->u.p.parm1..parm4, count) is set; the resolved
+// schedule is stored in pr->schedule.  nproc is the number of threads taking
+// part in the scheduling and tid (0 .. nproc - 1) is the caller's id among
+// them; these are often, but with hierarchical scheduling not necessarily, the
+// team size and the thread id within the team.  loc is the source location of
+// the loop and gtid the global thread id.  In USE_ITT_BUILD builds, *cur_chunk
+// (when non-NULL) may receive the chunk size used for loop metadata reports.
+template <typename T>
+void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
+                                   dispatch_private_info_template<T> *pr,
+                                   enum sched_type schedule, T lb, T ub,
+                                   typename traits_t<T>::signed_t st,
+#if USE_ITT_BUILD
+                                   kmp_uint64 *cur_chunk,
+#endif
+                                   typename traits_t<T>::signed_t chunk,
+                                   T nproc, T tid) {
+  typedef typename traits_t<T>::unsigned_t UT;
+  typedef typename traits_t<T>::floating_t DBL;
+
+  int active;
+  T tc;
+  kmp_info_t *th;
+  kmp_team_t *team;
+  int monotonicity;
+  bool use_hier;
+
+#ifdef KMP_DEBUG
+  typedef typename traits_t<T>::signed_t ST;
+  {
+    char *buff;
+    // create format specifiers before the debug output
+    buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
+                            "pr:%%p lb:%%%s ub:%%%s st:%%%s "
+                            "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
+                            traits_t<T>::spec, traits_t<T>::spec,
+                            traits_t<ST>::spec, traits_t<ST>::spec,
+                            traits_t<T>::spec, traits_t<T>::spec);
+    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
+    __kmp_str_free(&buff);
+  }
+#endif
+  /* setup data */
+  th = __kmp_threads[gtid];
+  team = th->th.th_team;
+  active = !team->t.t_serialized;
+
+#if USE_ITT_BUILD
+  int itt_need_metadata_reporting =
+      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
+      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
+      team->t.t_active_level == 1;
+#endif
+
+#if KMP_USE_HIER_SCHED
+  use_hier = pr->flags.use_hier;
+#else
+  use_hier = false;
+#endif
+
+  /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
+  monotonicity = __kmp_get_monotonicity(schedule, use_hier);
+  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
+
+  /* Pick up the nomerge/ordered bits from the scheduling type */
+  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
+    pr->flags.nomerge = TRUE;
+    schedule =
+        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
+  } else {
+    pr->flags.nomerge = FALSE;
+  }
+  pr->type_size = traits_t<T>::type_size; // remember the size of variables
+  if (kmp_ord_lower & schedule) {
+    pr->flags.ordered = TRUE;
+    schedule =
+        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
+  } else {
+    pr->flags.ordered = FALSE;
+  }
+  // Ordered overrides nonmonotonic
+  if (pr->flags.ordered) {
+    monotonicity = SCHEDULE_MONOTONIC;
+  }
+
+  if (schedule == kmp_sch_static) {
+    schedule = __kmp_static;
+  } else {
+    if (schedule == kmp_sch_runtime) {
+      // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
+      // not specified)
+      schedule = team->t.t_sched.r_sched_type;
+      monotonicity = __kmp_get_monotonicity(schedule, use_hier);
+      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
+      // Detail the schedule if needed (global controls are differentiated
+      // appropriately)
+      if (schedule == kmp_sch_guided_chunked) {
+        schedule = __kmp_guided;
+      } else if (schedule == kmp_sch_static) {
+        schedule = __kmp_static;
+      }
+      // Use the chunk size specified by OMP_SCHEDULE (or default if not
+      // specified)
+      chunk = team->t.t_sched.chunk;
+#if USE_ITT_BUILD
+      if (cur_chunk)
+        *cur_chunk = chunk;
+#endif
+#ifdef KMP_DEBUG
+      {
+        char *buff;
+        // create format specifiers before the debug output
+        buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
+                                "schedule:%%d chunk:%%%s\n",
+                                traits_t<ST>::spec);
+        KD_TRACE(10, (buff, gtid, schedule, chunk));
+        __kmp_str_free(&buff);
+      }
+#endif
+    } else {
+      if (schedule == kmp_sch_guided_chunked) {
+        schedule = __kmp_guided;
+      }
+      if (chunk <= 0) {
+        chunk = KMP_DEFAULT_CHUNK;
+      }
+    }
+
+    if (schedule == kmp_sch_auto) {
+      // __kmp_auto is mapped/differentiated in __kmp_do_serial_initialize()
+      schedule = __kmp_auto;
+#ifdef KMP_DEBUG
+      {
+        char *buff;
+        // create format specifiers before the debug output
+        buff = __kmp_str_format(
+            "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
+            "schedule:%%d chunk:%%%s\n",
+            traits_t<ST>::spec);
+        KD_TRACE(10, (buff, gtid, schedule, chunk));
+        __kmp_str_free(&buff);
+      }
+#endif
+    }
+#if KMP_STATIC_STEAL_ENABLED
+    // map nonmonotonic:dynamic to static steal
+    if (schedule == kmp_sch_dynamic_chunked) {
+      if (monotonicity == SCHEDULE_NONMONOTONIC)
+        schedule = kmp_sch_static_steal;
+    }
+#endif
+    /* guided analytical not safe for too many threads */
+    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
+      schedule = kmp_sch_guided_iterative_chunked;
+      KMP_WARNING(DispatchManyThreads);
+    }
+    if (schedule == kmp_sch_runtime_simd) {
+      // compiler provides simd_width in the chunk parameter
+      schedule = team->t.t_sched.r_sched_type;
+      monotonicity = __kmp_get_monotonicity(schedule, use_hier);
+      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
+      // Detail the schedule if needed (global controls are differentiated
+      // appropriately)
+      if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
+          schedule == __kmp_static) {
+        schedule = kmp_sch_static_balanced_chunked;
+      } else {
+        if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
+          schedule = kmp_sch_guided_simd;
+        }
+        chunk = team->t.t_sched.chunk * chunk;
+      }
+#if USE_ITT_BUILD
+      if (cur_chunk)
+        *cur_chunk = chunk;
+#endif
+#ifdef KMP_DEBUG
+      {
+        char *buff;
+        // create format specifiers before the debug output
+        buff = __kmp_str_format(
+            "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
+            " chunk:%%%s\n",
+            traits_t<ST>::spec);
+        KD_TRACE(10, (buff, gtid, schedule, chunk));
+        __kmp_str_free(&buff);
+      }
+#endif
+    }
+    pr->u.p.parm1 = chunk;
+  }
+  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
+              "unknown scheduling type");
+
+  pr->u.p.count = 0;
+
+  if (__kmp_env_consistency_check) {
+    if (st == 0) {
+      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
+                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
+    }
+  }
+  // compute trip count
+  if (st == 1) { // most common case
+    if (ub >= lb) {
+      tc = ub - lb + 1;
+    } else { // ub < lb
+      tc = 0; // zero-trip
+    }
+  } else if (st < 0) {
+    if (lb >= ub) {
+      // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
+      // where the division needs to be unsigned regardless of the result type
+      tc = (UT)(lb - ub) / (-st) + 1;
+    } else { // lb < ub
+      tc = 0; // zero-trip
+    }
+  } else { // st > 0
+    if (ub >= lb) {
+      // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
+      // where the division needs to be unsigned regardless of the result type
+      tc = (UT)(ub - lb) / st + 1;
+    } else { // ub < lb
+      tc = 0; // zero-trip
+    }
+  }
+
+#if KMP_STATS_ENABLED
+  if (KMP_MASTER_GTID(gtid)) {
+    KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
+  }
+#endif
+
+  pr->u.p.lb = lb;
+  pr->u.p.ub = ub;
+  pr->u.p.st = st;
+  pr->u.p.tc = tc;
+
+#if KMP_OS_WINDOWS
+  pr->u.p.last_upper = ub + st;
+#endif /* KMP_OS_WINDOWS */
+
+  /* NOTE: only the active parallel region(s) has active ordered sections */
+
+  if (active) {
+    if (pr->flags.ordered) {
+      pr->ordered_bumped = 0;
+      pr->u.p.ordered_lower = 1;
+      pr->u.p.ordered_upper = 0;
+    }
+  }
+
+  switch (schedule) {
+#if (KMP_STATIC_STEAL_ENABLED)
+  case kmp_sch_static_steal: {
+    T ntc, init;
+
+    KD_TRACE(100,
+             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
+              gtid));
+
+    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
+    if (nproc > 1 && ntc >= nproc) {
+      KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
+      T id = tid;
+      T small_chunk, extras;
+
+      small_chunk = ntc / nproc;
+      extras = ntc % nproc;
+
+      init = id * small_chunk + (id < extras ? id : extras);
+      pr->u.p.count = init;
+      pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
+
+      pr->u.p.parm2 = lb;
+      // parm3 is the number of times to attempt stealing which is
+      // proportional to the number of chunks per thread up until
+      // the maximum value of nproc.
+      pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
+      pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
+      pr->u.p.st = st;
+      if (traits_t<T>::type_size > 4) {
+        // AC: TODO: check if 16-byte CAS available and use it to
+        // improve performance (probably wait for explicit request
+        // before spending time on this).
+        // For now use dynamically allocated per-thread lock,
+        // free memory in __kmp_dispatch_next when status==0.
+        KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
+        th->th.th_dispatch->th_steal_lock =
+            (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
+        __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
+      }
+      break;
+    } else {
+      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
+                     "kmp_sch_static_balanced\n",
+                     gtid));
+      schedule = kmp_sch_static_balanced;
+      /* too few iterations: fall-through to kmp_sch_static_balanced */
+    } // if
+    /* FALL-THROUGH to static balanced */
+    KMP_FALLTHROUGH();
+  } // case
+#endif
+  case kmp_sch_static_balanced: {
+    T init, limit;
+
+    KD_TRACE(
+        100,
+        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
+         gtid));
+
+    if (nproc > 1) {
+      T id = tid;
+
+      if (tc < nproc) {
+        if (id < tc) {
+          init = id;
+          limit = id;
+          pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
+        } else {
+          pr->u.p.count = 1; /* means no more chunks to execute */
+          pr->u.p.parm1 = FALSE;
+          break;
+        }
+      } else {
+        T small_chunk = tc / nproc;
+        T extras = tc % nproc;
+        init = id * small_chunk + (id < extras ? id : extras);
+        limit = init + small_chunk - (id < extras ? 0 : 1);
+        pr->u.p.parm1 = (id == nproc - 1);
+      }
+    } else {
+      if (tc > 0) {
+        init = 0;
+        limit = tc - 1;
+        pr->u.p.parm1 = TRUE;
+      } else {
+        // zero trip count
+        pr->u.p.count = 1; /* means no more chunks to execute */
+        pr->u.p.parm1 = FALSE;
+        break;
+      }
+    }
+#if USE_ITT_BUILD
+    // Calculate chunk for metadata report
+    if (itt_need_metadata_reporting)
+      if (cur_chunk)
+        *cur_chunk = limit - init + 1;
+#endif
+    if (st == 1) {
+      pr->u.p.lb = lb + init;
+      pr->u.p.ub = lb + limit;
+    } else {
+      // calculated upper bound, "ub" is user-defined upper bound
+      T ub_tmp = lb + limit * st;
+      pr->u.p.lb = lb + init * st;
+      // adjust upper bound to "ub" if needed, so that MS lastprivate will match
+      // it exactly
+      if (st > 0) {
+        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
+      } else {
+        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
+      }
+    }
+    if (pr->flags.ordered) {
+      pr->u.p.ordered_lower = init;
+      pr->u.p.ordered_upper = limit;
+    }
+    break;
+  } // case
+  case kmp_sch_static_balanced_chunked: {
+    // like balanced, chunk rounded up to simd width (mask needs power of 2)
+    T nth = nproc;
+    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
+                   " -> falling-through to static_greedy\n",
+                   gtid));
+    schedule = kmp_sch_static_greedy;
+    if (nth > 1)
+      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
+    else
+      pr->u.p.parm1 = tc;
+    break;
+  } // case
+  case kmp_sch_guided_simd:
+  case kmp_sch_guided_iterative_chunked: {
+    KD_TRACE(
+        100,
+        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
+         " case\n",
+         gtid));
+
+    if (nproc > 1) {
+      if ((2L * chunk + 1) * nproc >= tc) {
+        /* chunk size too large, switch to dynamic */
+        schedule = kmp_sch_dynamic_chunked;
+      } else {
+        // when remaining iters become less than parm2 - switch to dynamic
+        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
+        *(double *)&pr->u.p.parm3 =
+            guided_flt_param / nproc; // may occupy parm3 and parm4
+      }
+    } else {
+      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
+                     "kmp_sch_static_greedy\n",
+                     gtid));
+      schedule = kmp_sch_static_greedy;
+      /* nproc == 1: treated as kmp_sch_static_greedy right here */
+      KD_TRACE(
+          100,
+          ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
+           gtid));
+      pr->u.p.parm1 = tc;
+    } // if
+  } // case
+  break;
+  case kmp_sch_guided_analytical_chunked: {
+    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
+                   "kmp_sch_guided_analytical_chunked case\n",
+                   gtid));
+
+    if (nproc > 1) {
+      if ((2L * chunk + 1) * nproc >= tc) {
+        /* chunk size too large, switch to dynamic */
+        schedule = kmp_sch_dynamic_chunked;
+      } else {
+        /* x will hold (2 nproc - 1)/(2 nproc); assigned below */
+        DBL x;
+
+#if KMP_USE_X87CONTROL
+        /* Linux* OS already has 64-bit computation by default for long double,
+           and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
+           Windows* OS on IA-32 architecture, we need to set precision to 64-bit
+           instead of the default 53-bit. Even though long double doesn't work
+           on Windows* OS on Intel(R) 64, the resulting lack of precision is not
+           expected to impact the correctness of the algorithm, but this has not
+           been mathematically proven. */
+        // save original FPCW and set precision to 64-bit, as
+        // Windows* OS on IA-32 architecture defaults to 53-bit
+        unsigned int oldFpcw = _control87(0, 0);
+        _control87(_PC_64, _MCW_PC); // 0,0x30000
+#endif
+        /* value used for comparison in solver for cross-over point */
+        long double target = ((long double)chunk * 2 + 1) * nproc / tc;
+
+        /* crossover point--chunk indexes equal to or greater than
+           this point switch to dynamic-style scheduling */
+        UT cross;
+
+        /* commonly used term: (2 nproc - 1)/(2 nproc) */
+        x = (long double)1.0 - (long double)0.5 / nproc;
+
+#ifdef KMP_DEBUG
+        { // test natural alignment
+          struct _test_a {
+            char a;
+            union {
+              char b;
+              DBL d;
+            };
+          } t;
+          ptrdiff_t natural_alignment =
+              (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
+          //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
+          // long)natural_alignment );
+          KMP_DEBUG_ASSERT(
+              (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
+        }
+#endif // KMP_DEBUG
+
+        /* save the term in thread private dispatch structure */
+        *(DBL *)&pr->u.p.parm3 = x;
+
+        /* solve for the crossover point to the nearest integer i for which C_i
+           <= chunk */
+        {
+          UT left, right, mid;
+          long double p;
+
+          /* estimate initial upper and lower bound */
+
+          /* doesn't matter what value right is as long as it is positive, but
+             it affects performance of the solver */
+          right = 229;
+          p = __kmp_pow<UT>(x, right);
+          if (p > target) {
+            do {
+              p *= p;
+              right <<= 1;
+            } while (p > target && right < (1 << 27));
+            /* lower bound is previous (failed) estimate of upper bound */
+            left = right >> 1;
+          } else {
+            left = 0;
+          }
+
+          /* bisection root-finding method */
+          while (left + 1 < right) {
+            mid = (left + right) / 2;
+            if (__kmp_pow<UT>(x, mid) > target) {
+              left = mid;
+            } else {
+              right = mid;
+            }
+          } // while
+          cross = right;
+        }
+        /* assert sanity of computed crossover point */
+        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
+                   __kmp_pow<UT>(x, cross) <= target);
+
+        /* save the crossover point in thread private dispatch structure */
+        pr->u.p.parm2 = cross;
+
+// C75803
+#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
+#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
+#else
+#define GUIDED_ANALYTICAL_WORKAROUND (x)
+#endif
+        /* dynamic-style scheduling offset */
+        pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
+                                 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
+                        cross * chunk;
+#if KMP_USE_X87CONTROL
+        // restore FPCW
+        _control87(oldFpcw, _MCW_PC);
+#endif
+      } // if
+    } else {
+      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
+                     "kmp_sch_static_greedy\n",
+                     gtid));
+      schedule = kmp_sch_static_greedy;
+      /* nproc == 1: treated as kmp_sch_static_greedy right here */
+      pr->u.p.parm1 = tc;
+    } // if
+  } // case
+  break;
+  case kmp_sch_static_greedy:
+    KD_TRACE(
+        100,
+        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
+         gtid));
+    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
+    break;
+  case kmp_sch_static_chunked:
+  case kmp_sch_dynamic_chunked:
+    if (pr->u.p.parm1 <= 0) {
+      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
+    }
+    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
+                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
+                   gtid));
+    break;
+  case kmp_sch_trapezoidal: {
+    /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
+
+    T parm1, parm2, parm3, parm4;
+    KD_TRACE(100,
+             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
+              gtid));
+
+    parm1 = chunk;
+
+    /* F : size of the first cycle */
+    parm2 = (tc / (2 * nproc));
+
+    if (parm2 < 1) {
+      parm2 = 1;
+    }
+
+    /* L : size of the last cycle.  Make sure the last cycle is not larger
+       than the first cycle. */
+    if (parm1 < 1) {
+      parm1 = 1;
+    } else if (parm1 > parm2) {
+      parm1 = parm2;
+    }
+
+    /* N : number of cycles */
+    parm3 = (parm2 + parm1);
+    parm3 = (2 * tc + parm3 - 1) / parm3;
+
+    if (parm3 < 2) {
+      parm3 = 2;
+    }
+
+    /* sigma : decreasing incr of the trapezoid */
+    parm4 = (parm3 - 1);
+    parm4 = (parm2 - parm1) / parm4;
+
+    // parm4 needs no clamp at zero: parm1 <= parm2 and parm2 >= 1 were
+    // enforced above, so the numerator (parm2 - parm1) is non-negative, and
+    // parm3 >= 2 makes the divisor (parm3 - 1) at least 1.  Hence parm4 >= 0
+    // always.
+
+    pr->u.p.parm1 = parm1;
+    pr->u.p.parm2 = parm2;
+    pr->u.p.parm3 = parm3;
+    pr->u.p.parm4 = parm4;
+  } // case
+  break;
+
+  default: {
+    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
+                KMP_HNT(GetNewerLibrary), // Hint
+                __kmp_msg_null // Variadic argument list terminator
+                );
+  } break;
+  } // switch
+  pr->schedule = schedule;
+}
+
+#if KMP_USE_HIER_SCHED
+// Forward a loop over type T to __kmp_dispatch_init_hierarchy using the
+// settings held in __kmp_hier_scheds.  Only the four explicit specializations
+// below are defined; the 32-bit flavors pass small_chunks.
+template <typename T>
+inline void __kmp_dispatch_init_hier_runtime(
+    ident_t *loc, T lb, T ub, typename traits_t<T>::signed_t st);
+template <>
+inline void __kmp_dispatch_init_hier_runtime<kmp_int32>(
+    ident_t *loc, kmp_int32 lb, kmp_int32 ub, kmp_int32 st) {
+  __kmp_dispatch_init_hierarchy<kmp_int32>(
+      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
+      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
+}
+template <>
+inline void __kmp_dispatch_init_hier_runtime<kmp_uint32>(
+    ident_t *loc, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st) {
+  __kmp_dispatch_init_hierarchy<kmp_uint32>(
+      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
+      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
+}
+// The 64-bit flavors pass large_chunks instead of small_chunks.
+template <>
+inline void __kmp_dispatch_init_hier_runtime<kmp_int64>(
+    ident_t *loc, kmp_int64 lb, kmp_int64 ub, kmp_int64 st) {
+  __kmp_dispatch_init_hierarchy<kmp_int64>(
+      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
+      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
+}
+template <>
+inline void __kmp_dispatch_init_hier_runtime<kmp_uint64>(
+    ident_t *loc, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st) {
+  __kmp_dispatch_init_hierarchy<kmp_uint64>(
+      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
+      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
+}
+
+// Walk the team's shared dispatch buffers and, for each one that has a
+// hierarchy attached, deallocate it and free the hierarchy object itself.
+void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
+  const int nbuf = (team->t.t_max_nproc > 1) ? __kmp_dispatch_num_buffers : 2;
+  for (int idx = 0; idx < nbuf; ++idx) {
+    // The template argument does not matter for this cast; kmp_int32 is used.
+    typedef dispatch_shared_info_template<kmp_int32> volatile shared_t;
+    shared_t *buf = reinterpret_cast<shared_t *>(&team->t.t_disp_buffer[idx]);
+    if (!buf->hier)
+      continue;
+    buf->hier->deallocate();
+    __kmp_free(buf->hier);
+  }
+}
+#endif
+
+// UT - unsigned flavor of T, ST - signed flavor of T,
+// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
+template <typename T>
+static void
+__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
+                    T ub, typename traits_t<T>::signed_t st,
+                    typename traits_t<T>::signed_t chunk, int push_ws) {
+  typedef typename traits_t<T>::unsigned_t UT;
+
+  int active;
+  kmp_info_t *th;
+  kmp_team_t *team;
+  kmp_uint32 my_buffer_index;
+  dispatch_private_info_template<T> *pr;
+  dispatch_shared_info_template<T> volatile *sh;
+
+  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
+                   sizeof(dispatch_private_info));
+  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
+                   sizeof(dispatch_shared_info));
+
+  if (!TCR_4(__kmp_init_parallel))
+    __kmp_parallel_initialize();
+
+  __kmp_resume_if_soft_paused();
+
+#if INCLUDE_SSC_MARKS
+  SSC_MARK_DISPATCH_INIT();
+#endif
+#ifdef KMP_DEBUG
+  typedef typename traits_t<T>::signed_t ST;
+  {
+    char *buff;
+    // create format specifiers before the debug output
+    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
+                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
+                            traits_t<ST>::spec, traits_t<T>::spec,
+                            traits_t<T>::spec, traits_t<ST>::spec);
+    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
+    __kmp_str_free(&buff);
+  }
+#endif
+  /* setup data */
+  th = __kmp_threads[gtid];
+  team = th->th.th_team;
+  active = !team->t.t_serialized;
+  th->th.th_ident = loc;
+
+  // Any half-decent optimizer will remove this test when the blocks are empty
+  // since the macros expand to nothing
+  // when statistics are disabled.
+  if (schedule == __kmp_static) {
+    KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
+  } else {
+    KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
+  }
+
+#if KMP_USE_HIER_SCHED
+  // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable
+  // Hierarchical scheduling does not work with ordered, so if ordered is
+  // detected, then revert back to threaded scheduling.
+  bool ordered;
+  enum sched_type my_sched = schedule;
+  my_buffer_index = th->th.th_dispatch->th_disp_index;
+  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
+      &th->th.th_dispatch
+           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
+  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
+  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
+    my_sched =
+        (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
+  ordered = (kmp_ord_lower & my_sched);
+  if (pr->flags.use_hier) {
+    if (ordered) {
+      KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
+                     "Disabling hierarchical scheduling.\n",
+                     gtid));
+      pr->flags.use_hier = FALSE;
+    }
+  }
+  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
+    // Don't use hierarchical for ordered parallel loops and don't
+    // use the runtime hierarchy if one was specified in the program
+    if (!ordered && !pr->flags.use_hier)
+      __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
+  }
+#endif // KMP_USE_HIER_SCHED
+
+#if USE_ITT_BUILD
+  kmp_uint64 cur_chunk = chunk;
+  int itt_need_metadata_reporting =
+      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
+      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
+      team->t.t_active_level == 1;
+#endif
+  if (!active) {
+    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
+        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
+  } else {
+    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
+                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
+
+    my_buffer_index = th->th.th_dispatch->th_disp_index++;
+
+    /* What happens when number of threads changes, need to resize buffer? */
+    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
+        &th->th.th_dispatch
+             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
+    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
+        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
+    KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
+                  my_buffer_index));
+  }
+
+  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
+#if USE_ITT_BUILD
+                                &cur_chunk,
+#endif
+                                chunk, (T)th->th.th_team_nproc,
+                                (T)th->th.th_info.ds.ds_tid);
+  if (active) {
+    if (pr->flags.ordered == 0) {
+      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
+      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
+    } else {
+      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
+      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
+    }
+  }
+
+  if (active) {
+    /* The name of this buffer should be my_buffer_index when it's free to use
+     * it */
+
+    KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
+                   "sh->buffer_index:%d\n",
+                   gtid, my_buffer_index, sh->buffer_index));
+    __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
+                           __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
+    // Note: KMP_WAIT() cannot be used there: buffer index and
+    // my_buffer_index are *always* 32-bit integers.
+    KMP_MB(); /* is this necessary? */
+    KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
+                   "sh->buffer_index:%d\n",
+                   gtid, my_buffer_index, sh->buffer_index));
+
+    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
+    th->th.th_dispatch->th_dispatch_sh_current =
+        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
+#if USE_ITT_BUILD
+    if (pr->flags.ordered) {
+      __kmp_itt_ordered_init(gtid);
+    }
+    // Report loop metadata
+    if (itt_need_metadata_reporting) {
+      // Only report metadata by master of active team at level 1
+      kmp_uint64 schedtype = 0;
+      switch (schedule) {
+      case kmp_sch_static_chunked:
+      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
+        break;
+      case kmp_sch_static_greedy:
+        cur_chunk = pr->u.p.parm1;
+        break;
+      case kmp_sch_dynamic_chunked:
+        schedtype = 1;
+        break;
+      case kmp_sch_guided_iterative_chunked:
+      case kmp_sch_guided_analytical_chunked:
+      case kmp_sch_guided_simd:
+        schedtype = 2;
+        break;
+      default:
+        // Should we put this case under "static"?
+        // case kmp_sch_static_steal:
+        schedtype = 3;
+        break;
+      }
+      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
+    }
+#if KMP_USE_HIER_SCHED
+    if (pr->flags.use_hier) {
+      pr->u.p.count = 0;
+      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
+    }
+#endif // KMP_USER_HIER_SCHED
+#endif /* USE_ITT_BUILD */
+  }
+
+#ifdef KMP_DEBUG
+  {
+    char *buff;
+    // create format specifiers before the debug output
+    buff = __kmp_str_format(
+        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
+        "lb:%%%s ub:%%%s"
+        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
+        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
+        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
+        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
+        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
+        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
+    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
+                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
+                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
+                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
+    __kmp_str_free(&buff);
+  }
+#endif
+#if (KMP_STATIC_STEAL_ENABLED)
+  // It cannot be guaranteed that after execution of a loop with some other
+  // schedule kind all the parm3 variables will contain the same value. Even if
+  // all parm3 will be the same, it still exists a bad case like using 0 and 1
+  // rather than program life-time increment. So the dedicated variable is
+  // required. The 'static_steal_counter' is used.
+  if (schedule == kmp_sch_static_steal) {
+    // Other threads will inspect this variable when searching for a victim.
+    // This is a flag showing that other threads may steal from this thread
+    // since then.
+    volatile T *p = &pr->u.p.static_steal_counter;
+    *p = *p + 1;
+  }
+#endif // ( KMP_STATIC_STEAL_ENABLED )
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.ompt_callback_work) {
+    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
+    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
+    ompt_callbacks.ompt_callback(ompt_callback_work)(
+        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
+        &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
+  }
+#endif
+  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
+}
+
+/* For ordered loops, either __kmp_dispatch_finish() should be called after
+ * every iteration, or __kmp_dispatch_finish_chunk() should be called after
+ * every chunk of iterations.  If the ordered section(s) were not executed
+ * for this iteration (or every iteration in this chunk), we need to set the
+ * ordered iteration counters so that the next thread can proceed. */
+template <typename UT>
+static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
+  typedef typename traits_t<UT>::signed_t ST;
+  typedef dispatch_private_info_template<UT> priv_info_t;
+  typedef dispatch_shared_info_template<UT> volatile shared_info_t;
+  kmp_info_t *thr = __kmp_threads[gtid];
+
+  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
+  // Nothing is updated when the team is serialized.
+  if (!thr->th.th_team->t.t_serialized) {
+    priv_info_t *priv = reinterpret_cast<priv_info_t *>(
+        thr->th.th_dispatch->th_dispatch_pr_current);
+    shared_info_t *shared = reinterpret_cast<shared_info_t *>(
+        thr->th.th_dispatch->th_dispatch_sh_current);
+    KMP_DEBUG_ASSERT(priv);
+    KMP_DEBUG_ASSERT(shared);
+    KMP_DEBUG_ASSERT(
+        thr->th.th_dispatch ==
+        &thr->th.th_team->t.t_dispatch[thr->th.th_info.ds.ds_tid]);
+
+    if (!priv->ordered_bumped) {
+      // ordered_bumped is not set for this iteration: call __kmp_wait() on
+      // ordered_iteration against ordered_lower, then increment the counter.
+      UT first = priv->u.p.ordered_lower;
+#ifdef KMP_DEBUG
+      {
+        // create format specifiers before the debug output
+        char *fmt = __kmp_str_format(
+            "__kmp_dispatch_finish: T#%%d before wait: "
+            "ordered_iteration:%%%s lower:%%%s\n",
+            traits_t<UT>::spec, traits_t<UT>::spec);
+        KD_TRACE(1000, (fmt, gtid, shared->u.s.ordered_iteration, first));
+        __kmp_str_free(&fmt);
+      }
+#endif
+      __kmp_wait<UT>(&shared->u.s.ordered_iteration, first,
+                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
+      KMP_MB(); /* is this necessary? */
+#ifdef KMP_DEBUG
+      {
+        // create format specifiers before the debug output
+        char *fmt = __kmp_str_format(
+            "__kmp_dispatch_finish: T#%%d after wait: "
+            "ordered_iteration:%%%s lower:%%%s\n",
+            traits_t<UT>::spec, traits_t<UT>::spec);
+        KD_TRACE(1000, (fmt, gtid, shared->u.s.ordered_iteration, first));
+        __kmp_str_free(&fmt);
+      }
+#endif
+      test_then_inc<ST>((volatile ST *)&shared->u.s.ordered_iteration);
+    } else {
+      KD_TRACE(
+          1000,
+          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
+           gtid));
+      priv->ordered_bumped = 0;
+    } // if (!ordered_bumped)
+  } // if (!t_serialized)
+  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
+}
+
+#ifdef KMP_GOMP_COMPAT
+
+// Chunk-granular counterpart of __kmp_dispatch_finish(): adds to the shared
+// ordered_iteration counter the part of [ordered_lower, ordered_upper] that
+// pr->ordered_bumped does not already account for. `loc` is unused.
+template <typename UT>
+static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
+  typedef typename traits_t<UT>::signed_t ST;
+  kmp_info_t *th = __kmp_threads[gtid];
+
+  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
+  if (!th->th.th_team->t.t_serialized) {
+    dispatch_private_info_template<UT> *pr =
+        reinterpret_cast<dispatch_private_info_template<UT> *>(
+            th->th.th_dispatch->th_dispatch_pr_current);
+    dispatch_shared_info_template<UT> volatile *sh =
+        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
+            th->th.th_dispatch->th_dispatch_sh_current);
+    KMP_DEBUG_ASSERT(pr);
+    KMP_DEBUG_ASSERT(sh);
+    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
+                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
+
+    UT lower = pr->u.p.ordered_lower;
+    UT upper = pr->u.p.ordered_upper;
+    UT inc = upper - lower + 1;
+
+    if (pr->ordered_bumped == inc) {
+      KD_TRACE(1000,
+               ("__kmp_dispatch_finish_chunk: T#%d resetting "
+                "ordered_bumped to zero\n",
+                gtid));
+      pr->ordered_bumped = 0;
+    } else {
+      inc -= pr->ordered_bumped;
+
+#ifdef KMP_DEBUG
+      {
+        char *buff;
+        // create format specifiers before the debug output
+        buff = __kmp_str_format(
+            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
+            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
+            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
+        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
+        __kmp_str_free(&buff);
+      }
+#endif
+
+      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
+                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
+
+      KMP_MB(); /* is this necessary? */
+      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
+                      "ordered_bumped to zero\n",
+                      gtid));
+      pr->ordered_bumped = 0;
+      // TODO: check whether inc should be unsigned (UT) or signed (ST).
+#ifdef KMP_DEBUG
+      {
+        char *buff;
+        // create format specifiers before the debug output
+        buff = __kmp_str_format(
+            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
+            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
+            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
+            traits_t<UT>::spec);
+        KD_TRACE(1000,
+                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
+        __kmp_str_free(&buff);
+      }
+#endif
+
+      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
+    } // if (ordered_bumped == inc)
+  } // if (!t_serialized)
+  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
+}
+
+#endif /* KMP_GOMP_COMPAT */
+
+// Computes the next chunk of iterations for thread `tid` according to
+// pr->schedule. Returns nonzero when a chunk was obtained (bounds are stored
+// in *p_lb / *p_ub, and in *p_st / *p_last when non-NULL); returns 0 otherwise.
+template <typename T>
+int __kmp_dispatch_next_algorithm(int gtid,
+                                  dispatch_private_info_template<T> *pr,
+                                  dispatch_shared_info_template<T> volatile *sh,
+                                  kmp_int32 *p_last, T *p_lb, T *p_ub,
+                                  typename traits_t<T>::signed_t *p_st, T nproc,
+                                  T tid) {
+  typedef typename traits_t<T>::unsigned_t UT;
+  typedef typename traits_t<T>::signed_t ST;
+  typedef typename traits_t<T>::floating_t DBL;
+  int status = 0;
+  kmp_int32 last = 0;
+  T start;
+  ST incr;
+  UT limit, trip, init;
+  kmp_info_t *th = __kmp_threads[gtid];
+  kmp_team_t *team = th->th.th_team;
+
+  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
+                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
+  KMP_DEBUG_ASSERT(pr);
+  KMP_DEBUG_ASSERT(sh);
+  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
+#ifdef KMP_DEBUG
+  {
+    char *buff;
+    // create format specifiers before the debug output
+    buff =
+        __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
+                         "sh:%%p nproc:%%%s tid:%%%s\n",
+                         traits_t<T>::spec, traits_t<T>::spec);
+    KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
+    __kmp_str_free(&buff);
+  }
+#endif
+
+  // zero trip count
+  if (pr->u.p.tc == 0) {
+    KD_TRACE(10,
+             ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
+              "zero status:%d\n",
+              gtid, status));
+    return 0;
+  }
+
+  switch (pr->schedule) {
+#if (KMP_STATIC_STEAL_ENABLED)
+  case kmp_sch_static_steal: {
+    T chunk = pr->u.p.parm1;
+
+    KD_TRACE(100,
+             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
+              gtid));
+
+    trip = pr->u.p.tc - 1;
+
+    if (traits_t<T>::type_size > 4) {
+      // use lock for 8-byte and CAS for 4-byte induction
+      // variable. TODO (optional): check and use 16-byte CAS
+      kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
+      KMP_DEBUG_ASSERT(lck != NULL);
+      if (pr->u.p.count < (UT)pr->u.p.ub) {
+        __kmp_acquire_lock(lck, gtid);
+        // try to get own chunk of iterations
+        init = (pr->u.p.count)++;
+        status = (init < (UT)pr->u.p.ub);
+        __kmp_release_lock(lck, gtid);
+      } else {
+        status = 0; // no own chunks
+      }
+      if (!status) { // try to steal
+        kmp_info_t **other_threads = team->t.t_threads;
+        int while_limit = pr->u.p.parm3;
+        int while_index = 0;
+        // TODO: algorithm of searching for a victim
+        // should be cleaned up and measured
+        while ((!status) && (while_limit != ++while_index)) {
+          T remaining;
+          T victimIdx = pr->u.p.parm4;
+          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
+          dispatch_private_info_template<T> *victim =
+              reinterpret_cast<dispatch_private_info_template<T> *>(
+                  other_threads[victimIdx]
+                      ->th.th_dispatch->th_dispatch_pr_current);
+          while ((victim == NULL || victim == pr ||
+                  (*(volatile T *)&victim->u.p.static_steal_counter !=
+                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
+                 oldVictimIdx != victimIdx) {
+            victimIdx = (victimIdx + 1) % nproc;
+            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
+                other_threads[victimIdx]
+                    ->th.th_dispatch->th_dispatch_pr_current);
+          }
+          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
+                          *(volatile T *)&pr->u.p.static_steal_counter)) {
+            continue; // try once more (nproc attempts in total)
+            // no victim is ready yet to participate in stealing
+            // because all victims are still in kmp_init_dispatch
+          }
+          if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
+            pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
+            continue; // not enough chunks to steal, goto next victim
+          }
+
+          lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
+          KMP_ASSERT(lck != NULL);
+          __kmp_acquire_lock(lck, gtid);
+          limit = victim->u.p.ub; // keep initial ub
+          if (victim->u.p.count >= limit ||
+              (remaining = limit - victim->u.p.count) < 2) {
+            __kmp_release_lock(lck, gtid);
+            pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
+            continue; // not enough chunks to steal
+          }
+          // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
+          // by 1
+          if (remaining > 3) {
+            // steal 1/4 of remaining
+            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
+            init = (victim->u.p.ub -= (remaining >> 2));
+          } else {
+            // steal 1 chunk of 2 or 3 remaining
+            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
+            init = (victim->u.p.ub -= 1);
+          }
+          __kmp_release_lock(lck, gtid);
+
+          KMP_DEBUG_ASSERT(init + 1 <= limit);
+          pr->u.p.parm4 = victimIdx; // remember victim to steal from
+          status = 1;
+          while_index = 0;
+          // now update own count and ub with stolen range but init chunk
+          __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
+          pr->u.p.count = init + 1;
+          pr->u.p.ub = limit;
+          __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
+        } // while (search for victim)
+      } // if (try to find victim and steal)
+    } else {
+      // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
+      typedef union {
+        struct {
+          UT count;
+          T ub;
+        } p;
+        kmp_int64 b;
+      } union_i4;
+      // All operations on 'count' and 'ub' must be done atomically together.
+      {
+        union_i4 vold, vnew;
+        vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
+        vnew = vold;
+        vnew.p.count++;
+        while (!KMP_COMPARE_AND_STORE_ACQ64(
+            (volatile kmp_int64 *)&pr->u.p.count,
+            *VOLATILE_CAST(kmp_int64 *) & vold.b,
+            *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
+          KMP_CPU_PAUSE();
+          vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
+          vnew = vold;
+          vnew.p.count++;
+        }
+        vnew = vold;
+        init = vnew.p.count;
+        status = (init < (UT)vnew.p.ub);
+      }
+
+      if (!status) {
+        kmp_info_t **other_threads = team->t.t_threads;
+        int while_limit = pr->u.p.parm3;
+        int while_index = 0;
+
+        // TODO: algorithm of searching for a victim
+        // should be cleaned up and measured
+        while ((!status) && (while_limit != ++while_index)) {
+          union_i4 vold, vnew;
+          kmp_int32 remaining;
+          T victimIdx = pr->u.p.parm4;
+          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
+          dispatch_private_info_template<T> *victim =
+              reinterpret_cast<dispatch_private_info_template<T> *>(
+                  other_threads[victimIdx]
+                      ->th.th_dispatch->th_dispatch_pr_current);
+          while ((victim == NULL || victim == pr ||
+                  (*(volatile T *)&victim->u.p.static_steal_counter !=
+                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
+                 oldVictimIdx != victimIdx) {
+            victimIdx = (victimIdx + 1) % nproc;
+            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
+                other_threads[victimIdx]
+                    ->th.th_dispatch->th_dispatch_pr_current);
+          }
+          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
+                          *(volatile T *)&pr->u.p.static_steal_counter)) {
+            continue; // try once more (nproc attempts in total)
+            // no victim is ready yet to participate in stealing
+            // because all victims are still in kmp_init_dispatch
+          }
+          pr->u.p.parm4 = victimIdx; // new victim found
+          while (1) { // CAS loop if victim has enough chunks to steal
+            vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
+            vnew = vold;
+
+            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
+            if (vnew.p.count >= (UT)vnew.p.ub ||
+                (remaining = vnew.p.ub - vnew.p.count) < 2) {
+              pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
+              break; // not enough chunks to steal, goto next victim
+            }
+            if (remaining > 3) {
+              vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
+            } else {
+              vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
+            }
+            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
+            // TODO: Should this be acquire or release?
+            if (KMP_COMPARE_AND_STORE_ACQ64(
+                    (volatile kmp_int64 *)&victim->u.p.count,
+                    *VOLATILE_CAST(kmp_int64 *) & vold.b,
+                    *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
+              // stealing succeeded
+              KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
+                                        vold.p.ub - vnew.p.ub);
+              status = 1;
+              while_index = 0;
+              // now update own count and ub
+              init = vnew.p.ub;
+              vold.p.count = init + 1;
+#if KMP_ARCH_X86
+              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
+#else
+              *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
+#endif
+              break;
+            } // if (check CAS result)
+            KMP_CPU_PAUSE(); // CAS failed, repeat attempt
+          } // while (try to steal from particular victim)
+        } // while (search for victim)
+      } // if (try to find victim and steal)
+    } // if (4-byte induction variable)
+    if (!status) {
+      *p_lb = 0;
+      *p_ub = 0;
+      if (p_st != NULL)
+        *p_st = 0;
+    } else {
+      start = pr->u.p.parm2;
+      init *= chunk;
+      limit = chunk + init - 1;
+      incr = pr->u.p.st;
+      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
+
+      KMP_DEBUG_ASSERT(init <= trip);
+      if ((last = (limit >= trip)) != 0)
+        limit = trip;
+      if (p_st != NULL)
+        *p_st = incr;
+
+      if (incr == 1) {
+        *p_lb = start + init;
+        *p_ub = start + limit;
+      } else {
+        *p_lb = start + init * incr;
+        *p_ub = start + limit * incr;
+      }
+
+      if (pr->flags.ordered) {
+        pr->u.p.ordered_lower = init;
+        pr->u.p.ordered_upper = limit;
+      } // if
+    } // if
+    break;
+  } // case
+#endif // ( KMP_STATIC_STEAL_ENABLED )
+  case kmp_sch_static_balanced: {
+    KD_TRACE(
+        10,
+        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
+         gtid));
+    /* check if thread has any iteration to do */
+    if ((status = !pr->u.p.count) != 0) {
+      pr->u.p.count = 1;
+      *p_lb = pr->u.p.lb;
+      *p_ub = pr->u.p.ub;
+      last = pr->u.p.parm1;
+      if (p_st != NULL)
+        *p_st = pr->u.p.st;
+    } else { /* no iterations to do */
+      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
+    }
+  } // case
+  break;
+  case kmp_sch_static_greedy: // original greedy code was merged here
+  case kmp_sch_static_chunked: {
+    T parm1;
+
+    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
+                   "kmp_sch_static_[affinity|chunked] case\n",
+                   gtid));
+    parm1 = pr->u.p.parm1;
+
+    trip = pr->u.p.tc - 1;
+    init = parm1 * (pr->u.p.count + tid);
+
+    if ((status = (init <= trip)) != 0) {
+      start = pr->u.p.lb;
+      incr = pr->u.p.st;
+      limit = parm1 + init - 1;
+
+      if ((last = (limit >= trip)) != 0)
+        limit = trip;
+
+      if (p_st != NULL)
+        *p_st = incr;
+
+      pr->u.p.count += nproc;
+
+      if (incr == 1) {
+        *p_lb = start + init;
+        *p_ub = start + limit;
+      } else {
+        *p_lb = start + init * incr;
+        *p_ub = start + limit * incr;
+      }
+
+      if (pr->flags.ordered) {
+        pr->u.p.ordered_lower = init;
+        pr->u.p.ordered_upper = limit;
+      } // if
+    } // if
+  } // case
+  break;
+
+  case kmp_sch_dynamic_chunked: {
+    T chunk = pr->u.p.parm1;
+
+    KD_TRACE(
+        100,
+        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
+         gtid));
+
+    init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
+    trip = pr->u.p.tc - 1;
+
+    if ((status = (init <= trip)) == 0) {
+      *p_lb = 0;
+      *p_ub = 0;
+      if (p_st != NULL)
+        *p_st = 0;
+    } else {
+      start = pr->u.p.lb;
+      limit = chunk + init - 1;
+      incr = pr->u.p.st;
+
+      if ((last = (limit >= trip)) != 0)
+        limit = trip;
+
+      if (p_st != NULL)
+        *p_st = incr;
+
+      if (incr == 1) {
+        *p_lb = start + init;
+        *p_ub = start + limit;
+      } else {
+        *p_lb = start + init * incr;
+        *p_ub = start + limit * incr;
+      }
+
+      if (pr->flags.ordered) {
+        pr->u.p.ordered_lower = init;
+        pr->u.p.ordered_upper = limit;
+      } // if
+    } // if
+  } // case
+  break;
+
+  case kmp_sch_guided_iterative_chunked: {
+    T chunkspec = pr->u.p.parm1;
+    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
+                   "iterative case\n",
+                   gtid));
+    trip = pr->u.p.tc;
+    // Start atomic part of calculations
+    while (1) {
+      ST remaining; // signed, because can be < 0
+      init = sh->u.s.iteration; // shared value
+      remaining = trip - init;
+      if (remaining <= 0) { // AC: need to compare with 0 first
+        // nothing to do, don't try atomic op
+        status = 0;
+        break;
+      }
+      if ((T)remaining <
+          pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
+        // use dynamic-style schedule
+        // atomically increment iterations, get old value
+        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
+                                 (ST)chunkspec);
+        remaining = trip - init;
+        if (remaining <= 0) {
+          status = 0; // all iterations got by other threads
+        } else {
+          // got some iterations to work on
+          status = 1;
+          if ((T)remaining > chunkspec) {
+            limit = init + chunkspec - 1;
+          } else {
+            last = 1; // the last chunk
+            limit = init + remaining - 1;
+          } // if
+        } // if
+        break;
+      } // if
+      limit = init +
+              (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
+      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
+                               (ST)init, (ST)limit)) {
+        // CAS was successful, chunk obtained
+        status = 1;
+        --limit;
+        break;
+      } // if
+    } // while
+    if (status != 0) {
+      start = pr->u.p.lb;
+      incr = pr->u.p.st;
+      if (p_st != NULL)
+        *p_st = incr;
+      *p_lb = start + init * incr;
+      *p_ub = start + limit * incr;
+      if (pr->flags.ordered) {
+        pr->u.p.ordered_lower = init;
+        pr->u.p.ordered_upper = limit;
+      } // if
+    } else {
+      *p_lb = 0;
+      *p_ub = 0;
+      if (p_st != NULL)
+        *p_st = 0;
+    } // if
+  } // case
+  break;
+
+  case kmp_sch_guided_simd: {
+    // same as iterative but curr-chunk adjusted to be multiple of given
+    // chunk
+    T chunk = pr->u.p.parm1;
+    KD_TRACE(100,
+             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
+              gtid));
+    trip = pr->u.p.tc;
+    // Start atomic part of calculations
+    while (1) {
+      ST remaining; // signed, because can be < 0
+      init = sh->u.s.iteration; // shared value
+      remaining = trip - init;
+      if (remaining <= 0) { // AC: need to compare with 0 first
+        status = 0; // nothing to do, don't try atomic op
+        break;
+      }
+      KMP_DEBUG_ASSERT(init % chunk == 0);
+      // compare with K*nproc*(chunk+1), K=2 by default
+      if ((T)remaining < pr->u.p.parm2) {
+        // use dynamic-style schedule
+        // atomically increment iterations, get old value
+        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
+                                 (ST)chunk);
+        remaining = trip - init;
+        if (remaining <= 0) {
+          status = 0; // all iterations got by other threads
+        } else {
+          // got some iterations to work on
+          status = 1;
+          if ((T)remaining > chunk) {
+            limit = init + chunk - 1;
+          } else {
+            last = 1; // the last chunk
+            limit = init + remaining - 1;
+          } // if
+        } // if
+        break;
+      } // if
+      // divide by K*nproc
+      UT span = remaining * (*(double *)&pr->u.p.parm3);
+      UT rem = span % chunk;
+      if (rem) // adjust so that span%chunk == 0
+        span += chunk - rem;
+      limit = init + span;
+      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
+                               (ST)init, (ST)limit)) {
+        // CAS was successful, chunk obtained
+        status = 1;
+        --limit;
+        break;
+      } // if
+    } // while
+    if (status != 0) {
+      start = pr->u.p.lb;
+      incr = pr->u.p.st;
+      if (p_st != NULL)
+        *p_st = incr;
+      *p_lb = start + init * incr;
+      *p_ub = start + limit * incr;
+      if (pr->flags.ordered) {
+        pr->u.p.ordered_lower = init;
+        pr->u.p.ordered_upper = limit;
+      } // if
+    } else {
+      *p_lb = 0;
+      *p_ub = 0;
+      if (p_st != NULL)
+        *p_st = 0;
+    } // if
+  } // case
+  break;
+
+  case kmp_sch_guided_analytical_chunked: {
+    T chunkspec = pr->u.p.parm1;
+    UT chunkIdx;
+#if KMP_USE_X87CONTROL
+    /* for storing original FPCW value for Windows* OS on
+       IA-32 architecture 8-byte version */
+    unsigned int oldFpcw;
+    unsigned int fpcwSet = 0;
+#endif
+    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
+                   "kmp_sch_guided_analytical_chunked case\n",
+                   gtid));
+
+    trip = pr->u.p.tc;
+
+    KMP_DEBUG_ASSERT(nproc > 1);
+    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
+
+    while (1) { /* this while loop is a safeguard against unexpected zero
+                   chunk sizes */
+      chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
+      if (chunkIdx >= (UT)pr->u.p.parm2) {
+        --trip;
+        /* use dynamic-style scheduling */
+        init = chunkIdx * chunkspec + pr->u.p.count;
+        /* need to verify init > 0 in case of overflow in the above
+         * calculation */
+        if ((status = (init > 0 && init <= trip)) != 0) {
+          limit = init + chunkspec - 1;
+
+          if ((last = (limit >= trip)) != 0)
+            limit = trip;
+        }
+        break;
+      } else {
+/* use exponential-style scheduling */
+/* The following check is to workaround the lack of long double precision on
+   Windows* OS.
+   This check works around the possible effect that init != 0 for chunkIdx == 0.
+ */
+#if KMP_USE_X87CONTROL
+        /* If we haven't already done so, save original
+           FPCW and set precision to 64-bit, as Windows* OS
+           on IA-32 architecture defaults to 53-bit */
+        if (!fpcwSet) {
+          oldFpcw = _control87(0, 0);
+          _control87(_PC_64, _MCW_PC);
+          fpcwSet = 0x30000;
+        }
+#endif
+        if (chunkIdx) {
+          init = __kmp_dispatch_guided_remaining<T>(
+              trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
+          KMP_DEBUG_ASSERT(init);
+          init = trip - init;
+        } else
+          init = 0;
+        limit = trip - __kmp_dispatch_guided_remaining<T>(
+                           trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
+        KMP_ASSERT(init <= limit);
+        if (init < limit) {
+          KMP_DEBUG_ASSERT(limit <= trip);
+          --limit;
+          status = 1;
+          break;
+        } // if
+      } // if
+    } // while (1)
+#if KMP_USE_X87CONTROL
+    /* restore FPCW if necessary
+       AC: check fpcwSet first because oldFpcw can be uninitialized here */
+    if (fpcwSet && (oldFpcw & fpcwSet))
+      _control87(oldFpcw, _MCW_PC);
+#endif
+    if (status != 0) {
+      start = pr->u.p.lb;
+      incr = pr->u.p.st;
+      if (p_st != NULL)
+        *p_st = incr;
+      *p_lb = start + init * incr;
+      *p_ub = start + limit * incr;
+      if (pr->flags.ordered) {
+        pr->u.p.ordered_lower = init;
+        pr->u.p.ordered_upper = limit;
+      }
+    } else {
+      *p_lb = 0;
+      *p_ub = 0;
+      if (p_st != NULL)
+        *p_st = 0;
+    }
+  } // case
+  break;
+
+  case kmp_sch_trapezoidal: {
+    UT index;
+    T parm2 = pr->u.p.parm2;
+    T parm3 = pr->u.p.parm3;
+    T parm4 = pr->u.p.parm4;
+    KD_TRACE(100,
+             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
+              gtid));
+
+    index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
+
+    init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
+    trip = pr->u.p.tc - 1;
+
+    if ((status = ((T)index < parm3 && init <= trip)) == 0) {
+      *p_lb = 0;
+      *p_ub = 0;
+      if (p_st != NULL)
+        *p_st = 0;
+    } else {
+      start = pr->u.p.lb;
+      limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
+      incr = pr->u.p.st;
+
+      if ((last = (limit >= trip)) != 0)
+        limit = trip;
+
+      if (p_st != NULL)
+        *p_st = incr;
+
+      if (incr == 1) {
+        *p_lb = start + init;
+        *p_ub = start + limit;
+      } else {
+        *p_lb = start + init * incr;
+        *p_ub = start + limit * incr;
+      }
+
+      if (pr->flags.ordered) {
+        pr->u.p.ordered_lower = init;
+        pr->u.p.ordered_upper = limit;
+      } // if
+    } // if
+  } // case
+  break;
+  default: {
+    status = 0; // to avoid complaints on uninitialized variable use
+    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
+                KMP_HNT(GetNewerLibrary), // Hint
+                __kmp_msg_null // Variadic argument list terminator
+                );
+  } break;
+  } // switch
+  if (p_last)
+    *p_last = last;
+#ifdef KMP_DEBUG
+  if (pr->flags.ordered) {
+    char *buff;
+    // create format specifiers before the debug output
+    buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
+                            "ordered_lower:%%%s ordered_upper:%%%s\n",
+                            traits_t<UT>::spec, traits_t<UT>::spec);
+    KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
+    __kmp_str_free(&buff);
+  }
+  {
+    char *buff;
+    // build the format first; p_last/p_st may be NULL, so never deref them raw
+    buff = __kmp_str_format(
+        "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
+        "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
+        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
+    KD_TRACE(10, (buff, gtid, status, last, *p_lb, *p_ub, p_st ? *p_st : 0));
+    __kmp_str_free(&buff);
+  }
+#endif
+  return status;
+}
+
+/* OMPT_LOOP_END: used on the exit paths of __kmp_dispatch_next(). When status
+   is 0 (no more work) and the work callback is enabled, it reports the loop's
+   ompt_scope_end, since kmp_dispatch_fini() is not called in some cases. */
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+#define OMPT_LOOP_END                                                          \
+  if (status == 0) {                                                           \
+    if (ompt_enabled.ompt_callback_work) {                                     \
+      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
+      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
+      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
+          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
+          &(task_info->task_data), 0, codeptr);                                \
+    }                                                                          \
+  }
+// TODO: implement count (the callback currently receives a literal 0)
+#else
+#define OMPT_LOOP_END // no-op
+#endif // OMPT_SUPPORT && OMPT_OPTIONAL
+
+#if KMP_STATS_ENABLED // counts the iterations in [*p_lb, *p_ub] for loop stats
+#define KMP_STATS_LOOP_END                                                     \
+  {                                                                            \
+    kmp_int64 u, l, t, i;                                                      \
+    l = (kmp_int64)(*p_lb);                                                    \
+    u = (kmp_int64)(*p_ub);                                                    \
+    i = (kmp_int64)(pr->u.p.st);                                               \
+    if (status == 0) { /* no more work: count 0 and pop the timer */           \
+      t = 0;                                                                   \
+      KMP_POP_PARTITIONED_TIMER();                                             \
+    } else if (i == 1) {                                                       \
+      if (u >= l)                                                              \
+        t = u - l + 1;                                                         \
+      else                                                                     \
+        t = 0;                                                                 \
+    } else if (i < 0) { /* negative stride: l is the larger bound */           \
+      if (l >= u)                                                              \
+        t = (l - u) / (-i) + 1;                                                \
+      else                                                                     \
+        t = 0;                                                                 \
+    } else {                                                                   \
+      if (u >= l)                                                              \
+        t = (u - l) / i + 1;                                                   \
+      else                                                                     \
+        t = 0;                                                                 \
+    }                                                                          \
+    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
+  }
+#else
+#define KMP_STATS_LOOP_END /* Nothing */
+#endif // KMP_STATS_ENABLED
+
+// Hands the calling thread its next chunk of a dynamically scheduled loop.
+// The chunk is written through p_lb/p_ub/p_st (p_last may be NULL); returns
+// nonzero while work remains and 0 once there are no more iterations.
+template <typename T>
+static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
+                               T *p_lb, T *p_ub,
+                               typename traits_t<T>::signed_t *p_st
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+                               ,
+                               void *codeptr
+#endif
+                               ) {
+  typedef typename traits_t<T>::unsigned_t UT;
+  typedef typename traits_t<T>::signed_t ST;
+  // This is potentially slightly misleading, schedule(runtime) will appear here
+  // even if the actual runtime schedule is static. (Which points out a
+  // disadvantage of schedule(runtime): even when static scheduling is used it
+  // costs more than a compile time choice to use static scheduling would.)
+  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
+
+  int status;
+  dispatch_private_info_template<T> *pr;
+  kmp_info_t *th = __kmp_threads[gtid];
+  kmp_team_t *team = th->th.th_team;
+
+  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
+  KD_TRACE(
+      1000,
+      ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
+       gtid, p_lb, p_ub, p_st, p_last));
+
+  if (team->t.t_serialized) {
+    /* NOTE: serialize this dispatch because we are not at the active level */
+    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
+        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
+    KMP_DEBUG_ASSERT(pr);
+
+    if ((status = (pr->u.p.tc != 0)) == 0) {
+      *p_lb = 0;
+      *p_ub = 0;
+      // *p_last is not written on this no-work path
+      if (p_st != NULL)
+        *p_st = 0;
+      if (__kmp_env_consistency_check) {
+        if (pr->pushed_ws != ct_none) {
+          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
+        }
+      }
+    } else if (pr->flags.nomerge) {
+      kmp_int32 last;
+      T start;
+      UT limit, trip, init;
+      ST incr;
+      T chunk = pr->u.p.parm1;
+
+      KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
+                     gtid));
+
+      init = chunk * pr->u.p.count++;
+      trip = pr->u.p.tc - 1;
+
+      if ((status = (init <= trip)) == 0) {
+        *p_lb = 0;
+        *p_ub = 0;
+        // *p_last is not written on this no-work path
+        if (p_st != NULL)
+          *p_st = 0;
+        if (__kmp_env_consistency_check) {
+          if (pr->pushed_ws != ct_none) {
+            pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
+          }
+        }
+      } else {
+        start = pr->u.p.lb;
+        limit = chunk + init - 1;
+        incr = pr->u.p.st;
+
+        if ((last = (limit >= trip)) != 0) {
+          limit = trip;
+#if KMP_OS_WINDOWS
+          pr->u.p.last_upper = pr->u.p.ub;
+#endif /* KMP_OS_WINDOWS */
+        }
+        if (p_last != NULL)
+          *p_last = last;
+        if (p_st != NULL)
+          *p_st = incr;
+        if (incr == 1) {
+          *p_lb = start + init;
+          *p_ub = start + limit;
+        } else {
+          *p_lb = start + init * incr;
+          *p_ub = start + limit * incr;
+        }
+
+        if (pr->flags.ordered) {
+          pr->u.p.ordered_lower = init;
+          pr->u.p.ordered_upper = limit;
+#ifdef KMP_DEBUG
+          {
+            char *buff;
+            // create format specifiers before the debug output
+            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
+                                    "ordered_lower:%%%s ordered_upper:%%%s\n",
+                                    traits_t<UT>::spec, traits_t<UT>::spec);
+            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
+                            pr->u.p.ordered_upper));
+            __kmp_str_free(&buff);
+          }
+#endif
+        } // if
+      } // if
+    } else {
+      pr->u.p.tc = 0;
+      *p_lb = pr->u.p.lb;
+      *p_ub = pr->u.p.ub;
+#if KMP_OS_WINDOWS
+      pr->u.p.last_upper = *p_ub;
+#endif /* KMP_OS_WINDOWS */
+      if (p_last != NULL)
+        *p_last = TRUE;
+      if (p_st != NULL)
+        *p_st = pr->u.p.st;
+    } // if
+#ifdef KMP_DEBUG
+    {
+      // create format specifiers before the debug output
+      char *buff = __kmp_str_format(
+          "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
+          "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
+          traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
+      KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
+                    (p_last ? *p_last : 0), status)); // p_last may be NULL
+      __kmp_str_free(&buff);
+    }
+#endif
+#if INCLUDE_SSC_MARKS
+    SSC_MARK_DISPATCH_NEXT();
+#endif
+    OMPT_LOOP_END;
+    KMP_STATS_LOOP_END;
+    return status;
+  } else {
+    kmp_int32 last = 0;
+    dispatch_shared_info_template<T> volatile *sh;
+
+    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
+                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
+
+    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
+        th->th.th_dispatch->th_dispatch_pr_current);
+    KMP_DEBUG_ASSERT(pr);
+    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
+        th->th.th_dispatch->th_dispatch_sh_current);
+    KMP_DEBUG_ASSERT(sh);
+
+#if KMP_USE_HIER_SCHED
+    if (pr->flags.use_hier)
+      status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
+    else
+#endif // KMP_USE_HIER_SCHED
+      status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
+                                                p_st, th->th.th_team_nproc,
+                                                th->th.th_info.ds.ds_tid);
+    // status == 0: no more iterations to execute
+    if (status == 0) {
+      UT num_done;
+
+      num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
+#ifdef KMP_DEBUG
+      {
+        char *buff;
+        // create format specifiers before the debug output
+        buff = __kmp_str_format(
+            "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
+            traits_t<UT>::spec);
+        KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
+        __kmp_str_free(&buff);
+      }
+#endif
+
+#if KMP_USE_HIER_SCHED
+      pr->flags.use_hier = FALSE;
+#endif
+      if ((ST)num_done == th->th.th_team_nproc - 1) {
+#if (KMP_STATIC_STEAL_ENABLED)
+        if (pr->schedule == kmp_sch_static_steal &&
+            traits_t<T>::type_size > 4) {
+          int i;
+          kmp_info_t **other_threads = team->t.t_threads;
+          // loop complete, safe to destroy locks used for stealing
+          for (i = 0; i < th->th.th_team_nproc; ++i) {
+            kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
+            KMP_ASSERT(lck != NULL);
+            __kmp_destroy_lock(lck);
+            __kmp_free(lck);
+            other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
+          }
+        }
+#endif
+        /* NOTE: release this buffer to be reused */
+
+        KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+        sh->u.s.num_done = 0;
+        sh->u.s.iteration = 0;
+
+        /* TODO replace with general release procedure? */
+        if (pr->flags.ordered) {
+          sh->u.s.ordered_iteration = 0;
+        }
+
+        KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+        sh->buffer_index += __kmp_dispatch_num_buffers;
+        KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
+                       gtid, sh->buffer_index));
+
+        KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+      } // if
+      if (__kmp_env_consistency_check) {
+        if (pr->pushed_ws != ct_none) {
+          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
+        }
+      }
+
+      th->th.th_dispatch->th_deo_fcn = NULL;
+      th->th.th_dispatch->th_dxo_fcn = NULL;
+      th->th.th_dispatch->th_dispatch_sh_current = NULL;
+      th->th.th_dispatch->th_dispatch_pr_current = NULL;
+    } // if (status == 0)
+#if KMP_OS_WINDOWS
+    else if (last) {
+      pr->u.p.last_upper = pr->u.p.ub;
+    }
+#endif /* KMP_OS_WINDOWS */
+    if (p_last != NULL && status != 0)
+      *p_last = last;
+  } // if
+
+#ifdef KMP_DEBUG
+  {
+    char *buff;
+    // create format specifiers before the debug output
+    buff = __kmp_str_format(
+        "__kmp_dispatch_next: T#%%d normal case: "
+        "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
+        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
+    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
+                  (p_last ? *p_last : 0), status));
+    __kmp_str_free(&buff);
+  }
+#endif
+#if INCLUDE_SSC_MARKS
+  SSC_MARK_DISPATCH_NEXT();
+#endif
+  OMPT_LOOP_END;
+  KMP_STATS_LOOP_END;
+  return status;
+}
+
+template <typename T>
+static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
+                                  kmp_int32 *plastiter, T *plower, T *pupper,
+                                  typename traits_t<T>::signed_t incr) {
+  typedef typename traits_t<T>::unsigned_t UT;
+  kmp_uint32 team_id;
+  kmp_uint32 nteams;
+  UT trip_count;
+  kmp_team_t *team;
+  kmp_info_t *th;
+  // Narrows [*plower, *pupper] in place to this team's share of the loop.
+  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
+  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
+#ifdef KMP_DEBUG
+  typedef typename traits_t<T>::signed_t ST;
+  {
+    char *buff;
+    // create format specifiers before the debug output
+    buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
+                            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
+                            traits_t<T>::spec, traits_t<T>::spec,
+                            traits_t<ST>::spec, traits_t<T>::spec);
+    KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
+    __kmp_str_free(&buff);
+  }
+#endif
+  // Optional consistency checks on the step and the direction of the bounds.
+  if (__kmp_env_consistency_check) {
+    if (incr == 0) {
+      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
+                            loc);
+    }
+    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
+      // The loop is illegal.
+      // Some zero-trip loops maintained by compiler, e.g.:
+      //   for(i=10;i<0;++i) // lower >= upper - run-time check
+      //   for(i=0;i>10;--i) // lower <= upper - run-time check
+      //   for(i=0;i>10;++i) // incr > 0       - compile-time check
+      //   for(i=10;i<0;--i) // incr < 0       - compile-time check
+      // Compiler does not check the following illegal loops:
+      //   for(i=0;i<10;i+=incr) // where incr<0
+      //   for(i=10;i>0;i-=incr) // where incr<0
+      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
+    }
+  }
+  th = __kmp_threads[gtid];
+  team = th->th.th_team;
+  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
+  nteams = th->th.th_teams_size.nteams;
+  team_id = team->t.t_master_tid;
+  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
+
+  // compute global trip count
+  if (incr == 1) {
+    trip_count = *pupper - *plower + 1;
+  } else if (incr == -1) {
+    trip_count = *plower - *pupper + 1;
+  } else if (incr > 0) {
+    // upper-lower can exceed the limit of signed type
+    trip_count = (UT)(*pupper - *plower) / incr + 1;
+  } else {
+    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
+  }
+
+  if (trip_count <= nteams) {
+    KMP_DEBUG_ASSERT(
+        __kmp_static == kmp_sch_static_greedy ||
+        __kmp_static ==
+            kmp_sch_static_balanced); // Unknown static scheduling type.
+    // only some teams get single iteration, others get nothing
+    if (team_id < trip_count) {
+      *pupper = *plower = *plower + team_id * incr;
+    } else {
+      *plower = *pupper + incr; // zero-trip loop
+    }
+    if (plastiter != NULL)
+      *plastiter = (team_id == trip_count - 1);
+  } else {
+    if (__kmp_static == kmp_sch_static_balanced) {
+      UT chunk = trip_count / nteams; // base number of iterations per team
+      UT extras = trip_count % nteams; // teams with id below this get one more
+      *plower +=
+          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
+      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
+      if (plastiter != NULL)
+        *plastiter = (team_id == nteams - 1);
+    } else {
+      T chunk_inc_count =
+          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
+      T upper = *pupper;
+      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
+      // (the assert above fires on an unknown static scheduling type)
+      *plower += team_id * chunk_inc_count;
+      *pupper = *plower + chunk_inc_count - incr;
+      // Check/correct bounds if needed (e.g. if the sums above wrapped around)
+      if (incr > 0) {
+        if (*pupper < *plower)
+          *pupper = traits_t<T>::max_value;
+        if (plastiter != NULL)
+          *plastiter = *plower <= upper && *pupper > upper - incr;
+        if (*pupper > upper)
+          *pupper = upper; // tracker C73258
+      } else {
+        if (*pupper > *plower)
+          *pupper = traits_t<T>::min_value;
+        if (plastiter != NULL)
+          *plastiter = *plower >= upper && *pupper < upper - incr;
+        if (*pupper < upper)
+          *pupper = upper; // tracker C73258
+      }
+    }
+  }
+}
+
+//-----------------------------------------------------------------------------
+// Dispatch routines
+//    Transfer call to template< type T >
+//    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
+//                         T lb, T ub, ST st, ST chunk )
+extern "C" {
+
+/*!
+@ingroup WORK_SHARING
+@{
+@param loc Source location
+@param gtid Global thread id
+@param schedule Schedule type
+@param lb  Lower bound
+@param ub  Upper bound
+@param st  Step (loop increment)
+@param chunk The chunk size to block with
+
+Prepares the runtime to start a dynamically scheduled for loop by forwarding
+the loop arguments to __kmp_dispatch_init with its final argument set to true.
+The _4, _4u, _8 and _8u variants differ only in the types of their arguments.
+*/
+
+void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
+                            enum sched_type schedule, kmp_int32 lb,
+                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
+}
+/*!
+See @ref __kmpc_dispatch_init_4 (this variant takes kmp_uint32 bounds)
+*/
+void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
+                             enum sched_type schedule, kmp_uint32 lb,
+                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
+}
+
+/*!
+See @ref __kmpc_dispatch_init_4 (this variant takes kmp_int64 arguments)
+*/
+void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
+                            enum sched_type schedule, kmp_int64 lb,
+                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
+}
+
+/*!
+See @ref __kmpc_dispatch_init_4 (this variant takes kmp_uint64 bounds)
+*/
+void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
+                             enum sched_type schedule, kmp_uint64 lb,
+                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
+}
+
+/*!
+See @ref __kmpc_dispatch_init_4
+
+Unlike the __kmpc_dispatch_init set of functions, these are called for the
+composite distribute parallel for construct: __kmp_dist_get_bounds first
+narrows lb/ub to this team's iteration space (setting *p_last), and only then
+is regular dispatching initialized.
+These functions are all identical apart from the types of the arguments.
+*/
+void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
+                                 enum sched_type schedule, kmp_int32 *p_last,
+                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
+                                 kmp_int32 chunk) {
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
+  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
+}
+
+void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
+                                  enum sched_type schedule, kmp_int32 *p_last,
+                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
+                                  kmp_int32 chunk) {
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
+  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
+}
+
+void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
+                                 enum sched_type schedule, kmp_int32 *p_last,
+                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
+                                 kmp_int64 chunk) {
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
+  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
+}
+
+void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
+                                  enum sched_type schedule, kmp_int32 *p_last,
+                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
+                                  kmp_int64 chunk) {
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
+  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
+}
+
+/*!
+@param loc Source code location
+@param gtid Global thread id
+@param p_last Pointer to a flag set to one if this is the last chunk or zero
+otherwise (may be NULL)
+@param p_lb   Pointer to the lower bound for the next chunk of work
+@param p_ub   Pointer to the upper bound for the next chunk of work
+@param p_st   Pointer to the stride for the next chunk of work
+@return one if there is work to be done, zero otherwise
+
+Get the next dynamically allocated chunk of work for this thread.
+If there is no more work, then the lb, ub and stride need not be modified.
+*/
+int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
+                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+                                        ,
+                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
+#endif
+                                            );
+}
+
+/*!
+See @ref __kmpc_dispatch_next_4 (this variant uses kmp_uint32 bounds)
+*/
+int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
+                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
+                            kmp_int32 *p_st) {
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+                                         ,
+                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
+#endif
+                                             );
+}
+
+/*!
+See @ref __kmpc_dispatch_next_4 (this variant uses kmp_int64 bounds and stride)
+*/
+int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
+                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+                                        ,
+                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
+#endif
+                                            );
+}
+
+/*!
+See @ref __kmpc_dispatch_next_4 (this variant uses kmp_uint64 bounds)
+*/
+int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
+                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
+                            kmp_int64 *p_st) {
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+                                         ,
+                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
+#endif
+                                             );
+}
+
+/*!
+@param loc Source code location
+@param gtid Global thread id
+
+Mark the end of a dynamic loop (forwards to the kmp_uint32 finish template).
+*/
+void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
+  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
+}
+
+/*!
+See @ref __kmpc_dispatch_fini_4 (forwards to the kmp_uint64 finish template)
+*/
+void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
+  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
+}
+
+/*!
+See @ref __kmpc_dispatch_fini_4 (same kmp_uint32 finish template as _4)
+*/
+void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
+  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
+}
+
+/*!
+See @ref __kmpc_dispatch_fini_4 (same kmp_uint64 finish template as _8)
+*/
+void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
+  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
+}
+/*! @} */
+
+//-----------------------------------------------------------------------------
+// Non-template routines from kmp_dispatch.cpp used in other sources
+
+kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
+  return value == checker; // 1 when value equals checker, otherwise 0
+}
+
+kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
+  return value != checker; // 1 when value differs from checker, otherwise 0
+}
+
+kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
+  return value < checker; // 1 when value is below checker, otherwise 0
+}
+
+kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
+  return value >= checker; // 1 when value is at least checker, otherwise 0
+}
+
+kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
+  return value <= checker; // 1 when value is at most checker, otherwise 0
+}
+
+kmp_uint32
+__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
+             kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
+             void *obj // Higher-level synchronization object, or NULL.
+             ) { // spin-wait on *spinner; returns the value that satisfied pred
+  // note: we may not belong to a team at this point
+  volatile kmp_uint32 *spin = spinner;
+  kmp_uint32 check = checker;
+  kmp_uint32 spins; // handed to KMP_INIT_YIELD / KMP_YIELD_OVERSUB_ELSE_SPIN
+  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
+  kmp_uint32 r; // latest TCR_4(*spin) result
+
+  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
+  KMP_INIT_YIELD(spins);
+  // main wait spin loop: repeat until pred(TCR_4(*spin), checker) is nonzero
+  while (!f(r = TCR_4(*spin), check)) {
+    KMP_FSYNC_SPIN_PREPARE(obj);
+    /* GEH - remove this since it was accidentally introduced when kmp_wait was
+       split. It causes problems with infinite recursion because of exit lock */
+    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
+        __kmp_abort_thread(); */
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+  }
+  KMP_FSYNC_SPIN_ACQUIRED(obj);
+  return r;
+}
+
+void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
+                      kmp_uint32 (*pred)(void *, kmp_uint32),
+                      void *obj // Higher-level synchronization object, or NULL.
+                      ) { // spin until pred(spinner, checker) returns nonzero
+  // note: we may not belong to a team at this point
+  void *spin = spinner;
+  kmp_uint32 check = checker;
+  kmp_uint32 spins; // handed to KMP_INIT_YIELD / KMP_YIELD_OVERSUB_ELSE_SPIN
+  kmp_uint32 (*f)(void *, kmp_uint32) = pred;
+
+  KMP_FSYNC_SPIN_INIT(obj, spin);
+  KMP_INIT_YIELD(spins);
+  // main wait spin loop
+  while (!f(spin, check)) {
+    KMP_FSYNC_SPIN_PREPARE(obj);
+    /* if we have waited a bit, or are oversubscribed, yield */
+    /* pause is in the following code */
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+  }
+  KMP_FSYNC_SPIN_ACQUIRED(obj);
+}
+
+} // extern "C"
+
+#ifdef KMP_GOMP_COMPAT // dispatch init with caller-chosen push_ws, chunk fini
+
+void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
+                               enum sched_type schedule, kmp_int32 lb,
+                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
+                               int push_ws) {
+  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
+                                 push_ws);
+}
+
+void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
+                                enum sched_type schedule, kmp_uint32 lb,
+                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
+                                int push_ws) {
+  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
+                                  push_ws);
+}
+
+void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
+                               enum sched_type schedule, kmp_int64 lb,
+                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
+                               int push_ws) {
+  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
+                                 push_ws);
+}
+
+void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
+                                enum sched_type schedule, kmp_uint64 lb,
+                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
+                                int push_ws) {
+  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
+                                  push_ws);
+}
+
+void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
+  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc); // shared with _4u
+}
+
+void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
+  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc); // shared with _8u
+}
+
+void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
+  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
+}
+
+void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
+  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
+}
+
+#endif /* KMP_GOMP_COMPAT */
+
+/* ------------------------------------------------------------------------ */
diff --git a/final/runtime/src/kmp_dispatch.h b/final/runtime/src/kmp_dispatch.h
new file mode 100644
index 0000000..8b3e984
--- /dev/null
+++ b/final/runtime/src/kmp_dispatch.h
@@ -0,0 +1,506 @@
+/*
+ * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_DISPATCH_H
+#define KMP_DISPATCH_H
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#include "kmp.h"
+#include "kmp_error.h"
+#include "kmp_i18n.h"
+#include "kmp_itt.h"
+#include "kmp_stats.h"
+#include "kmp_str.h"
+#if KMP_OS_WINDOWS && KMP_ARCH_X86
+#include <float.h>
+#endif
+
+#if OMPT_SUPPORT
+#include "ompt-internal.h"
+#include "ompt-specific.h"
+#endif
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+#if KMP_USE_HIER_SCHED
+// Forward declarations of some hierarchical scheduling data structures
+template <typename T> struct kmp_hier_t;
+template <typename T> struct kmp_hier_top_unit_t;
+#endif // KMP_USE_HIER_SCHED
+
+template <typename T> struct dispatch_shared_info_template;
+template <typename T> struct dispatch_private_info_template;
+
+template <typename T>
+extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
+                                          dispatch_private_info_template<T> *pr,
+                                          enum sched_type schedule, T lb, T ub,
+                                          typename traits_t<T>::signed_t st,
+#if USE_ITT_BUILD
+                                          kmp_uint64 *cur_chunk,
+#endif
+                                          typename traits_t<T>::signed_t chunk,
+                                          T nproc, T unit_id);
+template <typename T>
+extern int __kmp_dispatch_next_algorithm(
+    int gtid, dispatch_private_info_template<T> *pr,
+    dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb,
+    T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id);
+
+void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
+void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
+
+#if KMP_STATIC_STEAL_ENABLED
+
+// replaces dispatch_private_info{32,64} structures and
+// dispatch_private_info{32,64}_t types
+template <typename T> struct dispatch_private_infoXX_template {
+  typedef typename traits_t<T>::unsigned_t UT;
+  typedef typename traits_t<T>::signed_t ST;
+  UT count; // unsigned
+  T ub;
+  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
+  T lb;
+  ST st; // signed
+  UT tc; // unsigned
+  T static_steal_counter; // for static_steal only; maybe better to put after ub
+
+  /* parm[1-4] are used in different ways by different scheduling algorithms */
+
+  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
+  //    a) parm3 is properly aligned and
+  //    b) all parm1-4 are in the same cache line.
+  // Because parm1-4 are used together, performance seems to be better
+  // if they are in the same line (not measured though).
+
+  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
+    T parm1;
+    T parm2;
+    T parm3;
+    T parm4;
+  };
+
+  UT ordered_lower; // unsigned
+  UT ordered_upper; // unsigned
+#if KMP_OS_WINDOWS
+  T last_upper;
+#endif /* KMP_OS_WINDOWS */
+};
+
+#else /* KMP_STATIC_STEAL_ENABLED */
+
+// replaces dispatch_private_info{32,64} structures and
+// dispatch_private_info{32,64}_t types
+template <typename T> struct dispatch_private_infoXX_template {
+  typedef typename traits_t<T>::unsigned_t UT;
+  typedef typename traits_t<T>::signed_t ST;
+  T lb;
+  T ub;
+  ST st; // signed
+  UT tc; // unsigned
+
+  T parm1;
+  T parm2;
+  T parm3;
+  T parm4;
+
+  UT count; // unsigned
+
+  UT ordered_lower; // unsigned
+  UT ordered_upper; // unsigned
+#if KMP_OS_WINDOWS
+  T last_upper;
+#endif /* KMP_OS_WINDOWS */
+};
+#endif /* KMP_STATIC_STEAL_ENABLED */
+
+template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
+  // duplicate alignment here, otherwise size of structure is not correct in our
+  // compiler
+  union KMP_ALIGN_CACHE private_info_tmpl {
+    dispatch_private_infoXX_template<T> p;
+    dispatch_private_info64_t p64;
+  } u;
+  enum sched_type schedule; /* scheduling algorithm */
+  kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
+  kmp_uint32 ordered_bumped;
+  // to retain the structure size after making order
+  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
+  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
+  kmp_uint32 type_size;
+#if KMP_USE_HIER_SCHED
+  kmp_int32 hier_id;
+  kmp_hier_top_unit_t<T> *hier_parent;
+  // member functions
+  kmp_int32 get_hier_id() const { return hier_id; }
+  kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
+#endif
+  enum cons_type pushed_ws;
+};
+
+// replaces dispatch_shared_info{32,64} structures and
+// dispatch_shared_info{32,64}_t types
+template <typename T> struct dispatch_shared_infoXX_template {
+  typedef typename traits_t<T>::unsigned_t UT;
+  /* chunk index under dynamic, number of idle threads under static-steal;
+     iteration index otherwise */
+  volatile UT iteration;
+  volatile UT num_done;
+  volatile UT ordered_iteration;
+  // to retain the structure size after making ordered_iteration scalar
+  UT ordered_dummy[KMP_MAX_ORDERED - 3];
+};
+
+// replaces dispatch_shared_info structure and dispatch_shared_info_t type
+template <typename T> struct dispatch_shared_info_template {
+  typedef typename traits_t<T>::unsigned_t UT;
+  // we need union here to keep the structure size
+  union shared_info_tmpl {
+    dispatch_shared_infoXX_template<UT> s;
+    dispatch_shared_info64_t s64;
+  } u;
+  volatile kmp_uint32 buffer_index;
+  volatile kmp_int32 doacross_buf_idx; // teamwise index
+  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
+  kmp_int32 doacross_num_done; // count finished threads
+#if KMP_USE_HIER_SCHED
+  kmp_hier_t<T> *hier;
+#endif
+#if KMP_USE_HWLOC
+  // When linking with libhwloc, the ORDERED EPCC test slows down on big
+  // machines (> 48 cores). Performance analysis showed that a cache thrash
+  // was occurring and this padding helps alleviate the problem.
+  char padding[64];
+#endif
+};
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#undef USE_TEST_LOCKS
+
+// test_then_add template (general template should NOT be used)
+template <typename T> static __forceinline T test_then_add(volatile T *p, T d);
+
+template <>
+__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
+                                                 kmp_int32 d) {
+  kmp_int32 r;
+  r = KMP_TEST_THEN_ADD32(p, d);
+  return r;
+}
+
+template <>
+__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
+                                                 kmp_int64 d) {
+  kmp_int64 r;
+  r = KMP_TEST_THEN_ADD64(p, d);
+  return r;
+}
+
+// test_then_inc_acq template (general template should NOT be used)
+template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);
+
+template <>
+__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
+  kmp_int32 r;
+  r = KMP_TEST_THEN_INC_ACQ32(p);
+  return r;
+}
+
+template <>
+__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
+  kmp_int64 r;
+  r = KMP_TEST_THEN_INC_ACQ64(p);
+  return r;
+}
+
+// test_then_inc template (general template should NOT be used)
+template <typename T> static __forceinline T test_then_inc(volatile T *p);
+
+template <>
+__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
+  kmp_int32 r;
+  r = KMP_TEST_THEN_INC32(p);
+  return r;
+}
+
+template <>
+__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
+  kmp_int64 r;
+  r = KMP_TEST_THEN_INC64(p);
+  return r;
+}
+
+// compare_and_swap template (general template should NOT be used)
+template <typename T>
+static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);
+
+template <>
+__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
+                                                    kmp_int32 c, kmp_int32 s) {
+  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
+}
+
+template <>
+__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
+                                                    kmp_int64 c, kmp_int64 s) {
+  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
+}
+
+template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
+  return value >= checker;
+}
+template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
+  return value == checker;
+}
+
+/*
+    Spin wait loop that pauses between checks.
+    Waits until function returns non-zero when called with *spinner and check.
+    Does NOT put threads to sleep.
+    Arguments:
+        UT is unsigned 4- or 8-byte type
+        spinner - memory location to check value
+        checker - value which spinner is >, <, ==, etc.
+        pred - predicate function to perform binary comparison of some sort
+#if USE_ITT_BUILD
+        obj -- is higher-level synchronization object to report to ittnotify. It
+        is used to report locks consistently. For example, if lock is acquired
+        immediately, its address is reported to ittnotify via
+        KMP_FSYNC_ACQUIRED(). However, if lock cannot be acquired immediately
+        and lock routine calls KMP_WAIT(), the latter should report the
+        same address, not an address of low-level spinner.
+#endif // USE_ITT_BUILD
+    TODO: make inline function (move to header file for icl)
+*/
+template <typename UT>
+static UT __kmp_wait(volatile UT *spinner, UT checker,
+                     kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(void *obj)) {
+  // note: we may not belong to a team at this point
+  volatile UT *spin = spinner;
+  UT check = checker;
+  kmp_uint32 spins;
+  kmp_uint32 (*f)(UT, UT) = pred;
+  UT r;
+
+  KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
+  KMP_INIT_YIELD(spins);
+  // main wait spin loop
+  while (!f(r = *spin, check)) {
+    KMP_FSYNC_SPIN_PREPARE(obj);
+    /* GEH - remove this since it was accidentally introduced when kmp_wait was
+       split.
+       It causes problems with infinite recursion because of exit lock */
+    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
+        __kmp_abort_thread(); */
+    // If oversubscribed, or have waited a bit then yield.
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+  }
+  KMP_FSYNC_SPIN_ACQUIRED(obj);
+  return r;
+}
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+template <typename UT>
+void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
+  dispatch_private_info_template<UT> *pr;
+
+  int gtid = *gtid_ref;
+  //    int  cid = *cid_ref;
+  kmp_info_t *th = __kmp_threads[gtid];
+  KMP_DEBUG_ASSERT(th->th.th_dispatch);
+
+  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
+  if (__kmp_env_consistency_check) {
+    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
+        th->th.th_dispatch->th_dispatch_pr_current);
+    if (pr->pushed_ws != ct_none) {
+#if KMP_USE_DYNAMIC_LOCK
+      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
+#else
+      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
+#endif
+    }
+  }
+
+  if (!th->th.th_team->t.t_serialized) {
+    dispatch_shared_info_template<UT> *sh =
+        reinterpret_cast<dispatch_shared_info_template<UT> *>(
+            th->th.th_dispatch->th_dispatch_sh_current);
+    UT lower;
+
+    if (!__kmp_env_consistency_check) {
+      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
+          th->th.th_dispatch->th_dispatch_pr_current);
+    }
+    lower = pr->u.p.ordered_lower;
+
+#if !defined(KMP_GOMP_COMPAT)
+    if (__kmp_env_consistency_check) {
+      if (pr->ordered_bumped) {
+        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
+        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
+                               ct_ordered_in_pdo, loc_ref,
+                               &p->stack_data[p->w_top]);
+      }
+    }
+#endif /* !defined(KMP_GOMP_COMPAT) */
+
+    KMP_MB();
+#ifdef KMP_DEBUG
+    {
+      char *buff;
+      // create format specifiers before the debug output
+      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
+                              "ordered_iter:%%%s lower:%%%s\n",
+                              traits_t<UT>::spec, traits_t<UT>::spec);
+      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
+      __kmp_str_free(&buff);
+    }
+#endif
+    __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
+                   __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
+    KMP_MB(); /* is this necessary? */
+#ifdef KMP_DEBUG
+    {
+      char *buff;
+      // create format specifiers before the debug output
+      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
+                              "ordered_iter:%%%s lower:%%%s\n",
+                              traits_t<UT>::spec, traits_t<UT>::spec);
+      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
+      __kmp_str_free(&buff);
+    }
+#endif
+  }
+  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
+}
+
+template <typename UT>
+void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
+  typedef typename traits_t<UT>::signed_t ST;
+  dispatch_private_info_template<UT> *pr;
+
+  int gtid = *gtid_ref;
+  //    int  cid = *cid_ref;
+  kmp_info_t *th = __kmp_threads[gtid];
+  KMP_DEBUG_ASSERT(th->th.th_dispatch);
+
+  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
+  if (__kmp_env_consistency_check) {
+    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
+        th->th.th_dispatch->th_dispatch_pr_current);
+    if (pr->pushed_ws != ct_none) {
+      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
+    }
+  }
+
+  if (!th->th.th_team->t.t_serialized) {
+    dispatch_shared_info_template<UT> *sh =
+        reinterpret_cast<dispatch_shared_info_template<UT> *>(
+            th->th.th_dispatch->th_dispatch_sh_current);
+
+    if (!__kmp_env_consistency_check) {
+      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
+          th->th.th_dispatch->th_dispatch_pr_current);
+    }
+
+    KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
+#if !defined(KMP_GOMP_COMPAT)
+    if (__kmp_env_consistency_check) {
+      if (pr->ordered_bumped != 0) {
+        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
+        /* How to test it? - OM */
+        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
+                               ct_ordered_in_pdo, loc_ref,
+                               &p->stack_data[p->w_top]);
+      }
+    }
+#endif /* !defined(KMP_GOMP_COMPAT) */
+
+    KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+    pr->ordered_bumped += 1;
+
+    KD_TRACE(1000,
+             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
+              gtid, pr->ordered_bumped));
+
+    KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+    /* TODO use general release procedure? */
+    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
+
+    KMP_MB(); /* Flush all pending memory write invalidates.  */
+  }
+  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
+}
+
+/* Computes and returns x to the power of y, where y must be a non-negative
+   integer */
+template <typename UT>
+static __forceinline long double __kmp_pow(long double x, UT y) {
+  long double s = 1.0L;
+
+  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
+  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
+  while (y) {
+    if (y & 1)
+      s *= x;
+    x *= x;
+    y >>= 1;
+  }
+  return s;
+}
+
+/* Computes and returns the number of unassigned iterations after idx chunks
+   have been assigned
+   (the total number of unassigned iterations in chunks with index greater than
+   or equal to idx).
+   __forceinline seems to be broken so that if we __forceinline this function,
+   the behavior is wrong
+   (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
+*/
+template <typename T>
+static __inline typename traits_t<T>::unsigned_t
+__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
+                                typename traits_t<T>::unsigned_t idx) {
+  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
+     least for ICL 8.1, long double arithmetic may not really have
+     long double precision, even with /Qlong_double.  Currently, we
+     workaround that in the caller code, by manipulating the FPCW for
+     Windows* OS on IA-32 architecture.  The lack of precision is not
+     expected to be a correctness issue, though.
+  */
+  typedef typename traits_t<T>::unsigned_t UT;
+
+  long double x = tc * __kmp_pow<UT>(base, idx);
+  UT r = (UT)x;
+  if (x == r)
+    return r;
+  return r + 1;
+}
+
+// Parameters of the guided-iterative algorithm:
+//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
+//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
+// by default n = 2. For example with n = 3 the chunks distribution will be more
+// flat.
+// With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc.
+static const int guided_int_param = 2;
+static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
+#endif // KMP_DISPATCH_H
diff --git a/final/runtime/src/kmp_dispatch_hier.h b/final/runtime/src/kmp_dispatch_hier.h
new file mode 100644
index 0000000..24a6d66
--- /dev/null
+++ b/final/runtime/src/kmp_dispatch_hier.h
@@ -0,0 +1,1106 @@
+/*
+ * kmp_dispatch_hier.h -- hierarchical scheduling methods and data structures
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_DISPATCH_HIER_H
+#define KMP_DISPATCH_HIER_H
+#include "kmp.h"
+#include "kmp_dispatch.h"
+
+// Layer type for scheduling hierarchy
+enum kmp_hier_layer_e {
+  LAYER_THREAD = -1,
+  LAYER_L1,
+  LAYER_L2,
+  LAYER_L3,
+  LAYER_NUMA,
+  LAYER_LOOP,
+  LAYER_LAST
+};
+
+// Convert hierarchy type (LAYER_L1, LAYER_L2, etc.) to C-style string
+static inline const char *__kmp_get_hier_str(kmp_hier_layer_e type) {
+  switch (type) {
+  case kmp_hier_layer_e::LAYER_THREAD:
+    return "THREAD";
+  case kmp_hier_layer_e::LAYER_L1:
+    return "L1";
+  case kmp_hier_layer_e::LAYER_L2:
+    return "L2";
+  case kmp_hier_layer_e::LAYER_L3:
+    return "L3";
+  case kmp_hier_layer_e::LAYER_NUMA:
+    return "NUMA";
+  case kmp_hier_layer_e::LAYER_LOOP:
+    return "WHOLE_LOOP";
+  case kmp_hier_layer_e::LAYER_LAST:
+    return "LAST";
+  }
+  KMP_ASSERT(0);
+  // Appease compilers, should never get here
+  return "ERROR";
+}
+
+// Structure to store values parsed from OMP_SCHEDULE for scheduling hierarchy
+typedef struct kmp_hier_sched_env_t {
+  int size;
+  int capacity;
+  enum sched_type *scheds;
+  kmp_int32 *small_chunks;
+  kmp_int64 *large_chunks;
+  kmp_hier_layer_e *layers;
+  // Append a level of the hierarchy (asserts if LAYER_LAST levels are stored)
+  void append(enum sched_type sched, kmp_int32 chunk, kmp_hier_layer_e layer) {
+    if (capacity == 0) {
+      scheds = (enum sched_type *)__kmp_allocate(sizeof(enum sched_type) *
+                                                 kmp_hier_layer_e::LAYER_LAST);
+      small_chunks = (kmp_int32 *)__kmp_allocate(sizeof(kmp_int32) *
+                                                 kmp_hier_layer_e::LAYER_LAST);
+      large_chunks = (kmp_int64 *)__kmp_allocate(sizeof(kmp_int64) *
+                                                 kmp_hier_layer_e::LAYER_LAST);
+      layers = (kmp_hier_layer_e *)__kmp_allocate(sizeof(kmp_hier_layer_e) *
+                                                  kmp_hier_layer_e::LAYER_LAST);
+      capacity = kmp_hier_layer_e::LAYER_LAST;
+    }
+    int current_size = size;
+    KMP_ASSERT(current_size < kmp_hier_layer_e::LAYER_LAST);
+    scheds[current_size] = sched;
+    layers[current_size] = layer;
+    small_chunks[current_size] = chunk;
+    large_chunks[current_size] = (kmp_int64)chunk;
+    size++;
+  }
+  // Sort the levels by ascending layer value using selection sort; size is
+  // always small (less than LAYER_LAST) so an n*log(n) algorithm is not needed
+  void sort() {
+    if (size <= 1)
+      return;
+    for (int i = 0; i < size; ++i) {
+      int switch_index = i;
+      for (int j = i + 1; j < size; ++j) {
+        if (layers[j] < layers[switch_index])
+          switch_index = j;
+      }
+      if (switch_index != i) {
+        kmp_hier_layer_e temp1 = layers[i];
+        enum sched_type temp2 = scheds[i];
+        kmp_int32 temp3 = small_chunks[i];
+        kmp_int64 temp4 = large_chunks[i];
+        layers[i] = layers[switch_index];
+        scheds[i] = scheds[switch_index];
+        small_chunks[i] = small_chunks[switch_index];
+        large_chunks[i] = large_chunks[switch_index];
+        layers[switch_index] = temp1;
+        scheds[switch_index] = temp2;
+        small_chunks[switch_index] = temp3;
+        large_chunks[switch_index] = temp4;
+      }
+    }
+  }
+  // Free all four arrays (if allocated) and reset size and capacity to 0
+  void deallocate() {
+    if (capacity > 0) {
+      __kmp_free(scheds);
+      __kmp_free(layers);
+      __kmp_free(small_chunks);
+      __kmp_free(large_chunks);
+      scheds = NULL;
+      layers = NULL;
+      small_chunks = NULL;
+      large_chunks = NULL;
+    }
+    size = 0;
+    capacity = 0;
+  }
+} kmp_hier_sched_env_t;
+
+extern int __kmp_dispatch_hand_threading;
+extern kmp_hier_sched_env_t __kmp_hier_scheds;
+
+// Sizes of layer arrays bounded by max number of detected L1s, L2s, etc.
+extern int __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LAST + 1];
+extern int __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LAST + 1];
+
+extern int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type);
+extern int __kmp_dispatch_get_id(int gtid, kmp_hier_layer_e type);
+extern int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1,
+                                        kmp_hier_layer_e t2);
+extern void __kmp_dispatch_free_hierarchies(kmp_team_t *team);
+
+template <typename T> struct kmp_hier_shared_bdata_t {
+  typedef typename traits_t<T>::signed_t ST;
+  volatile kmp_uint64 val[2];
+  kmp_int32 status[2];
+  T lb[2];
+  T ub[2];
+  ST st[2];
+  dispatch_shared_info_template<T> sh[2];
+  void zero() {
+    val[0] = val[1] = 0;
+    status[0] = status[1] = 0;
+    lb[0] = lb[1] = 0;
+    ub[0] = ub[1] = 0;
+    st[0] = st[1] = 0;
+    sh[0].u.s.iteration = sh[1].u.s.iteration = 0;
+  }
+  void set_next_hand_thread(T nlb, T nub, ST nst, kmp_int32 nstatus,
+                            kmp_uint64 index) {
+    lb[1 - index] = nlb;
+    ub[1 - index] = nub;
+    st[1 - index] = nst;
+    status[1 - index] = nstatus;
+  }
+  void set_next(T nlb, T nub, ST nst, kmp_int32 nstatus, kmp_uint64 index) {
+    lb[1 - index] = nlb;
+    ub[1 - index] = nub;
+    st[1 - index] = nst;
+    status[1 - index] = nstatus;
+    sh[1 - index].u.s.iteration = 0;
+  }
+
+  kmp_int32 get_next_status(kmp_uint64 index) const {
+    return status[1 - index];
+  }
+  T get_next_lb(kmp_uint64 index) const { return lb[1 - index]; }
+  T get_next_ub(kmp_uint64 index) const { return ub[1 - index]; }
+  ST get_next_st(kmp_uint64 index) const { return st[1 - index]; }
+  dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) {
+    return &(sh[1 - index]);
+  }
+
+  kmp_int32 get_curr_status(kmp_uint64 index) const { return status[index]; }
+  T get_curr_lb(kmp_uint64 index) const { return lb[index]; }
+  T get_curr_ub(kmp_uint64 index) const { return ub[index]; }
+  ST get_curr_st(kmp_uint64 index) const { return st[index]; }
+  dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) {
+    return &(sh[index]);
+  }
+};
+
+/*
+ * In the barrier implementations, num_active is the number of threads that are
+ * attached to the kmp_hier_top_unit_t structure in the scheduling hierarchy.
+ * bdata is the shared barrier data that resides on the kmp_hier_top_unit_t
+ * structure. tdata is the thread private data that resides on the thread
+ * data structure.
+ *
+ * The reset_shared() method is used to initialize the barrier data on the
+ * kmp_hier_top_unit_t hierarchy structure
+ *
+ * The reset_private() method is used to initialize the barrier data on the
+ * thread's private dispatch buffer structure
+ *
+ * The barrier() method takes an id, which is that thread's id for the
+ * kmp_hier_top_unit_t structure, and implements the barrier.  All threads wait
+ * inside barrier() until all fellow threads who are attached to that
+ * kmp_hier_top_unit_t structure have arrived.
+ */
+
+// Core barrier implementation
+// Can be used in a unit with 2 to 8 threads
+template <typename T> class core_barrier_impl {
+  static inline kmp_uint64 get_wait_val(int num_active) {
+    kmp_uint64 wait_val = 0LL;
+    switch (num_active) {
+    case 2:
+      wait_val = 0x0101LL;
+      break;
+    case 3:
+      wait_val = 0x010101LL;
+      break;
+    case 4:
+      wait_val = 0x01010101LL;
+      break;
+    case 5:
+      wait_val = 0x0101010101LL;
+      break;
+    case 6:
+      wait_val = 0x010101010101LL;
+      break;
+    case 7:
+      wait_val = 0x01010101010101LL;
+      break;
+    case 8:
+      wait_val = 0x0101010101010101LL;
+      break;
+    default:
+      // don't use the core_barrier_impl for more than 8 threads
+      KMP_ASSERT(0);
+    }
+    return wait_val;
+  }
+
+public:
+  static void reset_private(kmp_int32 num_active,
+                            kmp_hier_private_bdata_t *tdata);
+  static void reset_shared(kmp_int32 num_active,
+                           kmp_hier_shared_bdata_t<T> *bdata);
+  static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata,
+                      kmp_hier_private_bdata_t *tdata);
+};
+
+template <typename T>
+void core_barrier_impl<T>::reset_private(kmp_int32 num_active,
+                                         kmp_hier_private_bdata_t *tdata) {
+  tdata->num_active = num_active;
+  tdata->index = 0;
+  tdata->wait_val[0] = tdata->wait_val[1] = get_wait_val(num_active);
+}
+template <typename T>
+void core_barrier_impl<T>::reset_shared(kmp_int32 num_active,
+                                        kmp_hier_shared_bdata_t<T> *bdata) {
+  bdata->val[0] = bdata->val[1] = 0LL;
+  bdata->status[0] = bdata->status[1] = 0LL;
+}
+template <typename T>
+void core_barrier_impl<T>::barrier(kmp_int32 id,
+                                   kmp_hier_shared_bdata_t<T> *bdata,
+                                   kmp_hier_private_bdata_t *tdata) {
+  kmp_uint64 current_index = tdata->index;
+  kmp_uint64 next_index = 1 - current_index;
+  kmp_uint64 current_wait_value = tdata->wait_val[current_index];
+  kmp_uint64 next_wait_value =
+      (current_wait_value ? 0 : get_wait_val(tdata->num_active));
+  KD_TRACE(10, ("core_barrier_impl::barrier(): T#%d current_index:%llu "
+                "next_index:%llu curr_wait:%llu next_wait:%llu\n",
+                __kmp_get_gtid(), current_index, next_index, current_wait_value,
+                next_wait_value));
+  char v = (current_wait_value ? 0x1 : 0x0);
+  (RCAST(volatile char *, &(bdata->val[current_index])))[id] = v;
+  __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
+                         __kmp_eq<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
+  tdata->wait_val[current_index] = next_wait_value;
+  tdata->index = next_index;
+}
+
+// Counter barrier implementation
+// Can be used in a unit with arbitrary number of active threads
+template <typename T> class counter_barrier_impl {
+public:
+  static void reset_private(kmp_int32 num_active,
+                            kmp_hier_private_bdata_t *tdata);
+  static void reset_shared(kmp_int32 num_active,
+                           kmp_hier_shared_bdata_t<T> *bdata);
+  static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata,
+                      kmp_hier_private_bdata_t *tdata);
+};
+
+template <typename T>
+void counter_barrier_impl<T>::reset_private(kmp_int32 num_active,
+                                            kmp_hier_private_bdata_t *tdata) {
+  tdata->num_active = num_active;
+  tdata->index = 0;
+  tdata->wait_val[0] = tdata->wait_val[1] = (kmp_uint64)num_active;
+}
+template <typename T>
+void counter_barrier_impl<T>::reset_shared(kmp_int32 num_active,
+                                           kmp_hier_shared_bdata_t<T> *bdata) {
+  bdata->val[0] = bdata->val[1] = 0LL;
+  bdata->status[0] = bdata->status[1] = 0LL;
+}
+template <typename T>
+void counter_barrier_impl<T>::barrier(kmp_int32 id,
+                                      kmp_hier_shared_bdata_t<T> *bdata,
+                                      kmp_hier_private_bdata_t *tdata) {
+  volatile kmp_int64 *val;
+  kmp_uint64 current_index = tdata->index;
+  kmp_uint64 next_index = 1 - current_index;
+  kmp_uint64 current_wait_value = tdata->wait_val[current_index];
+  kmp_uint64 next_wait_value = current_wait_value + tdata->num_active;
+
+  KD_TRACE(10, ("counter_barrier_impl::barrier(): T#%d current_index:%llu "
+                "next_index:%llu curr_wait:%llu next_wait:%llu\n",
+                __kmp_get_gtid(), current_index, next_index, current_wait_value,
+                next_wait_value));
+  val = RCAST(volatile kmp_int64 *, &(bdata->val[current_index]));
+  KMP_TEST_THEN_INC64(val);
+  __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
+                         __kmp_ge<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
+  tdata->wait_val[current_index] = next_wait_value;
+  tdata->index = next_index;
+}
+
+// Data associated with topology unit within a layer
+// For example, one kmp_hier_top_unit_t corresponds to one L1 cache
+//
+// A unit bundles its private dispatch buffer (hier_pr, the chunk it currently
+// owns), the barrier data shared by everything that communicates with it
+// (hier_barrier) and a link to the unit one layer up (hier_parent).
+template <typename T> struct kmp_hier_top_unit_t {
+  typedef typename traits_t<T>::signed_t ST;
+  typedef typename traits_t<T>::unsigned_t UT;
+  kmp_int32 active; // number of topology units that communicate with this unit
+  // chunk information (lower/upper bound, stride, etc.)
+  dispatch_private_info_template<T> hier_pr;
+  kmp_hier_top_unit_t<T> *hier_parent; // pointer to parent unit
+  kmp_hier_shared_bdata_t<T> hier_barrier; // shared barrier data for this unit
+
+  // Id stored in this unit's private dispatch buffer
+  kmp_int32 get_hier_id() const { return hier_pr.hier_id; }
+
+  // Barrier selection used by the three methods below:
+  //   active == 1      -> no shared barrier is needed
+  //   2 <= active <= 8 -> core_barrier_impl
+  //   otherwise        -> counter_barrier_impl
+
+  // Zero the shared barrier data and re-arm it for `active` participants
+  void reset_shared_barrier() {
+    KMP_DEBUG_ASSERT(active > 0);
+    if (active == 1)
+      return;
+    hier_barrier.zero();
+    if (active >= 2 && active <= 8) {
+      core_barrier_impl<T>::reset_shared(active, &hier_barrier);
+    } else {
+      counter_barrier_impl<T>::reset_shared(active, &hier_barrier);
+    }
+  }
+  // Re-arm one thread's private barrier data (tdata) for this unit
+  void reset_private_barrier(kmp_hier_private_bdata_t *tdata) {
+    KMP_DEBUG_ASSERT(tdata);
+    KMP_DEBUG_ASSERT(active > 0);
+    if (active == 1)
+      return;
+    if (active >= 2 && active <= 8) {
+      core_barrier_impl<T>::reset_private(active, tdata);
+    } else {
+      counter_barrier_impl<T>::reset_private(active, tdata);
+    }
+  }
+  // Participate in this unit's barrier as participant `id` (0 <= id < active)
+  void barrier(kmp_int32 id, kmp_hier_private_bdata_t *tdata) {
+    KMP_DEBUG_ASSERT(tdata);
+    KMP_DEBUG_ASSERT(active > 0);
+    KMP_DEBUG_ASSERT(id >= 0 && id < active);
+    if (active == 1) {
+      // Sole participant: nothing to wait for, just flip the buffer index
+      tdata->index = 1 - tdata->index;
+      return;
+    }
+    if (active >= 2 && active <= 8) {
+      core_barrier_impl<T>::barrier(id, &hier_barrier, tdata);
+    } else {
+      counter_barrier_impl<T>::barrier(id, &hier_barrier, tdata);
+    }
+  }
+
+  // Accessors for the "next" slot of the shared barrier data; all of them
+  // simply forward to hier_barrier with the caller-supplied buffer index.
+  kmp_int32 get_next_status(kmp_uint64 index) const {
+    return hier_barrier.get_next_status(index);
+  }
+  T get_next_lb(kmp_uint64 index) const {
+    return hier_barrier.get_next_lb(index);
+  }
+  T get_next_ub(kmp_uint64 index) const {
+    return hier_barrier.get_next_ub(index);
+  }
+  ST get_next_st(kmp_uint64 index) const {
+    return hier_barrier.get_next_st(index);
+  }
+  dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) {
+    return hier_barrier.get_next_sh(index);
+  }
+
+  // Accessors for the "current" slot of the shared barrier data
+  kmp_int32 get_curr_status(kmp_uint64 index) const {
+    return hier_barrier.get_curr_status(index);
+  }
+  T get_curr_lb(kmp_uint64 index) const {
+    return hier_barrier.get_curr_lb(index);
+  }
+  T get_curr_ub(kmp_uint64 index) const {
+    return hier_barrier.get_curr_ub(index);
+  }
+  ST get_curr_st(kmp_uint64 index) const {
+    return hier_barrier.get_curr_st(index);
+  }
+  dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) {
+    return hier_barrier.get_curr_sh(index);
+  }
+
+  // Publish a chunk (bounds, stride, status) into the "next" slot
+  void set_next_hand_thread(T lb, T ub, ST st, kmp_int32 status,
+                            kmp_uint64 index) {
+    hier_barrier.set_next_hand_thread(lb, ub, st, status, index);
+  }
+  void set_next(T lb, T ub, ST st, kmp_int32 status, kmp_uint64 index) {
+    hier_barrier.set_next(lb, ub, st, status, index);
+  }
+  dispatch_private_info_template<T> *get_my_pr() { return &hier_pr; }
+  kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
+  // Caller must ensure hier_parent is non-null (it is null for top layer units)
+  dispatch_private_info_template<T> *get_parent_pr() {
+    return &(hier_parent->hier_pr);
+  }
+
+  // Non-zero when at least one participant has registered with this unit
+  kmp_int32 is_active() const { return active; }
+  kmp_int32 get_num_active() const { return active; }
+#ifdef KMP_DEBUG
+  // Trace this unit's state.  The loop values are template-typed and may be
+  // 64 bits wide, so they are widened to long long to match the %lld
+  // conversions instead of being passed for %d.
+  void print() {
+    KD_TRACE(10, ("    kmp_hier_top_unit_t: active:%d pr:%p lb:%lld ub:%lld "
+                  "st:%lld tc:%lld\n",
+                  active, &hier_pr, (long long)hier_pr.u.p.lb,
+                  (long long)hier_pr.u.p.ub, (long long)hier_pr.u.p.st,
+                  (long long)hier_pr.u.p.tc));
+  }
+#endif
+};
+
+// Information regarding a single layer within the scheduling hierarchy
+template <typename T> struct kmp_hier_layer_info_t {
+  int num_active; // number of active topology units in this layer
+  kmp_hier_layer_e type; // LAYER_L1, LAYER_L2, etc.
+  enum sched_type sched; // static, dynamic, guided, etc.
+  typename traits_t<T>::signed_t chunk; // chunk size associated with schedule
+  int length; // length of the kmp_hier_top_unit_t array
+
+#ifdef KMP_DEBUG
+  // Print this layer's information
+  // chunk is template-typed and may be 64 bits wide; it is widened to
+  // long long so the argument matches its conversion and does not corrupt
+  // the arguments that follow it.
+  void print() {
+    const char *t = __kmp_get_hier_str(type);
+    KD_TRACE(10, ("    kmp_hier_layer_info_t: num_active:%d type:%s sched:%d "
+                  "chunk:%lld length:%d\n",
+                  num_active, t, (int)sched, (long long)chunk, length));
+  }
+#endif
+};
+
+/*
+ * Structure to implement entire hierarchy
+ *
+ * The hierarchy is kept as an array of arrays to represent the different
+ * layers.  Layer 0 is the lowest layer and layer num_layers - 1 is the
+ * highest layer.
+ * Example:
+ * [ 2 ] -> [ L3 | L3 ]
+ * [ 1 ] -> [ L2 | L2 | L2 | L2 ]
+ * [ 0 ] -> [ L1 | L1 | L1 | L1 | L1 | L1 | L1 | L1 ]
+ * There is also an array of layer_info_t which has information regarding
+ * each layer
+ */
+template <typename T> struct kmp_hier_t {
+public:
+  typedef typename traits_t<T>::unsigned_t UT;
+  typedef typename traits_t<T>::signed_t ST;
+
+private:
+  // Try to obtain the next range of iterations for the unit `current` at
+  // layer `hier_level`, climbing to the parent layer when this one is empty.
+  //
+  // loc, gtid   - source location and global thread id of the caller
+  // current     - unit being asked for iterations (must be non-null)
+  // p_last      - if non-null and previous_id == 0, receives whether the
+  //               obtained range holds the global last iteration
+  // p_lb, p_ub, p_st - not written here; the range is published through
+  //               current->set_next() instead
+  // previous_id - caller's id within `current`; only id 0 (the master of the
+  //               unit) grabs iterations, everyone else just waits
+  // hier_level  - layer index of `current`
+  //
+  // Returns the status of the unit: 1 when a range is available, 0 when there
+  // is none, 2 when a range was fetched from above but this layer's schedule
+  // handed nothing out of it (callers retry).  With hand threading enabled,
+  // layer 0 does not take the barrier and returns the master's status directly.
+  int next_recurse(ident_t *loc, int gtid, kmp_hier_top_unit_t<T> *current,
+                   kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st,
+                   kmp_int32 previous_id, int hier_level) {
+    // Check the inputs before anything below dereferences them
+    KMP_DEBUG_ASSERT(current);
+    KMP_DEBUG_ASSERT(hier_level >= 0);
+    KMP_DEBUG_ASSERT(hier_level < get_num_layers());
+    kmp_info_t *th = __kmp_threads[gtid];
+    KMP_DEBUG_ASSERT(th);
+    // Initialized so that a non-master falling into the hand threading return
+    // path can never hand back an indeterminate value
+    int status = 0;
+    auto parent = current->get_parent();
+    bool last_layer = (hier_level == get_num_layers() - 1);
+    kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[hier_level]);
+    KMP_DEBUG_ASSERT(tdata);
+    KMP_DEBUG_ASSERT(parent || last_layer);
+
+    KD_TRACE(
+        1, ("kmp_hier_t.next_recurse(): T#%d (%d) called\n", gtid, hier_level));
+
+    T hier_id = (T)current->get_hier_id();
+    // Attempt to grab next iteration range for this level
+    if (previous_id == 0) {
+      KD_TRACE(1, ("kmp_hier_t.next_recurse(): T#%d (%d) is master of unit\n",
+                   gtid, hier_level));
+      // Zero-initialized: these are read/published below even on paths where
+      // the dispatch algorithm may return without having written them
+      kmp_int32 contains_last = 0;
+      T my_lb = 0, my_ub = 0;
+      ST my_st = 0;
+      T nproc;
+      dispatch_shared_info_template<T> volatile *my_sh;
+      dispatch_private_info_template<T> *my_pr;
+      if (last_layer) {
+        // last layer below the very top uses the single shared buffer
+        // from the team struct.
+        KD_TRACE(10,
+                 ("kmp_hier_t.next_recurse(): T#%d (%d) using top level sh\n",
+                  gtid, hier_level));
+        my_sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
+            th->th.th_dispatch->th_dispatch_sh_current);
+        nproc = (T)get_top_level_nproc();
+      } else {
+        // middle layers use the shared buffer inside the kmp_hier_top_unit_t
+        // structure
+        KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) using hier sh\n",
+                      gtid, hier_level));
+        my_sh =
+            parent->get_curr_sh(th->th.th_hier_bar_data[hier_level + 1].index);
+        nproc = (T)parent->get_num_active();
+      }
+      my_pr = current->get_my_pr();
+      KMP_DEBUG_ASSERT(my_sh);
+      KMP_DEBUG_ASSERT(my_pr);
+      enum sched_type schedule = get_sched(hier_level);
+      ST chunk = (ST)get_chunk(hier_level);
+      status = __kmp_dispatch_next_algorithm<T>(gtid, my_pr, my_sh,
+                                                &contains_last, &my_lb, &my_ub,
+                                                &my_st, nproc, hier_id);
+      KD_TRACE(
+          10,
+          ("kmp_hier_t.next_recurse(): T#%d (%d) next_pr_sh() returned %d\n",
+           gtid, hier_level, status));
+      // When no iterations are found (status == 0) and this is not the last
+      // layer, attempt to go up the hierarchy for more iterations
+      if (status == 0 && !last_layer) {
+        status = next_recurse(loc, gtid, parent, &contains_last, &my_lb, &my_ub,
+                              &my_st, hier_id, hier_level + 1);
+        KD_TRACE(
+            10,
+            ("kmp_hier_t.next_recurse(): T#%d (%d) hier_next() returned %d\n",
+             gtid, hier_level, status));
+        if (status == 1) {
+          // The parent has a fresh range: re-initialize this unit's private
+          // buffer from it and draw this unit's first chunk
+          kmp_hier_private_bdata_t *upper_tdata =
+              &(th->th.th_hier_bar_data[hier_level + 1]);
+          my_sh = parent->get_curr_sh(upper_tdata->index);
+          KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) about to init\n",
+                        gtid, hier_level));
+          __kmp_dispatch_init_algorithm(loc, gtid, my_pr, schedule,
+                                        parent->get_curr_lb(upper_tdata->index),
+                                        parent->get_curr_ub(upper_tdata->index),
+                                        parent->get_curr_st(upper_tdata->index),
+#if USE_ITT_BUILD
+                                        NULL,
+#endif
+                                        chunk, nproc, hier_id);
+          status = __kmp_dispatch_next_algorithm<T>(
+              gtid, my_pr, my_sh, &contains_last, &my_lb, &my_ub, &my_st, nproc,
+              hier_id);
+          if (!status) {
+            KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) status not 1 "
+                          "setting to 2!\n",
+                          gtid, hier_level));
+            status = 2;
+          }
+        }
+      }
+      current->set_next(my_lb, my_ub, my_st, status, tdata->index);
+      // Propagate whether a unit holds the actual global last iteration
+      // The contains_last attribute is sent downwards from the top to the
+      // bottom of the hierarchy via the contains_last flag inside the
+      // private dispatch buffers in the hierarchy's middle layers
+      if (contains_last) {
+        // If the next_algorithm() method returns 1 for p_last and it is the
+        // last layer or our parent contains the last serial chunk, then the
+        // chunk must contain the last serial iteration.
+        if (last_layer || parent->hier_pr.flags.contains_last) {
+          KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) Setting this pr "
+                        "to contain last.\n",
+                        gtid, hier_level));
+          current->hier_pr.flags.contains_last = contains_last;
+        }
+        if (!current->hier_pr.flags.contains_last)
+          contains_last = FALSE;
+      }
+      if (p_last)
+        *p_last = contains_last;
+    } // if master thread of this unit
+    if (hier_level > 0 || !__kmp_dispatch_hand_threading) {
+      KD_TRACE(10,
+               ("kmp_hier_t.next_recurse(): T#%d (%d) going into barrier.\n",
+                gtid, hier_level));
+      current->barrier(previous_id, tdata);
+      KD_TRACE(10,
+               ("kmp_hier_t.next_recurse(): T#%d (%d) released and exit %d\n",
+                gtid, hier_level, current->get_curr_status(tdata->index)));
+    } else {
+      // Hand threading at the lowest layer: only the master gets here and it
+      // returns its own status without taking the barrier
+      KMP_DEBUG_ASSERT(previous_id == 0);
+      return status;
+    }
+    return current->get_curr_status(tdata->index);
+  }
+
+public:
+  int top_level_nproc; // count of activated units in the top layer
+  int num_layers; // number of layers; length of info[] and layers[]
+  bool valid; // true once allocate_hier() has fully built the hierarchy
+  int type_size; // traits_t<T>::type_size recorded at allocation time
+  kmp_hier_layer_info_t<T> *info; // per-layer information, indexed by layer
+  kmp_hier_top_unit_t<T> **layers; // layers[i] is the unit array of layer i
+  // Deallocate all memory from this hierarchy and mark it invalid.
+  // Tolerates layers and/or info being NULL: the per-layer arrays are only
+  // visited when the array of layer pointers itself exists.
+  void deallocate() {
+    if (layers != NULL) {
+      // Release every per-layer unit array before the pointer array itself
+      for (int i = 0; i < num_layers; ++i) {
+        if (layers[i] != NULL) {
+          __kmp_free(layers[i]);
+        }
+      }
+      __kmp_free(layers);
+      layers = NULL;
+    }
+    if (info != NULL) {
+      __kmp_free(info);
+      info = NULL;
+    }
+    num_layers = 0;
+    valid = false;
+  }
+  // Decide whether the existing allocation can serve a request for n layers
+  // described by new_layers / new_scheds / new_chunks.
+  // Returns false only when the hierarchy is valid and allocated, was built
+  // for the same type size and layer count, and every layer has the same
+  // type, schedule and chunk; returns true otherwise.
+  bool need_to_reallocate(int n, const kmp_hier_layer_e *new_layers,
+                          const enum sched_type *new_scheds,
+                          const ST *new_chunks) const {
+    const bool usable = valid && layers != NULL && info != NULL;
+    if (!usable)
+      return true;
+    if (traits_t<T>::type_size != type_size || n != num_layers)
+      return true;
+    for (int level = 0; level < n; ++level) {
+      const kmp_hier_layer_info_t<T> &have = info[level];
+      const bool same_layer = have.type == new_layers[level] &&
+                              have.sched == new_scheds[level] &&
+                              have.chunk == new_chunks[level];
+      if (!same_layer)
+        return true;
+    }
+    return false;
+  }
+  // Build (or recycle) the scheduling hierarchy for n layers.
+  // Exactly one thread should call this while the other threads wait.
+  // new_layers, new_scheds and new_chunks describe the layers and should
+  // arrive pre-sorted by kmp_hier_layer_e value.
+  // When the existing allocation already matches the request it is kept and
+  // only its active counters are cleared; otherwise everything is freed and
+  // allocated again.  If a requested layer has no units a warning is issued
+  // and the hierarchy is left deallocated and invalid.
+  void allocate_hier(int n, const kmp_hier_layer_e *new_layers,
+                     const enum sched_type *new_scheds, const ST *new_chunks) {
+    top_level_nproc = 0;
+    const bool reuse =
+        !need_to_reallocate(n, new_layers, new_scheds, new_chunks);
+    if (reuse) {
+      KD_TRACE(
+          10,
+          ("kmp_hier_t<T>::allocate_hier: T#0 do not need to reallocate\n"));
+      // Same shape as before: just forget who was active last time
+      for (int level = 0; level < n; ++level) {
+        info[level].num_active = 0;
+        kmp_hier_top_unit_t<T> *units = layers[level];
+        const int count = get_length(level);
+        for (int u = 0; u < count; ++u)
+          units[u].active = 0;
+      }
+      return;
+    }
+    KD_TRACE(10, ("kmp_hier_t<T>::allocate_hier: T#0 full alloc\n"));
+    deallocate();
+    type_size = traits_t<T>::type_size;
+    num_layers = n;
+    info = (kmp_hier_layer_info_t<T> *)__kmp_allocate(
+        sizeof(kmp_hier_layer_info_t<T>) * n);
+    layers = (kmp_hier_top_unit_t<T> **)__kmp_allocate(
+        sizeof(kmp_hier_top_unit_t<T> *) * n);
+    for (int level = 0; level < n; ++level) {
+      const kmp_hier_layer_e layer = new_layers[level];
+      kmp_hier_layer_info_t<T> &entry = info[level];
+      entry.num_active = 0;
+      entry.type = layer;
+      entry.sched = new_scheds[level];
+      entry.chunk = new_chunks[level];
+      const int max_units = __kmp_hier_max_units[layer + 1];
+      if (max_units == 0) {
+        // A layer without any units cannot be scheduled on: give up
+        valid = false;
+        KMP_WARNING(HierSchedInvalid, __kmp_get_hier_str(layer));
+        deallocate();
+        return;
+      }
+      entry.length = max_units;
+      kmp_hier_top_unit_t<T> *units = (kmp_hier_top_unit_t<T> *)__kmp_allocate(
+          sizeof(kmp_hier_top_unit_t<T>) * max_units);
+      layers[level] = units;
+      for (int u = 0; u < max_units; ++u) {
+        units[u].active = 0;
+        units[u].hier_pr.flags.use_hier = TRUE;
+      }
+    }
+    valid = true;
+  }
+  // Hand the calling thread its next chunk of iterations, refilling the
+  // lowest unit from the layers above (via next_recurse()) when it runs dry.
+  //
+  // loc - source file location
+  // gtid - global thread identifier
+  // pr - this thread's private dispatch buffer (corresponding with gtid)
+  // p_last (return value) - pointer to flag indicating this set of iterations
+  //          contains last iteration (may be NULL)
+  // p_lb (return value) - lower bound for this chunk of iterations
+  // p_ub (return value) - upper bound for this chunk of iterations
+  // p_st (return value) - stride for this chunk of iterations
+  //
+  // Returns 1 if there are more iterations to perform, 0 otherwise
+  int next(ident_t *loc, int gtid, dispatch_private_info_template<T> *pr,
+           kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st) {
+    int status;
+    kmp_int32 contains_last = 0;
+    // Each pointer is checked before it is first dereferenced
+    kmp_info_t *th = __kmp_threads[gtid];
+    KMP_DEBUG_ASSERT(th);
+    kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[0]);
+    KMP_DEBUG_ASSERT(tdata);
+    auto parent = pr->get_parent();
+    KMP_DEBUG_ASSERT(parent);
+    T nproc = (T)parent->get_num_active();
+    T unit_id = (T)pr->get_hier_id();
+    // T may be 64 bits wide, so values of that type are widened to long long
+    // for tracing rather than being passed for %d
+    KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL nproc:%lld "
+                  "unit_id:%lld called\n",
+                  gtid, (long long)nproc, (long long)unit_id));
+    // Handthreading implementation
+    // Each iteration is performed by all threads on last unit (typically
+    // cores/tiles)
+    // e.g., threads 0,1,2,3 all execute iteration 0
+    //       threads 0,1,2,3 all execute iteration 1
+    //       threads 4,5,6,7 all execute iteration 2
+    //       threads 4,5,6,7 all execute iteration 3
+    //       ... etc.
+    if (__kmp_dispatch_hand_threading) {
+      KD_TRACE(10,
+               ("kmp_hier_t.next(): T#%d THREAD LEVEL using hand threading\n",
+                gtid));
+      if (unit_id == 0) {
+        // For hand threading, the sh buffer on the lowest level is only ever
+        // modified and read by the master thread on that level.  Because of
+        // this, we can always use the first sh buffer.
+        auto sh = &(parent->hier_barrier.sh[0]);
+        KMP_DEBUG_ASSERT(sh);
+        status = __kmp_dispatch_next_algorithm<T>(
+            gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
+        if (!status) {
+          // Out of iterations: keep asking the hierarchy until it either
+          // yields a usable chunk or reports that nothing is left
+          bool done = false;
+          while (!done) {
+            done = true;
+            status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub,
+                                  p_st, unit_id, 0);
+            if (status == 1) {
+              // No barrier was taken at layer 0 here, so the new range is
+              // still in the "next" slot
+              __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
+                                            parent->get_next_lb(tdata->index),
+                                            parent->get_next_ub(tdata->index),
+                                            parent->get_next_st(tdata->index),
+#if USE_ITT_BUILD
+                                            NULL,
+#endif
+                                            pr->u.p.parm1, nproc, unit_id);
+              sh->u.s.iteration = 0;
+              status = __kmp_dispatch_next_algorithm<T>(
+                  gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc,
+                  unit_id);
+              if (!status) {
+                KD_TRACE(10,
+                         ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
+                          "after next_pr_sh()"
+                          "trying again.\n",
+                          gtid));
+                done = false;
+              }
+            } else if (status == 2) {
+              KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
+                            "trying again.\n",
+                            gtid));
+              done = false;
+            }
+          }
+        }
+        parent->set_next_hand_thread(*p_lb, *p_ub, *p_st, status, tdata->index);
+      } // if master thread of lowest unit level
+      parent->barrier(pr->get_hier_id(), tdata);
+      if (unit_id != 0) {
+        // Non-masters pick up the chunk the master published before the
+        // barrier
+        *p_lb = parent->get_curr_lb(tdata->index);
+        *p_ub = parent->get_curr_ub(tdata->index);
+        *p_st = parent->get_curr_st(tdata->index);
+        status = parent->get_curr_status(tdata->index);
+      }
+    } else {
+      // Normal implementation
+      // Each thread grabs an iteration chunk and executes it (no cooperation)
+      auto sh = parent->get_curr_sh(tdata->index);
+      KMP_DEBUG_ASSERT(sh);
+      status = __kmp_dispatch_next_algorithm<T>(
+          gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
+      KD_TRACE(10,
+               ("kmp_hier_t.next(): T#%d THREAD LEVEL next_algorithm status:%d "
+                "contains_last:%d p_lb:%lld p_ub:%lld p_st:%lld\n",
+                gtid, status, contains_last, (long long)*p_lb,
+                (long long)*p_ub, (long long)*p_st));
+      if (!status) {
+        // Out of iterations: keep asking the hierarchy until it either
+        // yields a usable chunk or reports that nothing is left
+        bool done = false;
+        while (!done) {
+          done = true;
+          status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub,
+                                p_st, unit_id, 0);
+          if (status == 1) {
+            sh = parent->get_curr_sh(tdata->index);
+            __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
+                                          parent->get_curr_lb(tdata->index),
+                                          parent->get_curr_ub(tdata->index),
+                                          parent->get_curr_st(tdata->index),
+#if USE_ITT_BUILD
+                                          NULL,
+#endif
+                                          pr->u.p.parm1, nproc, unit_id);
+            status = __kmp_dispatch_next_algorithm<T>(
+                gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
+            if (!status) {
+              KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
+                            "after next_pr_sh()"
+                            "trying again.\n",
+                            gtid));
+              done = false;
+            }
+          } else if (status == 2) {
+            KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
+                          "trying again.\n",
+                          gtid));
+            done = false;
+          }
+        }
+      }
+    }
+    // A chunk only holds the global last iteration if the unit above does too
+    if (contains_last && !parent->hier_pr.flags.contains_last) {
+      KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL resetting "
+                    "contains_last to FALSE\n",
+                    gtid));
+      contains_last = FALSE;
+    }
+    if (p_last)
+      *p_last = contains_last;
+    KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL exit status %d\n", gtid,
+                  status));
+    return status;
+  }
+  // The accessors below read per-layer data out of the info array
+  // Topology unit type (LAYER_L1, LAYER_L2, ...) used at the given level
+  kmp_hier_layer_e get_type(int level) const {
+    KMP_DEBUG_ASSERT(level >= 0);
+    KMP_DEBUG_ASSERT(level < num_layers);
+    const kmp_hier_layer_info_t<T> &layer = info[level];
+    return layer.type;
+  }
+  // Schedule kind (static, dynamic, guided, ...) used at the given level
+  enum sched_type get_sched(int level) const {
+    KMP_DEBUG_ASSERT(level >= 0);
+    KMP_DEBUG_ASSERT(level < num_layers);
+    const kmp_hier_layer_info_t<T> &layer = info[level];
+    return layer.sched;
+  }
+  // Chunk size paired with the schedule at the given level
+  ST get_chunk(int level) const {
+    KMP_DEBUG_ASSERT(level >= 0);
+    KMP_DEBUG_ASSERT(level < num_layers);
+    const kmp_hier_layer_info_t<T> &layer = info[level];
+    return layer.chunk;
+  }
+  // Active count recorded for the given level (info[level].num_active)
+  int get_num_active(int level) const {
+    KMP_DEBUG_ASSERT(level >= 0);
+    KMP_DEBUG_ASSERT(level < num_layers);
+    const kmp_hier_layer_info_t<T> &layer = info[level];
+    return layer.num_active;
+  }
+  // Number of entries in the topology unit array of the given level
+  int get_length(int level) const {
+    KMP_DEBUG_ASSERT(level >= 0);
+    KMP_DEBUG_ASSERT(level < num_layers);
+    const kmp_hier_layer_info_t<T> &layer = info[level];
+    return layer.length;
+  }
+  // Pointer to the topology unit stored at position `index` of layer `level`
+  kmp_hier_top_unit_t<T> *get_unit(int level, int index) {
+    KMP_DEBUG_ASSERT(level >= 0);
+    KMP_DEBUG_ASSERT(level < num_layers);
+    KMP_DEBUG_ASSERT(index >= 0);
+    KMP_DEBUG_ASSERT(index < get_length(level));
+    kmp_hier_top_unit_t<T> *units = layers[level];
+    return units + index;
+  }
+  // How many layers this hierarchy currently has
+  int get_num_layers() const {
+    return num_layers;
+  }
+  // Participant count for the top layer.
+  // The very top level has no topology unit of its own to hold this number,
+  // yet the scheduling algorithms need it, so it is kept on the hierarchy.
+  int get_top_level_nproc() const {
+    return top_level_nproc;
+  }
+  // True when the hierarchy is usable (see the `valid` flag)
+  bool is_valid() const {
+    return valid;
+  }
+#ifdef KMP_DEBUG
+  // Dump the hierarchy to the debug trace, highest layer first: all of the
+  // per-layer info records, followed by every unit of every layer
+  void print() {
+    KD_TRACE(10, ("kmp_hier_t:\n"));
+    for (int level = num_layers; level-- > 0;) {
+      KD_TRACE(10, ("Info[%d] = ", level));
+      info[level].print();
+    }
+    for (int level = num_layers; level-- > 0;) {
+      KD_TRACE(10, ("Layer[%d] =\n", level));
+      kmp_hier_top_unit_t<T> *units = layers[level];
+      const int count = info[level].length;
+      for (int u = 0; u < count; ++u)
+        units[u].print();
+    }
+  }
+#endif
+};
+
+// Set up hierarchical scheduling for one loop.  The function contains team
+// barriers, so every thread of the team has to call it.
+//
+// loc        - source location of the loop
+// n          - number of layers in the requested hierarchy (> 0)
+// new_layers - layer type of each of the n layers
+// new_scheds - schedule kind of each layer
+// new_chunks - chunk size of each layer
+// lb, ub, st - bounds and stride of the whole loop
+//
+// Phases (separated by barriers):
+//   1. the thread with tid 0 allocates/recycles the shared hierarchy
+//   2. every thread registers itself with the units it belongs to, which
+//      establishes the active count of each unit and layer
+//   3. the master of each unit resets that unit's shared barrier data (the
+//      top layer masters also load the whole loop into their private
+//      buffers) and every thread resets its private barrier data
+// If the team is serialized or the hierarchy turns out to be invalid,
+// pr->flags.use_hier is cleared and the function returns early.
+template <typename T>
+void __kmp_dispatch_init_hierarchy(ident_t *loc, int n,
+                                   kmp_hier_layer_e *new_layers,
+                                   enum sched_type *new_scheds,
+                                   typename traits_t<T>::signed_t *new_chunks,
+                                   T lb, T ub,
+                                   typename traits_t<T>::signed_t st) {
+  int tid, gtid, num_hw_threads, num_threads_per_layer1, active;
+  int my_buffer_index;
+  kmp_info_t *th;
+  kmp_team_t *team;
+  dispatch_private_info_template<T> *pr;
+  dispatch_shared_info_template<T> volatile *sh;
+  gtid = __kmp_entry_gtid();
+  tid = __kmp_tid_from_gtid(gtid);
+#ifdef KMP_DEBUG
+  KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d called: %d layer(s)\n",
+                gtid, n));
+  for (int i = 0; i < n; ++i) {
+    const char *layer = __kmp_get_hier_str(new_layers[i]);
+    // The chunk is signed and may be 64 bits wide: widen it to long long so
+    // the argument matches its conversion
+    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d: new_layers[%d] = %s, "
+                  "new_scheds[%d] = %d, new_chunks[%d] = %lld\n",
+                  gtid, i, layer, i, (int)new_scheds[i], i,
+                  (long long)new_chunks[i]));
+  }
+#endif // KMP_DEBUG
+  KMP_DEBUG_ASSERT(n > 0);
+  KMP_DEBUG_ASSERT(new_layers);
+  KMP_DEBUG_ASSERT(new_scheds);
+  KMP_DEBUG_ASSERT(new_chunks);
+  if (!TCR_4(__kmp_init_parallel))
+    __kmp_parallel_initialize();
+  __kmp_resume_if_soft_paused();
+
+  th = __kmp_threads[gtid];
+  team = th->th.th_team;
+  active = !team->t.t_serialized;
+  th->th.th_ident = loc;
+  num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
+  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
+                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
+  // Locate this loop's private (per thread) and shared (per team) buffers
+  my_buffer_index = th->th.th_dispatch->th_disp_index;
+  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
+      &th->th.th_dispatch
+           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
+  sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
+      &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
+  if (!active) {
+    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d not active parallel. "
+                  "Using normal dispatch functions.\n",
+                  gtid));
+    KMP_DEBUG_ASSERT(pr);
+    pr->flags.use_hier = FALSE;
+    pr->flags.contains_last = FALSE;
+    return;
+  }
+  KMP_DEBUG_ASSERT(pr);
+  KMP_DEBUG_ASSERT(sh);
+  pr->flags.use_hier = TRUE;
+  pr->u.p.tc = 0;
+  // Have master allocate the hierarchy
+  if (__kmp_tid_from_gtid(gtid) == 0) {
+    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d pr:%p sh:%p allocating "
+                  "hierarchy\n",
+                  gtid, pr, sh));
+    if (sh->hier == NULL) {
+      sh->hier = (kmp_hier_t<T> *)__kmp_allocate(sizeof(kmp_hier_t<T>));
+    }
+    sh->hier->allocate_hier(n, new_layers, new_scheds, new_chunks);
+    sh->u.s.iteration = 0;
+  }
+  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+  // Check to make sure the hierarchy is valid
+  kmp_hier_t<T> *hier = sh->hier;
+  if (!hier->is_valid()) {
+    pr->flags.use_hier = FALSE;
+    return;
+  }
+  // Have threads allocate their thread-private barrier data if it hasn't
+  // already been allocated
+  if (th->th.th_hier_bar_data == NULL) {
+    th->th.th_hier_bar_data = (kmp_hier_private_bdata_t *)__kmp_allocate(
+        sizeof(kmp_hier_private_bdata_t) * kmp_hier_layer_e::LAYER_LAST);
+  }
+  // Have threads "register" themselves by modifying the active count for each
+  // level they are involved in. The active count will act as nthreads for that
+  // level regarding the scheduling algorithms
+  for (int i = 0; i < n; ++i) {
+    int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
+    kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
+    // Setup the thread's private dispatch buffer's hierarchy pointers
+    if (i == 0)
+      pr->hier_parent = my_unit;
+    // If this unit is already active, then increment active count and wait
+    if (my_unit->is_active()) {
+      KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
+                    "is already active (%d)\n",
+                    gtid, my_unit, my_unit->active));
+      KMP_TEST_THEN_INC32(&(my_unit->active));
+      break;
+    }
+    // Flag that this unit is active
+    if (KMP_COMPARE_AND_STORE_ACQ32(&(my_unit->active), 0, 1)) {
+      // This thread won the race to activate the unit, so it also registers
+      // the unit with the layer above
+      // Do not setup parent pointer for top level unit since it has no parent
+      if (i < n - 1) {
+        // Setup middle layer pointers to parents
+        my_unit->get_my_pr()->hier_id =
+            index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i),
+                                                 hier->get_type(i + 1));
+        int parent_index = __kmp_dispatch_get_index(tid, hier->get_type(i + 1));
+        my_unit->hier_parent = hier->get_unit(i + 1, parent_index);
+      } else {
+        // Setup top layer information (no parent pointers are set)
+        my_unit->get_my_pr()->hier_id =
+            index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i),
+                                                 kmp_hier_layer_e::LAYER_LOOP);
+        KMP_TEST_THEN_INC32(&(hier->top_level_nproc));
+        my_unit->hier_parent = nullptr;
+      }
+      // Set trip count to 0 so that next() operation will initially climb up
+      // the hierarchy to get more iterations (early exit in next() for tc == 0)
+      my_unit->get_my_pr()->u.p.tc = 0;
+      // Increment this layer's number of active units
+      KMP_TEST_THEN_INC32(&(hier->info[i].num_active));
+      KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
+                    "incrementing num_active\n",
+                    gtid, my_unit));
+    } else {
+      // Another thread activated the unit first: just join it
+      KMP_TEST_THEN_INC32(&(my_unit->active));
+      break;
+    }
+  }
+  // Set this thread's id
+  num_threads_per_layer1 = __kmp_dispatch_get_t1_per_t2(
+      kmp_hier_layer_e::LAYER_THREAD, hier->get_type(0));
+  pr->hier_id = tid % num_threads_per_layer1;
+  // For oversubscribed threads, increment their index within the lowest unit
+  // This is done to prevent having two or more threads with id 0, id 1, etc.
+  if (tid >= num_hw_threads)
+    pr->hier_id += ((tid / num_hw_threads) * num_threads_per_layer1);
+  KD_TRACE(
+      10, ("__kmp_dispatch_init_hierarchy: T#%d setting lowest hier_id to %d\n",
+           gtid, pr->hier_id));
+
+  pr->flags.contains_last = FALSE;
+  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+
+  // Now that the number of active threads at each level is determined,
+  // the barrier data for each unit can be initialized and the last layer's
+  // loop information can be initialized.
+  int prev_id = pr->get_hier_id();
+  for (int i = 0; i < n; ++i) {
+    // A thread keeps climbing only while it is the master (id 0) of the unit
+    // it came from
+    if (prev_id != 0)
+      break;
+    int index = __kmp_dispatch_get_index(tid, hier->get_type(i));
+    kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index);
+    // Only master threads of this unit within the hierarchy do initialization
+    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d (%d) prev_id is 0\n",
+                  gtid, i));
+    my_unit->reset_shared_barrier();
+    my_unit->hier_pr.flags.contains_last = FALSE;
+    // Last layer, initialize the private buffers with entire loop information
+    // Now the next next_algorithm() call will get the first chunk of
+    // iterations properly
+    if (i == n - 1) {
+      __kmp_dispatch_init_algorithm<T>(
+          loc, gtid, my_unit->get_my_pr(), hier->get_sched(i), lb, ub, st,
+#if USE_ITT_BUILD
+          NULL,
+#endif
+          hier->get_chunk(i), hier->get_num_active(i), my_unit->get_hier_id());
+    }
+    prev_id = my_unit->get_hier_id();
+  }
+  // Initialize each layer of the thread's private barrier data
+  kmp_hier_top_unit_t<T> *unit = pr->hier_parent;
+  for (int i = 0; i < n && unit; ++i, unit = unit->get_parent()) {
+    kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[i]);
+    unit->reset_private_barrier(tdata);
+  }
+  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+
+#ifdef KMP_DEBUG
+  if (__kmp_tid_from_gtid(gtid) == 0) {
+    for (int i = 0; i < n; ++i) {
+      KD_TRACE(10,
+               ("__kmp_dispatch_init_hierarchy: T#%d active count[%d] = %d\n",
+                gtid, i, hier->get_num_active(i)));
+    }
+    hier->print();
+  }
+  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+#endif // KMP_DEBUG
+}
+#endif
diff --git a/final/runtime/src/kmp_environment.cpp b/final/runtime/src/kmp_environment.cpp
new file mode 100644
index 0000000..51bc3cf
--- /dev/null
+++ b/final/runtime/src/kmp_environment.cpp
@@ -0,0 +1,500 @@
+/*
+ * kmp_environment.cpp -- Handle environment variables OS-independently.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+/* We use GetEnvironmentVariable for Windows* OS instead of getenv because the
+   act of loading a DLL on Windows* OS makes any user-set environment variables
+   (i.e. with putenv()) unavailable.  getenv() apparently gets a clean copy of
+   the env variables as they existed at the start of the run. JH 12/23/2002
+
+   On Windows* OS, there are two environments (at least, see below):
+
+   1. Environment maintained by Windows* OS on IA-32 architecture. Accessible
+      through GetEnvironmentVariable(), SetEnvironmentVariable(), and
+      GetEnvironmentStrings().
+
+   2. Environment maintained by C RTL. Accessible through getenv(), putenv().
+
+   putenv() function updates both C and Windows* OS on IA-32 architecture.
+   getenv() function searches for variables in C RTL environment only.
+   Windows* OS on IA-32 architecture functions work *only* with Windows* OS on
+   IA-32 architecture.
+
+   Windows* OS on IA-32 architecture maintained by OS, so there is always only
+   one Windows* OS on IA-32 architecture per process. Changes in Windows* OS on
+   IA-32 architecture are process-visible.
+
+   C environment maintained by C RTL. Multiple copies of C RTL may be present
+   in the process, and each C RTL maintains its own environment. :-(
+
+   Thus, proper way to work with environment on Windows* OS is:
+
+   1. Set variables with putenv() function -- both C and Windows* OS on IA-32
+      architecture are being updated. Windows* OS on IA-32 architecture may be
+      considered primary target, while updating C RTL environment is free bonus.
+
+   2. Get variables with GetEnvironmentVariable() -- getenv() does not
+      search Windows* OS on IA-32 architecture, and can not see variables
+      set with SetEnvironmentVariable().
+
+   2007-04-05 -- lev
+*/
+
+#include "kmp_environment.h"
+
+#include "kmp.h" //
+#include "kmp_i18n.h"
+#include "kmp_os.h" // KMP_OS_*.
+#include "kmp_str.h" // __kmp_str_*().
+
+#if KMP_OS_UNIX
+#include <stdlib.h> // getenv, setenv, unsetenv.
+#include <string.h> // strlen, strcpy.
+#if KMP_OS_DARWIN
+#include <crt_externs.h>
+#define environ (*_NSGetEnviron())
+#else
+extern char **environ;
+#endif
+#elif KMP_OS_WINDOWS
+#include <windows.h> // GetEnvironmentVariable, SetEnvironmentVariable,
+// GetLastError.
+#else
+#error Unknown or unsupported OS.
+#endif
+
+// TODO: Eliminate direct memory allocations, use string operations instead.
+
+// Obtain `size` bytes from the runtime's internal allocator. A failed
+// allocation is reported through KMP_FATAL(MemoryAllocFailed).
+static inline void *allocate(size_t size) {
+  void *mem = KMP_INTERNAL_MALLOC(size);
+  if (!mem) {
+    KMP_FATAL(MemoryAllocFailed);
+  }
+  return mem;
+} // allocate
+
+// Return a newly allocated copy of the value of environment variable `name`,
+// or NULL if the variable does not exist. The caller owns the returned buffer
+// and must release it with __kmp_env_free().
+char *__kmp_env_get(char const *name) {
+
+  char *result = NULL;
+
+#if KMP_OS_UNIX
+  char const *value = getenv(name);
+  if (value != NULL) {
+    // Copy the value including its terminating zero byte; allocate() reports
+    // an allocation failure itself.
+    size_t len = KMP_STRLEN(value) + 1;
+    result = (char *)allocate(len);
+    KMP_STRNCPY_S(result, len, value, len);
+  }
+#elif KMP_OS_WINDOWS
+  /* We use GetEnvironmentVariable for Windows* OS instead of getenv because the
+     act of loading a DLL on Windows* OS makes any user-set environment
+     variables (i.e. with putenv()) unavailable. getenv() apparently gets a
+     clean copy of the env variables as they existed at the start of the run.
+     JH 12/23/2002 */
+  DWORD rc;
+  // First call only queries the required buffer size.
+  rc = GetEnvironmentVariable(name, NULL, 0);
+  if (!rc) {
+    DWORD error = GetLastError();
+    if (error != ERROR_ENVVAR_NOT_FOUND) {
+      __kmp_fatal(KMP_MSG(CantGetEnvVar, name), KMP_ERR(error), __kmp_msg_null);
+    }
+    // Variable is not found, it's ok, just continue.
+  } else {
+    DWORD len = rc;
+    result = (char *)allocate(len);
+    // The check below tells "empty value" from "failure" by looking at the
+    // last-error code. Clear it first so that a stale code left behind by an
+    // earlier, unrelated call cannot be mistaken for a failure of this one.
+    SetLastError(ERROR_SUCCESS);
+    rc = GetEnvironmentVariable(name, result, len);
+    if (!rc) {
+      // GetEnvironmentVariable() may return 0 if variable is empty.
+      // In such a case GetLastError() returns ERROR_SUCCESS.
+      DWORD error = GetLastError();
+      if (error != ERROR_SUCCESS) {
+        // Unexpected error. The variable should be in the environment,
+        // and buffer should be large enough.
+        __kmp_fatal(KMP_MSG(CantGetEnvVar, name), KMP_ERR(error),
+                    __kmp_msg_null);
+        KMP_INTERNAL_FREE((void *)result);
+        result = NULL;
+      }
+    }
+  }
+#else
+#error Unknown or unsupported OS.
+#endif
+
+  return result;
+
+} // func __kmp_env_get
+
+// TODO: Find and replace all regular free() with __kmp_env_free().
+
+// Release the buffer `*value` points to and reset the caller's pointer to
+// NULL. `value` itself must not be NULL.
+void __kmp_env_free(char const **value) {
+
+  KMP_DEBUG_ASSERT(value != NULL);
+  char *owned = CCAST(char *, *value);
+  *value = NULL;
+  KMP_INTERNAL_FREE(owned);
+
+} // func __kmp_env_free
+
+// Report whether environment variable `name` is set: 1 if it is, 0 if not.
+int __kmp_env_exists(char const *name) {
+
+#if KMP_OS_UNIX
+  return (getenv(name) != NULL) ? 1 : 0;
+#elif KMP_OS_WINDOWS
+  // Size query only; a non-zero answer means the variable is present.
+  if (GetEnvironmentVariable(name, NULL, 0) != 0) {
+    return 1;
+  }
+  // Zero: either the variable is absent (fine) or the query itself failed.
+  DWORD error = GetLastError();
+  if (error != ERROR_ENVVAR_NOT_FOUND) {
+    __kmp_fatal(KMP_MSG(CantGetEnvVar, name), KMP_ERR(error), __kmp_msg_null);
+  }
+  return 0;
+#else
+#error Unknown or unsupported OS.
+#endif
+
+} // func __kmp_env_exists
+
+// Set environment variable `name` to `value`. When `overwrite` is zero an
+// already existing variable is left untouched. Failures are fatal.
+void __kmp_env_set(char const *name, char const *value, int overwrite) {
+
+#if KMP_OS_UNIX
+  if (setenv(name, value, overwrite) != 0) {
+    // According to the original author (--ln) this branch was never observed
+    // to run: exhausting memory through the environment made the whole
+    // system misbehave before setenv() got a chance to report an error.
+    __kmp_fatal(KMP_MSG(CantSetEnvVar, name), KMP_HNT(NotEnoughMemory),
+                __kmp_msg_null);
+  }
+#elif KMP_OS_WINDOWS
+  if (!overwrite) {
+    // Probe for the variable with a size-only query.
+    if (GetEnvironmentVariable(name, NULL, 0)) {
+      // Variable exists, do not overwrite.
+      return;
+    }
+    DWORD error = GetLastError();
+    if (error != ERROR_ENVVAR_NOT_FOUND) {
+      __kmp_fatal(KMP_MSG(CantGetEnvVar, name), KMP_ERR(error), __kmp_msg_null);
+    }
+  }
+  if (!SetEnvironmentVariable(name, value)) {
+    DWORD error = GetLastError();
+    __kmp_fatal(KMP_MSG(CantSetEnvVar, name), KMP_ERR(error), __kmp_msg_null);
+  }
+#else
+#error Unknown or unsupported OS.
+#endif
+
+} // func __kmp_env_set
+
+// Remove environment variable `name` from the process environment.
+void __kmp_env_unset(char const *name) {
+
+#if KMP_OS_UNIX
+  unsetenv(name);
+#elif KMP_OS_WINDOWS
+  // Passing NULL as the value deletes the variable.
+  if (!SetEnvironmentVariable(name, NULL)) {
+    DWORD error = GetLastError();
+    __kmp_fatal(KMP_MSG(CantSetEnvVar, name), KMP_ERR(error), __kmp_msg_null);
+  }
+#else
+#error Unknown or unsupported OS.
+#endif
+
+} // func __kmp_env_unset
+
+/* Intel OpenMP RTL string representation of environment: just a string of
+   characters, variables are separated with vertical bars, e. g.:
+
+        "KMP_WARNINGS=0|KMP_AFFINITY=compact|"
+
+    Empty variables are allowed and ignored:
+
+        "||KMP_WARNINGS=1||"
+*/
+
+// Fill `block` from a '|'-separated list of NAME=VALUE items. The input is
+// copied first; name and value pointers stored in the block refer to that
+// copy, which becomes block->bulk.
+static void
+___kmp_env_blk_parse_string(kmp_env_blk_t *block, // M: Env block to fill.
+                            char const *env // I: String to parse.
+                            ) {
+
+  char const separator = '|';
+  char const separator_str[] = {separator, 0};
+
+  // Work on a private copy, the original string is left unmodified.
+  char *copy = __kmp_str_format("%s", env);
+
+  // N separators delimit at most N + 1 variables, which gives an upper bound
+  // for the size of the table.
+  int max_vars = 1;
+  for (char const *hit = strchr(copy, separator); hit != NULL;
+       hit = strchr(hit + 1, separator)) {
+    ++max_vars;
+  }
+
+  kmp_env_var_t *table =
+      (kmp_env_var_t *)allocate(max_vars * sizeof(kmp_env_var_t));
+
+  // Walk the tokens and record every one of them as a name/value pair.
+  int used = 0; // Number of table entries filled so far.
+  char *state; // Context for __kmp_str_token().
+  for (char *item = __kmp_str_token(copy, separator_str, &state); item != NULL;
+       item = __kmp_str_token(NULL, separator_str, &state)) {
+    char *name; // Name part of the item.
+    char *value; // Value part of the item.
+    __kmp_str_split(item, '=', &name, &value);
+    KMP_DEBUG_ASSERT(used < max_vars);
+    table[used].name = name;
+    table[used].value = value;
+    ++used;
+  }
+
+  // Hand the result over to the caller.
+  block->bulk = copy;
+  block->vars = table;
+  block->count = used;
+}
+
+/* Windows* OS (actually, DOS) environment block is a piece of memory with
+   environment variables. Each variable is terminated with zero byte, entire
+   block is terminated with one extra zero byte, so we have two zero bytes at
+   the end of environment block, e. g.:
+
+        "HOME=C:\\users\\lev\x00OS=Windows_NT\x00\x00"
+
+    It is not clear how empty environment is represented. "\x00\x00"?
+*/
+
+#if KMP_OS_WINDOWS
+// Fill `block` from a Windows* OS style environment block: zero-terminated
+// strings laid out back to back, with an empty string marking the end. A NULL
+// `env` yields an empty block with NULL bulk and vars.
+static void ___kmp_env_blk_parse_windows(
+    kmp_env_blk_t *block, // M: Env block to fill.
+    char const *env // I: Pointer to Windows* OS (DOS) environment block.
+    ) {
+
+  char *copy = NULL; // Private copy of the environment block.
+  kmp_env_var_t *table = NULL; // One entry per variable.
+  int filled = 0; // Number of entries stored in table.
+
+  if (env != NULL) {
+
+    // Pass 1: count the variables and measure the block.
+    int entries = 0;
+    int bytes = 0;
+    char const *scan = env; // The first variable starts at the beginning.
+    for (int len = KMP_STRLEN(scan); len != 0; len = KMP_STRLEN(scan)) {
+      ++entries;
+      bytes += len + 1;
+      scan += len + 1; // Step over the variable and its terminator.
+    }
+    bytes += 1; // Account for the zero byte that terminates the block.
+
+    // Duplicate the block; splitting is done on the copy, not the original.
+    copy = (char *)allocate(bytes);
+    KMP_MEMCPY_S(copy, bytes, env, bytes);
+    table = (kmp_env_var_t *)allocate(entries * sizeof(kmp_env_var_t));
+
+    // Pass 2: split every variable of the copy into name and value. The
+    // length of each variable is taken before it is split.
+    char *cur = copy;
+    for (int len = KMP_STRLEN(cur); len != 0; len = KMP_STRLEN(cur)) {
+      char *name; // Pointer to name of variable.
+      char *value; // Pointer to value.
+      __kmp_str_split(cur, '=', &name, &value);
+      table[filled].name = name;
+      table[filled].value = value;
+      ++filled;
+      cur += len + 1;
+    }
+  }
+
+  // Hand the result over to the caller.
+  block->bulk = copy;
+  block->vars = table;
+  block->count = filled;
+}
+#endif
+
+/* Unix environment block is a array of pointers to variables, last pointer in
+   array is NULL:
+
+        { "HOME=/home/lev", "TERM=xterm", NULL }
+*/
+
+// Fill `block` from a Unix style environment: a NULL-terminated array of
+// "NAME=VALUE" strings. All strings are copied back to back into one buffer
+// (block->bulk); the name and value pointers stored in block->vars refer to
+// that copy. A NULL `env` is treated like an environment without variables.
+static void
+___kmp_env_blk_parse_unix(kmp_env_blk_t *block, // M: Env block to fill.
+                          char **env // I: Unix environment to parse.
+                          ) {
+
+  char *bulk = NULL;
+  kmp_env_var_t *vars = NULL;
+  int count = 0;
+  int size = 0; // Size of bulk.
+
+  // Count number of variables and length of required bulk. The array pointer
+  // itself may be NULL (clearenv() sets `environ` to NULL), in which case
+  // there is nothing to count and it must not be dereferenced.
+  if (env != NULL) {
+    while (env[count] != NULL) {
+      size += KMP_STRLEN(env[count]) + 1;
+      ++count;
+    }
+  }
+
+  // Allocate memory.
+  bulk = (char *)allocate(size);
+  vars = (kmp_env_var_t *)allocate(count * sizeof(kmp_env_var_t));
+
+  // Loop thru all the vars.
+  {
+    char *var; // Pointer to beginning of var.
+    char *name; // Pointer to name of variable.
+    char *value; // Pointer to value.
+    int len; // Length of variable.
+    int room = size; // Bytes of bulk not yet used.
+    int i;
+    var = bulk;
+    for (i = 0; i < count; ++i) {
+      // Copy variable to bulk. The destination capacity is the space that is
+      // still free, not the size of the whole buffer.
+      len = KMP_STRLEN(env[i]);
+      KMP_MEMCPY_S(var, room, env[i], len + 1);
+      // Save found variable in vars array.
+      __kmp_str_split(var, '=', &name, &value);
+      vars[i].name = name;
+      vars[i].value = value;
+      // Move pointer.
+      var += len + 1;
+      room -= len + 1;
+    }
+  }
+
+  // Fill out result.
+  block->bulk = bulk;
+  block->vars = vars;
+  block->count = count;
+}
+
+// Initialize `block` either from the '|'-separated string `bulk` or, when
+// `bulk` is NULL, from the environment of the current process.
+void __kmp_env_blk_init(kmp_env_blk_t *block, // M: Block to initialize.
+                        char const *bulk // I: Initialization string, or NULL.
+                        ) {
+
+  if (bulk != NULL) {
+    // An explicit string takes precedence over the process environment.
+    ___kmp_env_blk_parse_string(block, bulk);
+    return;
+  }
+
+#if KMP_OS_UNIX
+  ___kmp_env_blk_parse_unix(block, environ);
+#elif KMP_OS_WINDOWS
+  char *snapshot = GetEnvironmentStrings();
+  if (snapshot == NULL) {
+    DWORD error = GetLastError();
+    __kmp_fatal(KMP_MSG(CantGetEnvironment), KMP_ERR(error), __kmp_msg_null);
+  }
+  // The parser makes its own copy, so the snapshot can be released at once.
+  ___kmp_env_blk_parse_windows(block, snapshot);
+  FreeEnvironmentStrings(snapshot);
+#else
+#error Unknown or unsupported OS.
+#endif
+
+} // __kmp_env_blk_init
+
+// Comparison function for qsort(): orders kmp_env_var_t entries by name.
+// It has exactly the signature qsort() expects, so it is called through a
+// pointer of its own type; calling a function through a pointer to an
+// incompatible function type is undefined behavior.
+static int ___kmp_env_var_cmp(void const *lhs, void const *rhs) {
+  kmp_env_var_t const *left = (kmp_env_var_t const *)lhs;
+  kmp_env_var_t const *right = (kmp_env_var_t const *)rhs;
+  return strcmp(left->name, right->name);
+}
+
+// Sort the variables of `block` in place by name (strcmp() order).
+void __kmp_env_blk_sort(
+    kmp_env_blk_t *block // M: Block of environment variables to sort.
+    ) {
+
+  qsort(CCAST(kmp_env_var_t *, block->vars), block->count,
+        sizeof(kmp_env_var_t), ___kmp_env_var_cmp);
+
+} // __kmp_env_blk_sort
+
+// Release the storage owned by `block` (the bulk buffer and the vars array)
+// and leave the block empty.
+void __kmp_env_blk_free(
+    kmp_env_blk_t *block // M: Block of environment variables to free.
+    ) {
+
+  // Text buffer first, then the table of name/value pointers.
+  __kmp_str_free(&(block->bulk));
+  KMP_INTERNAL_FREE(CCAST(kmp_env_var_t *, block->vars));
+
+  block->vars = NULL;
+  block->count = 0;
+
+} // __kmp_env_blk_free
+
+// Linear search of `block` for a variable called `name`; the first entry
+// whose name compares equal wins.
+char const * // R: Value of variable or NULL if variable does not exist.
+    __kmp_env_blk_var(
+        kmp_env_blk_t *block, // I: Block of environment variables.
+        char const *name // I: Name of variable to find.
+        ) {
+
+  int idx = 0;
+  while (idx < block->count) {
+    kmp_env_var_t const &entry = block->vars[idx];
+    if (strcmp(entry.name, name) == 0) {
+      return entry.value;
+    }
+    ++idx;
+  }
+  return NULL; // No variable with that name.
+
+} // __kmp_env_blk_var
+
+// end of file //
diff --git a/final/runtime/src/kmp_environment.h b/final/runtime/src/kmp_environment.h
new file mode 100644
index 0000000..76a9672
--- /dev/null
+++ b/final/runtime/src/kmp_environment.h
@@ -0,0 +1,77 @@
+/*
+ * kmp_environment.h -- Handle environment variables OS-independently.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_ENVIRONMENT_H
+#define KMP_ENVIRONMENT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Return a copy of the value of environment variable or NULL if the variable
+// does not exist.
+// *Note*: Returned pointer *must* be freed after use with __kmp_env_free().
+char *__kmp_env_get(char const *name);
+void __kmp_env_free(char const **value);
+
+// Return 1 if the environment variable exists or 0 if does not exist.
+int __kmp_env_exists(char const *name);
+
+// Set the environment variable.
+void __kmp_env_set(char const *name, char const *value, int overwrite);
+
+// Unset (remove) environment variable.
+void __kmp_env_unset(char const *name);
+
+// -----------------------------------------------------------------------------
+//  Working with environment blocks.
+
+/* kmp_env_blk_t is read-only collection of environment variables (or
+   environment-like). Usage:
+
+kmp_env_blk_t block;
+__kmp_env_blk_init( & block, NULL ); // Initialize block from process
+                                        // environment.
+// or
+__kmp_env_blk_init( & block, "KMP_WARNING=1|KMP_AFFINITY=none" ); // from string
+__kmp_env_blk_sort( & block ); // Optionally, sort list.
+for ( i = 0; i < block.count; ++ i ) {
+    // Process block.vars[ i ].name and block.vars[ i ].value...
+}
+__kmp_env_blk_free( & block );
+*/
+
+// One environment variable as a name/value pair. Both pointers are filled in
+// by __kmp_str_split() while a block is being parsed.
+struct __kmp_env_var {
+  char *name; // Name of the variable.
+  char *value; // Value of the variable.
+};
+typedef struct __kmp_env_var kmp_env_var_t;
+
+// A collection of environment variables. Set up by __kmp_env_blk_init() and
+// released by __kmp_env_blk_free().
+struct __kmp_env_blk {
+  char *bulk; // Buffer holding the text of the variables.
+  kmp_env_var_t *vars; // Array of name/value pairs.
+  int count; // Number of used elements in vars.
+};
+typedef struct __kmp_env_blk kmp_env_blk_t;
+
+void __kmp_env_blk_init(kmp_env_blk_t *block, char const *bulk);
+void __kmp_env_blk_free(kmp_env_blk_t *block);
+void __kmp_env_blk_sort(kmp_env_blk_t *block);
+char const *__kmp_env_blk_var(kmp_env_blk_t *block, char const *name);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // KMP_ENVIRONMENT_H
+
+// end of file //
diff --git a/final/runtime/src/kmp_error.cpp b/final/runtime/src/kmp_error.cpp
new file mode 100644
index 0000000..b30b26e
--- /dev/null
+++ b/final/runtime/src/kmp_error.cpp
@@ -0,0 +1,448 @@
+/*
+ * kmp_error.cpp -- KPTS functions for error checking at runtime
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+#include "kmp_error.h"
+#include "kmp_i18n.h"
+#include "kmp_str.h"
+
+/* ------------------------------------------------------------------------ */
+
+#define MIN_STACK 100
+
+// Human-readable construct names used in traces and error messages. The table
+// is indexed with a construct type value (see PUSH_MSG, POP_MSG and
+// __kmp_pragma), so the order of the entries is significant.
+// NOTE(review): presumably the order mirrors enum cons_type, which is declared
+// outside this file -- confirm before adding or reordering entries.
+static char const *cons_text_c[] = {
+    "(none)", "\"parallel\"", "work-sharing", /* this is not called "for"
+                                                 because of lowering of
+                                                 "sections" pragmas */
+    "\"ordered\" work-sharing", /* this is not called "for ordered" because of
+                                   lowering of "sections" pragmas */
+    "\"sections\"",
+    "work-sharing", /* this is not called "single" because of lowering of
+                       "sections" pragmas */
+    "\"critical\"", "\"ordered\"", /* in PARALLEL */
+    "\"ordered\"", /* in PDO */
+    "\"master\"", "\"reduce\"", "\"barrier\""};
+
+// Source location string (psource) of `ident`, or NULL if `ident` is NULL.
+#define get_src(ident) ((ident) == NULL ? NULL : (ident)->psource)
+
+// Format string and arguments for tracing a push of construct type `ct`.
+#define PUSH_MSG(ct, ident)                                                    \
+  "\tpushing on stack: %s (%s)\n", cons_text_c[(ct)], get_src((ident))
+// Format string and arguments for tracing a pop from construct stack `p`.
+// The expansion refers to a variable named `tos` (index of the entry being
+// popped), which has to exist wherever the macro is used.
+#define POP_MSG(p)                                                             \
+  "\tpopping off stack: %s (%s)\n", cons_text_c[(p)->stack_data[tos].type],    \
+      get_src((p)->stack_data[tos].ident)
+
+// Number of entries in cons_text_c.
+static int const cons_text_c_num = sizeof(cons_text_c) / sizeof(char const *);
+
+/* --------------- START OF STATIC LOCAL ROUTINES ------------------------- */
+
+// Intentionally empty. It is called from several routines in this file on
+// unusual conditions (e.g. a negative gtid or a mismatched pop).
+// NOTE(review): presumably a convenient place for a debugger breakpoint --
+// confirm.
+static void __kmp_check_null_func(void) { /* nothing to do */
+}
+
+// Grow the construct stack of `p`. The capacity becomes twice the old one
+// plus 100 entries; entries 0..stack_top are copied into the new array and
+// the old array is released.
+static void __kmp_expand_cons_stack(int gtid, struct cons_header *p) {
+  int i;
+  struct cons_data *d;
+
+  /* TODO for monitor perhaps? */
+  if (gtid < 0)
+    __kmp_check_null_func();
+
+  KE_TRACE(10, ("expand cons_stack (%d %d)\n", gtid, __kmp_get_gtid()));
+
+  d = p->stack_data; // Remember the old array until its contents are copied.
+
+  p->stack_size = (p->stack_size * 2) + 100;
+
+  // One slot more than stack_size, as in __kmp_allocate_cons_stack().
+  p->stack_data = (struct cons_data *)__kmp_allocate(sizeof(struct cons_data) *
+                                                     (p->stack_size + 1));
+
+  for (i = p->stack_top; i >= 0; --i)
+    p->stack_data[i] = d[i];
+
+  // The old array came from __kmp_allocate() and nothing refers to it any
+  // more; without this call it would be leaked on every expansion.
+  __kmp_free(d);
+}
+
+// Build a description of construct `ct` at source location `ident`, formatted
+// with the kmp_i18n_fmt_Pragma message.
+// NOTE: The returned string is allocated, the caller must free it!
+static char *__kmp_pragma(int ct, ident_t const *ident) {
+  char const *cons_name = NULL; // Construct name.
+  char *src_file = NULL; // File name.
+  char *src_func = NULL; // Function (routine) name.
+  char *src_line = NULL; // Line number.
+  kmp_str_buf_t scratch;
+  __kmp_str_buf_init(&scratch);
+
+  // Only indices 1 .. cons_text_c_num-1 are accepted here.
+  bool const known = (0 < ct) && (ct < cons_text_c_num);
+  if (!known) {
+    KMP_DEBUG_ASSERT(0);
+  } else {
+    cons_name = cons_text_c[ct];
+  }
+
+  if (ident != NULL && ident->psource != NULL) {
+    // Take a private copy of the source string and cut it at the semicolons:
+    // the first field is dropped, the next three are file, routine and line.
+    __kmp_str_buf_print(&scratch, "%s", ident->psource);
+    char *rest = scratch.str;
+    __kmp_str_split(rest, ';', NULL, &rest);
+    __kmp_str_split(rest, ';', &src_file, &rest);
+    __kmp_str_split(rest, ';', &src_func, &rest);
+    __kmp_str_split(rest, ';', &src_line, &rest);
+  }
+
+  kmp_msg_t formatted = __kmp_msg_format(kmp_i18n_fmt_Pragma, cons_name,
+                                         src_file, src_func, src_line);
+  __kmp_str_buf_free(&scratch);
+  return formatted.str;
+} // __kmp_pragma
+
+/* ----------------- END OF STATIC LOCAL ROUTINES ------------------------- */
+
+// Report message `id` about a single construct through __kmp_fatal().
+void __kmp_error_construct(kmp_i18n_id_t id, // Message identifier.
+                           enum cons_type ct, // Construct type.
+                           ident_t const *ident // Construct ident.
+                           ) {
+  char *text = __kmp_pragma(ct, ident); // Description of the construct.
+  kmp_msg_t message = __kmp_msg_format(id, text);
+  __kmp_fatal(message, __kmp_msg_null);
+  KMP_INTERNAL_FREE(text);
+}
+
+// Report message `id` about a pair of constructs through __kmp_fatal().
+void __kmp_error_construct2(kmp_i18n_id_t id, // Message identifier.
+                            enum cons_type ct, // First construct type.
+                            ident_t const *ident, // First construct ident.
+                            struct cons_data const *cons // Second construct.
+                            ) {
+  char *first = __kmp_pragma(ct, ident);
+  char *second = __kmp_pragma(cons->type, cons->ident);
+  kmp_msg_t message = __kmp_msg_format(id, first, second);
+  __kmp_fatal(message, __kmp_msg_null);
+  KMP_INTERNAL_FREE(first);
+  KMP_INTERNAL_FREE(second);
+}
+
+// Create a construct stack with room for MIN_STACK entries. Entry 0 is
+// initialized as a ct_none placeholder and all "top" indices start at 0.
+struct cons_header *__kmp_allocate_cons_stack(int gtid) {
+  /* TODO for monitor perhaps? */
+  if (gtid < 0) {
+    __kmp_check_null_func();
+  }
+  KE_TRACE(10, ("allocate cons_stack (%d)\n", gtid));
+
+  struct cons_header *hdr =
+      (struct cons_header *)__kmp_allocate(sizeof(struct cons_header));
+  hdr->stack_data = (struct cons_data *)__kmp_allocate(
+      sizeof(struct cons_data) * (MIN_STACK + 1));
+  hdr->stack_size = MIN_STACK;
+
+  // Nothing has been pushed yet.
+  hdr->stack_top = 0;
+  hdr->p_top = 0;
+  hdr->w_top = 0;
+  hdr->s_top = 0;
+
+  // Placeholder entry at the bottom of the stack.
+  struct cons_data *bottom = &hdr->stack_data[0];
+  bottom->type = ct_none;
+  bottom->prev = 0;
+  bottom->ident = NULL;
+  return hdr;
+}
+
+// Release a construct stack (the entry array and the header). A NULL `ptr`
+// is accepted and ignored.
+void __kmp_free_cons_stack(void *ptr) {
+  struct cons_header *hdr = (struct cons_header *)ptr;
+  if (hdr == NULL) {
+    return;
+  }
+  if (hdr->stack_data != NULL) {
+    __kmp_free(hdr->stack_data);
+    hdr->stack_data = NULL;
+  }
+  __kmp_free(hdr);
+}
+
+#if KMP_DEBUG
+// Debug helper: print the construct stack of thread `gtid`, topmost entry
+// first, framed by separator lines. Entry 0 is not printed.
+static void dump_cons_stack(int gtid, struct cons_header *p) {
+  int const top = p->stack_top;
+  kmp_str_buf_t out;
+  __kmp_str_buf_init(&out);
+
+  // Header.
+  __kmp_str_buf_print(
+      &out,
+      "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-\n");
+  __kmp_str_buf_print(&out,
+                      "Begin construct stack with %d items for thread %d\n",
+                      top, gtid);
+  __kmp_str_buf_print(&out, "     stack_top=%d { P=%d, W=%d, S=%d }\n", top,
+                      p->p_top, p->w_top, p->s_top);
+
+  // One line per entry, from the top of the stack down to entry 1.
+  int level = top;
+  while (level > 0) {
+    struct cons_data const &entry = p->stack_data[level];
+    __kmp_str_buf_print(
+        &out, "        stack_data[%2d] = { %s (%s) %d %p }\n", level,
+        cons_text_c[entry.type], get_src(entry.ident), entry.prev, entry.name);
+    --level;
+  }
+
+  // Footer.
+  __kmp_str_buf_print(&out, "End construct stack for thread %d\n", gtid);
+  __kmp_str_buf_print(
+      &out,
+      "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-\n");
+  __kmp_debug_printf("%s", out.str);
+  __kmp_str_buf_free(&out);
+}
+#endif
+
+// Push a ct_parallel entry on the construct stack of thread `gtid` and make
+// it the innermost parallel construct (p_top).
+void __kmp_push_parallel(int gtid, ident_t const *ident) {
+  struct cons_header *stack = __kmp_threads[gtid]->th.th_cons;
+
+  KMP_DEBUG_ASSERT(__kmp_threads[gtid]->th.th_cons);
+  KE_TRACE(10, ("__kmp_push_parallel (%d %d)\n", gtid, __kmp_get_gtid()));
+  KE_TRACE(100, (PUSH_MSG(ct_parallel, ident)));
+
+  // Make room first if the stack is full.
+  if (stack->stack_top >= stack->stack_size) {
+    __kmp_expand_cons_stack(gtid, stack);
+  }
+
+  int const slot = stack->stack_top + 1;
+  stack->stack_top = slot;
+
+  // The new entry remembers the previous innermost parallel construct.
+  struct cons_data *entry = &stack->stack_data[slot];
+  entry->type = ct_parallel;
+  entry->prev = stack->p_top;
+  entry->ident = ident;
+  entry->name = NULL;
+  stack->p_top = slot;
+
+  KE_DUMP(1000, dump_cons_stack(gtid, stack));
+}
+
+// Check that a worksharing construct `ct` may start here: the current
+// parallel region must not already contain an open worksharing or sync
+// construct. The stack is grown beforehand if it is full.
+void __kmp_check_workshare(int gtid, enum cons_type ct, ident_t const *ident) {
+  struct cons_header *stack = __kmp_threads[gtid]->th.th_cons;
+
+  KMP_DEBUG_ASSERT(__kmp_threads[gtid]->th.th_cons);
+  KE_TRACE(10, ("__kmp_check_workshare (%d %d)\n", gtid, __kmp_get_gtid()));
+
+  if (stack->stack_top >= stack->stack_size) {
+    __kmp_expand_cons_stack(gtid, stack);
+  }
+
+  int const parallel_top = stack->p_top;
+  if (stack->w_top > parallel_top) {
+    // A WORKSHARE construct is already open in this PARALLEL region.
+    __kmp_error_construct2(kmp_i18n_msg_CnsInvalidNesting, ct, ident,
+                           &stack->stack_data[stack->w_top]);
+  }
+  if (stack->s_top > parallel_top) {
+    // A SYNC construct is already open in this PARALLEL region.
+    __kmp_error_construct2(kmp_i18n_msg_CnsInvalidNesting, ct, ident,
+                           &stack->stack_data[stack->s_top]);
+  }
+}
+
+// Validate and push a worksharing construct of type `ct`; the new entry
+// becomes the innermost worksharing construct (w_top).
+void __kmp_push_workshare(int gtid, enum cons_type ct, ident_t const *ident) {
+  struct cons_header *stack = __kmp_threads[gtid]->th.th_cons;
+  KE_TRACE(10, ("__kmp_push_workshare (%d %d)\n", gtid, __kmp_get_gtid()));
+
+  // Checks the nesting and grows the stack if it is full.
+  __kmp_check_workshare(gtid, ct, ident);
+  KE_TRACE(100, (PUSH_MSG(ct, ident)));
+
+  int const slot = stack->stack_top + 1;
+  stack->stack_top = slot;
+
+  struct cons_data *entry = &stack->stack_data[slot];
+  entry->type = ct;
+  entry->prev = stack->w_top; // Previous innermost worksharing construct.
+  entry->ident = ident;
+  entry->name = NULL;
+  stack->w_top = slot;
+
+  KE_DUMP(1000, dump_cons_stack(gtid, stack));
+}
+
+// Check that synchronization construct `ct` may start at `ident`, given the
+// construct stack of thread `gtid`. Violations are reported through
+// __kmp_error_construct() / __kmp_error_construct2(). The stack is grown
+// beforehand if it is full. `lck` is the lock of a critical construct; with
+// KMP_USE_DYNAMIC_LOCK the extra argument `seq` is passed on to
+// __kmp_get_user_lock_owner().
+//
+// Rules implemented below:
+//  - ordered (ct_ordered_in_parallel, ct_ordered_in_pdo): must be inside a
+//    worksharing construct whose type satisfies IS_CONS_TYPE_ORDERED, and
+//    must not be nested in certain sync constructs (see the condition).
+//  - critical: an error is reported if this thread already owns `lck`.
+//  - master and reduce: must not be inside a worksharing construct; reduce
+//    must also not be inside another sync construct.
+void
+#if KMP_USE_DYNAMIC_LOCK
+__kmp_check_sync( int gtid, enum cons_type ct, ident_t const * ident, kmp_user_lock_p lck, kmp_uint32 seq )
+#else
+__kmp_check_sync( int gtid, enum cons_type ct, ident_t const * ident, kmp_user_lock_p lck )
+#endif
+{
+  struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
+
+  KE_TRACE(10, ("__kmp_check_sync (gtid=%d)\n", __kmp_get_gtid()));
+
+  // Make sure there is room for the entry a following push will add.
+  if (p->stack_top >= p->stack_size)
+    __kmp_expand_cons_stack(gtid, p);
+
+  if (ct == ct_ordered_in_parallel || ct == ct_ordered_in_pdo) {
+    if (p->w_top <= p->p_top) {
+/* we are not in a worksharing construct */
+#ifdef BUILD_PARALLEL_ORDERED
+      /* do not report error messages for PARALLEL ORDERED */
+      KMP_ASSERT(ct == ct_ordered_in_parallel);
+#else
+      __kmp_error_construct(kmp_i18n_msg_CnsBoundToWorksharing, ct, ident);
+#endif /* BUILD_PARALLEL_ORDERED */
+    } else {
+      /* inside a WORKSHARING construct for this PARALLEL region */
+      if (!IS_CONS_TYPE_ORDERED(p->stack_data[p->w_top].type)) {
+        __kmp_error_construct2(kmp_i18n_msg_CnsNoOrderedClause, ct, ident,
+                               &p->stack_data[p->w_top]);
+      }
+    }
+    if (p->s_top > p->p_top && p->s_top > p->w_top) {
+      /* inside a sync construct which is inside a worksharing construct */
+      int index = p->s_top;
+      enum cons_type stack_type;
+
+      stack_type = p->stack_data[index].type;
+
+      // Only the innermost sync construct (s_top) is examined here.
+      if (stack_type == ct_critical ||
+          ((stack_type == ct_ordered_in_parallel ||
+            stack_type == ct_ordered_in_pdo) &&
+           /* C doesn't allow named ordered; ordered in ordered gets error */
+           p->stack_data[index].ident != NULL &&
+           (p->stack_data[index].ident->flags & KMP_IDENT_KMPC))) {
+        /* we are in ORDERED which is inside an ORDERED or CRITICAL construct */
+        __kmp_error_construct2(kmp_i18n_msg_CnsInvalidNesting, ct, ident,
+                               &p->stack_data[index]);
+      }
+    }
+  } else if (ct == ct_critical) {
+// Both preprocessor branches below open the same `if (...) {`; the single
+// closing brace follows after the shared body.
+#if KMP_USE_DYNAMIC_LOCK
+    if (lck != NULL &&
+        __kmp_get_user_lock_owner(lck, seq) ==
+            gtid) { /* this thread already has lock for this critical section */
+#else
+    if (lck != NULL &&
+        __kmp_get_user_lock_owner(lck) ==
+            gtid) { /* this thread already has lock for this critical section */
+#endif
+      int index = p->s_top;
+      // Fallback description used when no matching entry is on the stack.
+      struct cons_data cons = {NULL, ct_critical, 0, NULL};
+      /* walk up construct stack and try to find critical with matching name */
+      while (index != 0 && p->stack_data[index].name != lck) {
+        index = p->stack_data[index].prev;
+      }
+      if (index != 0) {
+        /* found match on the stack (may not always because of interleaved
+         * critical for Fortran) */
+        cons = p->stack_data[index];
+      }
+      /* we are in CRITICAL which is inside a CRITICAL construct of same name */
+      __kmp_error_construct2(kmp_i18n_msg_CnsNestingSameName, ct, ident, &cons);
+    }
+  } else if (ct == ct_master || ct == ct_reduce) {
+    if (p->w_top > p->p_top) {
+      /* inside a WORKSHARING construct for this PARALLEL region */
+      __kmp_error_construct2(kmp_i18n_msg_CnsInvalidNesting, ct, ident,
+                             &p->stack_data[p->w_top]);
+    }
+    if (ct == ct_reduce && p->s_top > p->p_top) {
+      /* inside a another SYNC construct for this PARALLEL region */
+      __kmp_error_construct2(kmp_i18n_msg_CnsInvalidNesting, ct, ident,
+                             &p->stack_data[p->s_top]);
+    }
+  }
+}
+
+// Validate and push a synchronization construct of type `ct`; the new entry
+// stores `lck` as its name and becomes the innermost sync construct (s_top).
+void
+#if KMP_USE_DYNAMIC_LOCK
+__kmp_push_sync( int gtid, enum cons_type ct, ident_t const * ident, kmp_user_lock_p lck, kmp_uint32 seq )
+#else
+__kmp_push_sync( int gtid, enum cons_type ct, ident_t const * ident, kmp_user_lock_p lck )
+#endif
+{
+  struct cons_header *stack = __kmp_threads[gtid]->th.th_cons;
+
+  KMP_ASSERT(gtid == __kmp_get_gtid());
+  KE_TRACE(10, ("__kmp_push_sync (gtid=%d)\n", gtid));
+
+  // Checks the nesting and grows the stack if it is full.
+#if KMP_USE_DYNAMIC_LOCK
+  __kmp_check_sync(gtid, ct, ident, lck, seq);
+#else
+  __kmp_check_sync(gtid, ct, ident, lck);
+#endif
+  KE_TRACE(100, (PUSH_MSG(ct, ident)));
+
+  int const slot = stack->stack_top + 1;
+  stack->stack_top = slot;
+
+  struct cons_data *entry = &stack->stack_data[slot];
+  entry->type = ct;
+  entry->prev = stack->s_top; // Previous innermost sync construct.
+  entry->ident = ident;
+  entry->name = lck;
+  stack->s_top = slot;
+
+  KE_DUMP(1000, dump_cons_stack(gtid, stack));
+}
+
+/* ------------------------------------------------------------------------ */
+
+// Pop the parallel construct from the top of the stack of thread `gtid`.
+// An empty stack, or a top entry that is not the innermost parallel
+// construct, is reported as an error.
+void __kmp_pop_parallel(int gtid, ident_t const *ident) {
+  struct cons_header *stack = __kmp_threads[gtid]->th.th_cons;
+  int tos = stack->stack_top; // POP_MSG() refers to this variable by name.
+
+  KE_TRACE(10, ("__kmp_pop_parallel (%d %d)\n", gtid, __kmp_get_gtid()));
+  if (tos == 0 || stack->p_top == 0) {
+    __kmp_error_construct(kmp_i18n_msg_CnsDetectedEnd, ct_parallel, ident);
+  }
+  if (tos != stack->p_top || stack->stack_data[tos].type != ct_parallel) {
+    __kmp_error_construct2(kmp_i18n_msg_CnsExpectedEnd, ct_parallel, ident,
+                           &stack->stack_data[tos]);
+  }
+  KE_TRACE(100, (POP_MSG(stack)));
+
+  // Restore the enclosing parallel construct and clear the popped entry.
+  struct cons_data *top = &stack->stack_data[tos];
+  stack->p_top = top->prev;
+  top->type = ct_none;
+  top->ident = NULL;
+  stack->stack_top = tos - 1;
+
+  KE_DUMP(1000, dump_cons_stack(gtid, stack));
+}
+
+// Pop the worksharing construct `ct` from the top of the stack of thread
+// `gtid`. Returns the type of the entry that w_top refers to afterwards.
+enum cons_type __kmp_pop_workshare(int gtid, enum cons_type ct,
+                                   ident_t const *ident) {
+  struct cons_header *stack = __kmp_threads[gtid]->th.th_cons;
+  int tos = stack->stack_top; // POP_MSG() refers to this variable by name.
+
+  KE_TRACE(10, ("__kmp_pop_workshare (%d %d)\n", gtid, __kmp_get_gtid()));
+  if (tos == 0 || stack->w_top == 0) {
+    __kmp_error_construct(kmp_i18n_msg_CnsDetectedEnd, ct, ident);
+  }
+
+  // The types have to match, with one exception: an entry pushed as
+  // ct_pdo_ordered may be popped as ct_pdo.
+  enum cons_type const top_type = stack->stack_data[tos].type;
+  bool const type_ok =
+      (top_type == ct) || (top_type == ct_pdo_ordered && ct == ct_pdo);
+  if (tos != stack->w_top || !type_ok) {
+    __kmp_check_null_func();
+    __kmp_error_construct2(kmp_i18n_msg_CnsExpectedEnd, ct, ident,
+                           &stack->stack_data[tos]);
+  }
+  KE_TRACE(100, (POP_MSG(stack)));
+
+  // Restore the enclosing worksharing construct and clear the popped entry.
+  struct cons_data *top = &stack->stack_data[tos];
+  stack->w_top = top->prev;
+  top->type = ct_none;
+  top->ident = NULL;
+  stack->stack_top = tos - 1;
+
+  KE_DUMP(1000, dump_cons_stack(gtid, stack));
+  return stack->stack_data[stack->w_top].type;
+}
+
+// Pop the synchronization construct `ct` from the top of the stack of thread
+// `gtid`. An empty stack, or a top entry that is not the innermost sync
+// construct of that type, is reported as an error.
+void __kmp_pop_sync(int gtid, enum cons_type ct, ident_t const *ident) {
+  struct cons_header *stack = __kmp_threads[gtid]->th.th_cons;
+  int tos = stack->stack_top; // POP_MSG() refers to this variable by name.
+
+  KE_TRACE(10, ("__kmp_pop_sync (%d %d)\n", gtid, __kmp_get_gtid()));
+  if (tos == 0 || stack->s_top == 0) {
+    __kmp_error_construct(kmp_i18n_msg_CnsDetectedEnd, ct, ident);
+  }
+  if (tos != stack->s_top || stack->stack_data[tos].type != ct) {
+    __kmp_check_null_func();
+    __kmp_error_construct2(kmp_i18n_msg_CnsExpectedEnd, ct, ident,
+                           &stack->stack_data[tos]);
+  }
+  if (gtid < 0) {
+    __kmp_check_null_func();
+  }
+  KE_TRACE(100, (POP_MSG(stack)));
+
+  // Restore the enclosing sync construct and clear the popped entry.
+  struct cons_data *top = &stack->stack_data[tos];
+  stack->s_top = top->prev;
+  top->type = ct_none;
+  top->ident = NULL;
+  stack->stack_top = tos - 1;
+
+  KE_DUMP(1000, dump_cons_stack(gtid, stack));
+}
+
+/* ------------------------------------------------------------------------ */
+
+// Check that construct `ct` may appear here: the current parallel region
+// must not have an open worksharing or sync construct.
+void __kmp_check_barrier(int gtid, enum cons_type ct, ident_t const *ident) {
+  struct cons_header *stack = __kmp_threads[gtid]->th.th_cons;
+  KE_TRACE(10, ("__kmp_check_barrier (loc: %p, gtid: %d %d)\n", ident, gtid,
+                __kmp_get_gtid()));
+  if (ident != 0) {
+    __kmp_check_null_func();
+  }
+
+  int const parallel_top = stack->p_top;
+  if (stack->w_top > parallel_top) {
+    /* a WORKSHARING construct is open in this PARALLEL region */
+    __kmp_error_construct2(kmp_i18n_msg_CnsInvalidNesting, ct, ident,
+                           &stack->stack_data[stack->w_top]);
+  }
+  if (stack->s_top > parallel_top) {
+    /* a SYNC construct is open in this PARALLEL region */
+    __kmp_error_construct2(kmp_i18n_msg_CnsInvalidNesting, ct, ident,
+                           &stack->stack_data[stack->s_top]);
+  }
+}
diff --git a/final/runtime/src/kmp_error.h b/final/runtime/src/kmp_error.h
new file mode 100644
index 0000000..fe6fd34
--- /dev/null
+++ b/final/runtime/src/kmp_error.h
@@ -0,0 +1,60 @@
+/*
+ * kmp_error.h -- PTS functions for error checking at runtime.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_ERROR_H
+#define KMP_ERROR_H
+
+#include "kmp_i18n.h"
+
+/* ------------------------------------------------------------------------ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void __kmp_error_construct(kmp_i18n_id_t id, enum cons_type ct,
+                           ident_t const *ident);
+void __kmp_error_construct2(kmp_i18n_id_t id, enum cons_type ct,
+                            ident_t const *ident, struct cons_data const *cons);
+
+struct cons_header *__kmp_allocate_cons_stack(int gtid);
+void __kmp_free_cons_stack(void *ptr);
+
+void __kmp_push_parallel(int gtid, ident_t const *ident);
+void __kmp_push_workshare(int gtid, enum cons_type ct, ident_t const *ident);
+#if KMP_USE_DYNAMIC_LOCK
+void __kmp_push_sync(int gtid, enum cons_type ct, ident_t const *ident,
+                     kmp_user_lock_p name, kmp_uint32);
+#else
+void __kmp_push_sync(int gtid, enum cons_type ct, ident_t const *ident,
+                     kmp_user_lock_p name);
+#endif
+
+void __kmp_check_workshare(int gtid, enum cons_type ct, ident_t const *ident);
+#if KMP_USE_DYNAMIC_LOCK
+void __kmp_check_sync(int gtid, enum cons_type ct, ident_t const *ident,
+                      kmp_user_lock_p name, kmp_uint32);
+#else
+void __kmp_check_sync(int gtid, enum cons_type ct, ident_t const *ident,
+                      kmp_user_lock_p name);
+#endif
+
+void __kmp_pop_parallel(int gtid, ident_t const *ident);
+enum cons_type __kmp_pop_workshare(int gtid, enum cons_type ct,
+                                   ident_t const *ident);
+void __kmp_pop_sync(int gtid, enum cons_type ct, ident_t const *ident);
+void __kmp_check_barrier(int gtid, enum cons_type ct, ident_t const *ident);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // KMP_ERROR_H
diff --git a/final/runtime/src/kmp_ftn_cdecl.cpp b/final/runtime/src/kmp_ftn_cdecl.cpp
new file mode 100644
index 0000000..cf1d429
--- /dev/null
+++ b/final/runtime/src/kmp_ftn_cdecl.cpp
@@ -0,0 +1,34 @@
+/*
+ * kmp_ftn_cdecl.cpp -- Fortran __cdecl linkage support for OpenMP.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+#include "kmp_affinity.h"
+
+#if KMP_OS_WINDOWS
+#if defined KMP_WIN_CDECL || !KMP_DYNAMIC_LIB
+#define KMP_FTN_ENTRIES KMP_FTN_UPPER
+#endif
+#elif KMP_OS_UNIX
+#define KMP_FTN_ENTRIES KMP_FTN_PLAIN
+#endif
+
+// Note: This string is not printed when KMP_VERSION=1.
+char const __kmp_version_ftncdecl[] =
+    KMP_VERSION_PREFIX "Fortran __cdecl OMP support: "
+#ifdef KMP_FTN_ENTRIES
+                       "yes";
+#define FTN_STDCALL /* no stdcall */
+#include "kmp_ftn_os.h"
+#include "kmp_ftn_entry.h"
+#else
+                       "no";
+#endif /* KMP_FTN_ENTRIES */
diff --git a/final/runtime/src/kmp_ftn_entry.h b/final/runtime/src/kmp_ftn_entry.h
new file mode 100644
index 0000000..e480e01
--- /dev/null
+++ b/final/runtime/src/kmp_ftn_entry.h
@@ -0,0 +1,1481 @@
+/*
+ * kmp_ftn_entry.h -- Fortran entry linkage support for OpenMP.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FTN_STDCALL
+#error The support file kmp_ftn_entry.h should not be compiled by itself.
+#endif
+
+#ifdef KMP_STUB
+#include "kmp_stub.h"
+#endif
+
+#include "kmp_i18n.h"
+
+// For affinity format functions
+#include "kmp_io.h"
+#include "kmp_str.h"
+
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/* For compatibility with the GNU/MS OpenMP codegen, omp_set_num_threads(),
+ * omp_set_nested(), and omp_set_dynamic() [in lowercase on MS, and w/o
+ * a trailing underscore on Linux* OS] take call by value integer arguments.
+ * + omp_set_max_active_levels()
+ * + omp_set_schedule()
+ *
+ * For backward compatibility with 9.1 and previous Intel compiler, these
+ * entry points take call by reference integer arguments. */
+#ifdef KMP_GOMP_COMPAT
+#if (KMP_FTN_ENTRIES == KMP_FTN_PLAIN) || (KMP_FTN_ENTRIES == KMP_FTN_UPPER)
+#define PASS_ARGS_BY_VALUE 1
+#endif
+#endif
+#if KMP_OS_WINDOWS
+#if (KMP_FTN_ENTRIES == KMP_FTN_PLAIN) || (KMP_FTN_ENTRIES == KMP_FTN_APPEND)
+#define PASS_ARGS_BY_VALUE 1
+#endif
+#endif
+
+// This macro helps to reduce code duplication.
+#ifdef PASS_ARGS_BY_VALUE
+#define KMP_DEREF
+#else
+#define KMP_DEREF *
+#endif
+
+void FTN_STDCALL FTN_SET_STACKSIZE(int KMP_DEREF arg) {
+#ifdef KMP_STUB
+  __kmps_set_stacksize(KMP_DEREF arg);
+#else
+  // __kmp_aux_set_stacksize initializes the library if needed
+  __kmp_aux_set_stacksize((size_t)KMP_DEREF arg);
+#endif
+}
+
+void FTN_STDCALL FTN_SET_STACKSIZE_S(size_t KMP_DEREF arg) {
+#ifdef KMP_STUB
+  __kmps_set_stacksize(KMP_DEREF arg);
+#else
+  // __kmp_aux_set_stacksize initializes the library if needed
+  __kmp_aux_set_stacksize(KMP_DEREF arg);
+#endif
+}
+
+int FTN_STDCALL FTN_GET_STACKSIZE(void) {
+#ifdef KMP_STUB
+  return __kmps_get_stacksize();
+#else
+  if (!__kmp_init_serial) {
+    __kmp_serial_initialize();
+  }
+  return (int)__kmp_stksize;
+#endif
+}
+
+size_t FTN_STDCALL FTN_GET_STACKSIZE_S(void) {
+#ifdef KMP_STUB
+  return __kmps_get_stacksize();
+#else
+  if (!__kmp_init_serial) {
+    __kmp_serial_initialize();
+  }
+  return __kmp_stksize;
+#endif
+}
+
+void FTN_STDCALL FTN_SET_BLOCKTIME(int KMP_DEREF arg) {
+#ifdef KMP_STUB
+  __kmps_set_blocktime(KMP_DEREF arg);
+#else
+  int gtid, tid;
+  kmp_info_t *thread;
+
+  gtid = __kmp_entry_gtid();
+  tid = __kmp_tid_from_gtid(gtid);
+  thread = __kmp_thread_from_gtid(gtid);
+
+  __kmp_aux_set_blocktime(KMP_DEREF arg, thread, tid);
+#endif
+}
+
+int FTN_STDCALL FTN_GET_BLOCKTIME(void) {
+#ifdef KMP_STUB
+  return __kmps_get_blocktime();
+#else
+  // Returns the blocktime that applies to the calling thread: the global
+  // maximum if that is the default, 0 when blocktime was zeroed and never set
+  // explicitly (KMP_ADJUST_BLOCKTIME builds only), else the per-thread value.
+  int gtid, tid, blocktime;
+  kmp_info_t *thread;
+  kmp_team_p *team;
+
+  gtid = __kmp_entry_gtid();
+  tid = __kmp_tid_from_gtid(gtid);
+  thread = __kmp_thread_from_gtid(gtid);
+  team = thread->th.th_team; // reuse the lookup instead of indexing again
+
+  /* These must match the settings used in __kmp_wait_sleep() */
+  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
+    blocktime = KMP_MAX_BLOCKTIME;
+  }
+#ifdef KMP_ADJUST_BLOCKTIME
+  else if (__kmp_zero_bt && !get__bt_set(team, tid)) {
+    blocktime = 0;
+  }
+#endif /* KMP_ADJUST_BLOCKTIME */
+  else {
+    blocktime = get__blocktime(team, tid);
+  }
+  KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d\n", gtid,
+                team->t.t_id, tid, blocktime));
+  return blocktime;
+#endif
+}
+
+void FTN_STDCALL FTN_SET_LIBRARY_SERIAL(void) {
+#ifdef KMP_STUB
+  __kmps_set_library(library_serial);
+#else
+  // __kmp_user_set_library initializes the library if needed
+  __kmp_user_set_library(library_serial);
+#endif
+}
+
+void FTN_STDCALL FTN_SET_LIBRARY_TURNAROUND(void) {
+#ifdef KMP_STUB
+  __kmps_set_library(library_turnaround);
+#else
+  // __kmp_user_set_library initializes the library if needed
+  __kmp_user_set_library(library_turnaround);
+#endif
+}
+
+void FTN_STDCALL FTN_SET_LIBRARY_THROUGHPUT(void) {
+#ifdef KMP_STUB
+  __kmps_set_library(library_throughput);
+#else
+  // __kmp_user_set_library initializes the library if needed
+  __kmp_user_set_library(library_throughput);
+#endif
+}
+
+void FTN_STDCALL FTN_SET_LIBRARY(int KMP_DEREF arg) {
+#ifdef KMP_STUB
+  __kmps_set_library(KMP_DEREF arg);
+#else
+  enum library_type lib;
+  lib = (enum library_type)KMP_DEREF arg;
+  // __kmp_user_set_library initializes the library if needed
+  __kmp_user_set_library(lib);
+#endif
+}
+
+int FTN_STDCALL FTN_GET_LIBRARY(void) {
+#ifdef KMP_STUB
+  return __kmps_get_library();
+#else
+  if (!__kmp_init_serial) {
+    __kmp_serial_initialize();
+  }
+  return ((int)__kmp_library);
+#endif
+}
+
+void FTN_STDCALL FTN_SET_DISP_NUM_BUFFERS(int KMP_DEREF arg) {
+#ifdef KMP_STUB
+  ; // empty routine
+#else
+  // ignore after initialization because some teams have already
+  // allocated dispatch buffers
+  if (__kmp_init_serial == 0 && (KMP_DEREF arg) > 0)
+    __kmp_dispatch_num_buffers = KMP_DEREF arg;
+#endif
+}
+
+int FTN_STDCALL FTN_SET_AFFINITY(void **mask) {
+#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+  return -1;
+#else
+  if (!TCR_4(__kmp_init_middle)) {
+    __kmp_middle_initialize();
+  }
+  return __kmp_aux_set_affinity(mask);
+#endif
+}
+
+int FTN_STDCALL FTN_GET_AFFINITY(void **mask) {
+#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+  return -1;
+#else
+  if (!TCR_4(__kmp_init_middle)) {
+    __kmp_middle_initialize();
+  }
+  return __kmp_aux_get_affinity(mask);
+#endif
+}
+
+int FTN_STDCALL FTN_GET_AFFINITY_MAX_PROC(void) {
+#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+  return 0;
+#else
+  // We really only NEED serial initialization here.
+  if (!TCR_4(__kmp_init_middle)) {
+    __kmp_middle_initialize();
+  }
+  return __kmp_aux_get_affinity_max_proc();
+#endif
+}
+
+void FTN_STDCALL FTN_CREATE_AFFINITY_MASK(void **mask) {
+#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+  *mask = NULL;
+#else
+  // We really only NEED serial initialization here.
+  kmp_affin_mask_t *mask_internals;
+  if (!TCR_4(__kmp_init_middle)) {
+    __kmp_middle_initialize();
+  }
+  mask_internals = __kmp_affinity_dispatch->allocate_mask();
+  KMP_CPU_ZERO(mask_internals);
+  *mask = mask_internals;
+#endif
+}
+
+void FTN_STDCALL FTN_DESTROY_AFFINITY_MASK(void **mask) {
+#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+// Nothing
+#else
+  // We really only NEED serial initialization here.
+  kmp_affin_mask_t *mask_internals;
+  if (!TCR_4(__kmp_init_middle)) {
+    __kmp_middle_initialize();
+  }
+  if (__kmp_env_consistency_check) {
+    if (*mask == NULL) {
+      KMP_FATAL(AffinityInvalidMask, "kmp_destroy_affinity_mask");
+    }
+  }
+  mask_internals = (kmp_affin_mask_t *)(*mask);
+  __kmp_affinity_dispatch->deallocate_mask(mask_internals);
+  *mask = NULL;
+#endif
+}
+
+int FTN_STDCALL FTN_SET_AFFINITY_MASK_PROC(int KMP_DEREF proc, void **mask) {
+#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+  return -1;
+#else
+  if (!TCR_4(__kmp_init_middle)) {
+    __kmp_middle_initialize();
+  }
+  return __kmp_aux_set_affinity_mask_proc(KMP_DEREF proc, mask);
+#endif
+}
+
+int FTN_STDCALL FTN_UNSET_AFFINITY_MASK_PROC(int KMP_DEREF proc, void **mask) {
+#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+  return -1;
+#else
+  if (!TCR_4(__kmp_init_middle)) {
+    __kmp_middle_initialize();
+  }
+  return __kmp_aux_unset_affinity_mask_proc(KMP_DEREF proc, mask);
+#endif
+}
+
+int FTN_STDCALL FTN_GET_AFFINITY_MASK_PROC(int KMP_DEREF proc, void **mask) {
+#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+  return -1;
+#else
+  if (!TCR_4(__kmp_init_middle)) {
+    __kmp_middle_initialize();
+  }
+  return __kmp_aux_get_affinity_mask_proc(KMP_DEREF proc, mask);
+#endif
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* sets the requested number of threads for the next parallel region */
+void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_NUM_THREADS)(int KMP_DEREF arg) {
+#ifdef KMP_STUB
+// Nothing.
+#else
+  __kmp_set_num_threads(KMP_DEREF arg, __kmp_entry_gtid());
+#endif
+}
+
+/* returns the number of threads in current team */
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_THREADS)(void) {
+#ifdef KMP_STUB
+  return 1;
+#else
+  // __kmpc_bound_num_threads initializes the library if needed
+  return __kmpc_bound_num_threads(NULL);
+#endif
+}
+
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_MAX_THREADS)(void) {
+#ifdef KMP_STUB
+  return 1;
+#else
+  int gtid;
+  kmp_info_t *thread;
+  if (!TCR_4(__kmp_init_middle)) {
+    __kmp_middle_initialize();
+  }
+  gtid = __kmp_entry_gtid();
+  thread = __kmp_threads[gtid];
+  // return thread -> th.th_team -> t.t_current_task[
+  // thread->th.th_info.ds.ds_tid ] -> icvs.nproc;
+  return thread->th.th_current_task->td_icvs.nproc;
+#endif
+}
+
+int FTN_STDCALL FTN_CONTROL_TOOL(int command, int modifier, void *arg) {
+#if defined(KMP_STUB) || !OMPT_SUPPORT
+  return -2;
+#else
+  OMPT_STORE_RETURN_ADDRESS(__kmp_entry_gtid());
+  if (!TCR_4(__kmp_init_middle)) {
+    return -2;
+  }
+  kmp_info_t *this_thr = __kmp_threads[__kmp_entry_gtid()];
+  ompt_task_info_t *parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
+  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+  int ret = __kmp_control_tool(command, modifier, arg);
+  parent_task_info->frame.enter_frame.ptr = 0;
+  return ret;
+#endif
+}
+
+/* OpenMP 5.0 Memory Management support */
+omp_allocator_handle_t FTN_STDCALL
+FTN_INIT_ALLOCATOR(omp_memspace_handle_t KMP_DEREF m, int KMP_DEREF ntraits,
+                   omp_alloctrait_t tr[]) {
+#ifdef KMP_STUB
+  return NULL;
+#else
+  return __kmpc_init_allocator(__kmp_entry_gtid(), KMP_DEREF m,
+                               KMP_DEREF ntraits, tr);
+#endif
+}
+
+void FTN_STDCALL FTN_DESTROY_ALLOCATOR(omp_allocator_handle_t al) {
+#ifndef KMP_STUB
+  __kmpc_destroy_allocator(__kmp_entry_gtid(), al);
+#endif
+}
+void FTN_STDCALL FTN_SET_DEFAULT_ALLOCATOR(omp_allocator_handle_t al) {
+#ifndef KMP_STUB
+  __kmpc_set_default_allocator(__kmp_entry_gtid(), al);
+#endif
+}
+omp_allocator_handle_t FTN_STDCALL FTN_GET_DEFAULT_ALLOCATOR(void) {
+#ifdef KMP_STUB
+  return NULL;
+#else
+  return __kmpc_get_default_allocator(__kmp_entry_gtid());
+#endif
+}
+
+/* OpenMP 5.0 affinity format support */
+#ifndef KMP_STUB
+// Copy csrc (csrc_size characters, terminator not required) into a Fortran
+// character buffer of buf_size characters: the text is truncated when it is
+// too long and blank-padded when it is shorter. No null byte is stored, and
+// a zero-sized buffer is left untouched.
+static void __kmp_fortran_strncpy_truncate(char *buffer, size_t buf_size,
+                                           char const *csrc, size_t csrc_size) {
+  size_t i;
+  size_t ncopy = (csrc_size < buf_size) ? csrc_size : buf_size;
+  // Plain loops: no dependence on how the copy macro terminates its output.
+  for (i = 0; i < ncopy; ++i)
+    buffer[i] = csrc[i];
+  // Fortran strings carry no terminator; fill the remainder with blanks.
+  for (; i < buf_size; ++i)
+    buffer[i] = ' ';
+}
+
+// Holds a null-terminated copy of a Fortran string; freed by the destructor.
+class ConvertedString {
+  char *buf; // copy of the input plus a trailing '\0' (size + 1 bytes)
+  kmp_info_t *th; // thread handle used for both the allocation and the free
+
+public:
+  ConvertedString(char const *fortran_str, size_t size) {
+    th = __kmp_get_thread();
+    buf = (char *)__kmp_thread_malloc(th, size + 1);
+    KMP_STRNCPY_S(buf, size + 1, fortran_str, size);
+    buf[size] = '\0'; // guarantee termination right after the copied text
+  }
+  ~ConvertedString() { __kmp_thread_free(th, buf); }
+  const char *get() const { return buf; } // valid only while this object lives
+}; // NOTE(review): implicitly copyable; a copy would free buf twice
+#endif // KMP_STUB
+
+/*
+ * Set the value of the affinity-format-var ICV on the current device to the
+ * format specified in the argument.
+*/
+void FTN_STDCALL FTN_SET_AFFINITY_FORMAT(char const *format, size_t size) {
+#ifdef KMP_STUB
+  return;
+#else
+  if (!__kmp_init_serial) {
+    __kmp_serial_initialize();
+  }
+  ConvertedString cformat(format, size);
+  // Since the __kmp_affinity_format variable is a C string, do not
+  // use the fortran strncpy function
+  __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
+                         cformat.get(), KMP_STRLEN(cformat.get()));
+#endif
+}
+
+/*
+ * Returns the number of characters required to hold the entire affinity format
+ * specification (not including null byte character) and writes the value of the
+ * affinity-format-var ICV on the current device to buffer. If the return value
+ * is larger than size, the affinity format specification is truncated.
+*/
+size_t FTN_STDCALL FTN_GET_AFFINITY_FORMAT(char *buffer, size_t size) {
+#ifdef KMP_STUB
+  return 0;
+#else
+  size_t format_size;
+  if (!__kmp_init_serial) {
+    __kmp_serial_initialize();
+  }
+  format_size = KMP_STRLEN(__kmp_affinity_format);
+  if (buffer && size) {
+    __kmp_fortran_strncpy_truncate(buffer, size, __kmp_affinity_format,
+                                   format_size);
+  }
+  return format_size;
+#endif
+}
+
+/*
+ * Prints the thread affinity information of the current thread in the format
+ * specified by the format argument. If the format is NULL or a zero-length
+ * string, the value of the affinity-format-var ICV is used.
+*/
+void FTN_STDCALL FTN_DISPLAY_AFFINITY(char const *format, size_t size) {
+#ifdef KMP_STUB
+  return;
+#else
+  int gtid;
+  if (!TCR_4(__kmp_init_middle)) {
+    __kmp_middle_initialize();
+  }
+  gtid = __kmp_get_gtid();
+  ConvertedString cformat(format, size);
+  __kmp_aux_display_affinity(gtid, cformat.get());
+#endif
+}
+
+/*
+ * Returns the number of characters required to hold the entire affinity format
+ * specification (not including null byte) and prints the thread affinity
+ * information of the current thread into the character string buffer with the
+ * size of size in the format specified by the format argument. If the format is
+ * NULL or a zero-length string, the value of the affinity-format-var ICV is
+ * used. The buffer must be allocated prior to calling the routine. If the
+ * return value is larger than size, the affinity format specification is
+ * truncated.
+*/
+size_t FTN_STDCALL FTN_CAPTURE_AFFINITY(char *buffer, char const *format,
+                                        size_t buf_size, size_t for_size) {
+#if defined(KMP_STUB)
+  return 0;
+#else
+  int gtid;
+  size_t num_required;
+  kmp_str_buf_t capture_buf;
+  if (!TCR_4(__kmp_init_middle)) {
+    __kmp_middle_initialize();
+  }
+  gtid = __kmp_get_gtid();
+  __kmp_str_buf_init(&capture_buf);
+  ConvertedString cformat(format, for_size);
+  num_required = __kmp_aux_capture_affinity(gtid, cformat.get(), &capture_buf);
+  if (buffer && buf_size) {
+    __kmp_fortran_strncpy_truncate(buffer, buf_size, capture_buf.str,
+                                   capture_buf.used);
+  }
+  __kmp_str_buf_free(&capture_buf);
+  return num_required;
+#endif
+}
+
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_THREAD_NUM)(void) {
+#ifdef KMP_STUB
+  return 0;
+#else
+  int gtid;
+
+#if KMP_OS_DARWIN || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||    \
+        KMP_OS_HURD
+  gtid = __kmp_entry_gtid();
+#elif KMP_OS_WINDOWS
+  if (!__kmp_init_parallel ||
+      (gtid = (int)((kmp_intptr_t)TlsGetValue(__kmp_gtid_threadprivate_key))) ==
+          0) {
+    // Either library isn't initialized or thread is not registered
+    // 0 is the correct TID in this case
+    return 0;
+  }
+  --gtid; // We keep (gtid+1) in TLS
+#elif KMP_OS_LINUX
+#ifdef KMP_TDATA_GTID
+  if (__kmp_gtid_mode >= 3) {
+    if ((gtid = __kmp_gtid) == KMP_GTID_DNE) {
+      return 0;
+    }
+  } else {
+#endif
+    if (!__kmp_init_parallel ||
+        (gtid = (kmp_intptr_t)(
+             pthread_getspecific(__kmp_gtid_threadprivate_key))) == 0) {
+      return 0;
+    }
+    --gtid;
+#ifdef KMP_TDATA_GTID
+  }
+#endif
+#else
+#error Unknown or unsupported OS
+#endif
+
+  return __kmp_tid_from_gtid(gtid);
+#endif
+}
+
+int FTN_STDCALL FTN_GET_NUM_KNOWN_THREADS(void) {
+#ifdef KMP_STUB
+  return 1; // the stub build always reports a single thread
+#else
+  if (!__kmp_init_serial) { // make sure the runtime's serial init has run
+    __kmp_serial_initialize();
+  }
+  /* NOTE: this is not synchronized, so it can change at any moment */
+  /* NOTE: this number also includes threads preallocated in hot-teams */
+  return TCR_4(__kmp_nth);
+#endif
+}
+
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_PROCS)(void) {
+#ifdef KMP_STUB
+  return 1;
+#else
+  if (!TCR_4(__kmp_init_middle)) {
+    __kmp_middle_initialize();
+  }
+  return __kmp_avail_proc;
+#endif
+}
+
+void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_NESTED)(int KMP_DEREF flag) {
+  KMP_INFORM(APIDeprecated, "omp_set_nested", "omp_set_max_active_levels");
+#ifdef KMP_STUB
+  __kmps_set_nested(KMP_DEREF flag);
+#else
+  // Nesting is expressed through max-active-levels: enabling keeps the
+  // current limit (a limit of 1 is raised to KMP_MAX_ACTIVE_LEVELS_LIMIT),
+  // disabling forces the limit to 1.
+  kmp_info_t *thread = __kmp_entry_thread();
+  __kmp_save_internal_controls(thread); // controls are thread-private
+  int levels = (KMP_DEREF flag) ? get__max_active_levels(thread) : 1;
+  if ((KMP_DEREF flag) && levels == 1)
+    levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
+  set__max_active_levels(thread, levels);
+#endif
+}
+
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NESTED)(void) {
+  KMP_INFORM(APIDeprecated, "omp_get_nested", "omp_get_max_active_levels");
+#ifdef KMP_STUB
+  return __kmps_get_nested();
+#else
+  kmp_info_t *thread;
+  thread = __kmp_entry_thread();
+  return get__max_active_levels(thread) > 1;
+#endif
+}
+
+void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_DYNAMIC)(int KMP_DEREF flag) {
+#ifdef KMP_STUB
+  __kmps_set_dynamic(KMP_DEREF flag ? TRUE : FALSE);
+#else
+  kmp_info_t *thread;
+  /* For the thread-private implementation of the internal controls */
+  thread = __kmp_entry_thread();
+  // !!! What if foreign thread calls it?
+  __kmp_save_internal_controls(thread);
+  set__dynamic(thread, KMP_DEREF flag ? TRUE : FALSE);
+#endif
+}
+
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_DYNAMIC)(void) {
+#ifdef KMP_STUB
+  return __kmps_get_dynamic();
+#else
+  kmp_info_t *thread;
+  thread = __kmp_entry_thread();
+  return get__dynamic(thread);
+#endif
+}
+
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_IN_PARALLEL)(void) {
+#ifdef KMP_STUB
+  return 0;
+#else
+  kmp_info_t *self = __kmp_entry_thread();
+  // Outside a teams construct the root's r_in_parallel flag is consulted.
+  if (!self->th.th_teams_microtask)
+    return (self->th.th_root->r.r_in_parallel ? FTN_TRUE : FTN_FALSE);
+  // AC: r_in_parallel does not work inside a teams construct, where the real
+  // parallel is inactive but all threads have the same root, so setting it
+  // in one team affects the other teams.
+  // The solution is to use the per-team nesting level.
+  return (self->th.th_team->t.t_active_level ? 1 : 0);
+#endif
+}
+
+void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_SCHEDULE)(kmp_sched_t KMP_DEREF kind,
+                                                   int KMP_DEREF modifier) {
+#ifdef KMP_STUB
+  __kmps_set_schedule(KMP_DEREF kind, KMP_DEREF modifier);
+#else
+  /* TO DO: For the per-task implementation of the internal controls */
+  __kmp_set_schedule(__kmp_entry_gtid(), KMP_DEREF kind, KMP_DEREF modifier);
+#endif
+}
+
+void FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_SCHEDULE)(kmp_sched_t *kind,
+                                                   int *modifier) {
+#ifdef KMP_STUB
+  __kmps_get_schedule(kind, modifier);
+#else
+  /* TO DO: For the per-task implementation of the internal controls */
+  __kmp_get_schedule(__kmp_entry_gtid(), kind, modifier);
+#endif
+}
+
+void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_MAX_ACTIVE_LEVELS)(int KMP_DEREF arg) {
+#ifdef KMP_STUB
+// Nothing.
+#else
+  /* TO DO: We want per-task implementation of this internal control */
+  __kmp_set_max_active_levels(__kmp_entry_gtid(), KMP_DEREF arg);
+#endif
+}
+
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_MAX_ACTIVE_LEVELS)(void) {
+#ifdef KMP_STUB
+  return 0;
+#else
+  /* TO DO: We want per-task implementation of this internal control */
+  return __kmp_get_max_active_levels(__kmp_entry_gtid());
+#endif
+}
+
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_ACTIVE_LEVEL)(void) {
+#ifdef KMP_STUB
+  return 0; // returns 0 if it is called from the sequential part of the program
+#else
+  /* TO DO: For the per-task implementation of the internal controls */
+  return __kmp_entry_thread()->th.th_team->t.t_active_level;
+#endif
+}
+
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_LEVEL)(void) {
+#ifdef KMP_STUB
+  return 0; // returns 0 if it is called from the sequential part of the program
+#else
+  /* TO DO: For the per-task implementation of the internal controls */
+  return __kmp_entry_thread()->th.th_team->t.t_level;
+#endif
+}
+
+int FTN_STDCALL
+    KMP_EXPAND_NAME(FTN_GET_ANCESTOR_THREAD_NUM)(int KMP_DEREF level) {
+#ifdef KMP_STUB
+  return (KMP_DEREF level) ? (-1) : (0);
+#else
+  return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), KMP_DEREF level);
+#endif
+}
+
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_TEAM_SIZE)(int KMP_DEREF level) {
+#ifdef KMP_STUB
+  return (KMP_DEREF level) ? (-1) : (1);
+#else
+  return __kmp_get_team_size(__kmp_entry_gtid(), KMP_DEREF level);
+#endif
+}
+
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_THREAD_LIMIT)(void) {
+#ifdef KMP_STUB
+  return 1; // TO DO: clarify whether it returns 1 or 0?
+#else
+  int gtid;
+  kmp_info_t *thread;
+  if (!__kmp_init_serial) {
+    __kmp_serial_initialize();
+  }
+
+  gtid = __kmp_entry_gtid();
+  thread = __kmp_threads[gtid];
+  return thread->th.th_current_task->td_icvs.thread_limit;
+#endif
+}
+
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_IN_FINAL)(void) {
+#ifdef KMP_STUB
+  return 0; // TO DO: clarify whether it returns 1 or 0?
+#else
+  if (!TCR_4(__kmp_init_parallel)) {
+    return 0;
+  }
+  return __kmp_entry_thread()->th.th_current_task->td_flags.final;
+#endif
+}
+
+kmp_proc_bind_t FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PROC_BIND)(void) {
+#ifdef KMP_STUB
+  return __kmps_get_proc_bind();
+#else
+  return get__proc_bind(__kmp_entry_thread());
+#endif
+}
+
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_PLACES)(void) {
+#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+  return 0;
+#else
+  if (!TCR_4(__kmp_init_middle)) {
+    __kmp_middle_initialize();
+  }
+  if (!KMP_AFFINITY_CAPABLE())
+    return 0;
+  return __kmp_affinity_num_masks;
+#endif
+}
+
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PLACE_NUM_PROCS)(int place_num) {
+#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+  return 0;
+#else
+  int i;
+  int retval = 0;
+  if (!TCR_4(__kmp_init_middle)) {
+    __kmp_middle_initialize();
+  }
+  if (!KMP_AFFINITY_CAPABLE())
+    return 0;
+  if (place_num < 0 || place_num >= (int)__kmp_affinity_num_masks)
+    return 0;
+  kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks, place_num);
+  KMP_CPU_SET_ITERATE(i, mask) {
+    if ((!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) ||
+        (!KMP_CPU_ISSET(i, mask))) {
+      continue;
+    }
+    ++retval;
+  }
+  return retval;
+#endif
+}
+
+void FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PLACE_PROC_IDS)(int place_num,
+                                                         int *ids) {
+#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+// Nothing.
+#else
+  int i, j;
+  if (!TCR_4(__kmp_init_middle)) {
+    __kmp_middle_initialize();
+  }
+  if (!KMP_AFFINITY_CAPABLE())
+    return;
+  if (place_num < 0 || place_num >= (int)__kmp_affinity_num_masks)
+    return;
+  kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks, place_num);
+  j = 0;
+  KMP_CPU_SET_ITERATE(i, mask) {
+    if ((!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) ||
+        (!KMP_CPU_ISSET(i, mask))) {
+      continue;
+    }
+    ids[j++] = i;
+  }
+#endif
+}
+
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PLACE_NUM)(void) {
+#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+  return -1;
+#else
+  // Reports the calling thread's th_current_place, or -1 when affinity is
+  // not usable or the recorded place is negative.
+  if (!TCR_4(__kmp_init_middle))
+    __kmp_middle_initialize();
+  if (KMP_AFFINITY_CAPABLE()) {
+    int gtid = __kmp_entry_gtid();
+    kmp_info_t *self = __kmp_thread_from_gtid(gtid);
+    // Any negative value is normalized to -1.
+    if (self->th.th_current_place >= 0)
+      return self->th.th_current_place;
+  }
+  return -1;
+#endif
+}
+
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PARTITION_NUM_PLACES)(void) {
+#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+  return 0;
+#else
+  // Counts the places in the calling thread's partition. A partition whose
+  // first place lies after its last place wraps around the end of the mask
+  // table, so the table size is added to the span in that case.
+  if (!TCR_4(__kmp_init_middle)) {
+    __kmp_middle_initialize();
+  }
+  if (!KMP_AFFINITY_CAPABLE())
+    return 0;
+  int gtid = __kmp_entry_gtid();
+  kmp_info_t *self = __kmp_thread_from_gtid(gtid);
+  int lo = self->th.th_first_place;
+  int hi = self->th.th_last_place;
+  if (lo < 0 || hi < 0)
+    return 0;
+  int span = hi - lo + 1;
+  if (lo > hi)
+    span = __kmp_affinity_num_masks - lo + hi + 1;
+  return span;
+#endif
+}
+
+// Writes the place numbers of the calling thread's partition to place_nums:
+// as many entries as FTN_GET_PARTITION_NUM_PLACES reports, in partition order.
+void
+    FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_PARTITION_PLACE_NUMS)(int *place_nums) {
+#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+// Nothing.
+#else
+  int i, gtid, place_num, first_place, last_place, num_masks, num_places;
+  kmp_info_t *thread;
+  if (!TCR_4(__kmp_init_middle)) {
+    __kmp_middle_initialize();
+  }
+  if (!KMP_AFFINITY_CAPABLE())
+    return;
+  gtid = __kmp_entry_gtid();
+  thread = __kmp_thread_from_gtid(gtid);
+  first_place = thread->th.th_first_place;
+  last_place = thread->th.th_last_place;
+  if (first_place < 0 || last_place < 0)
+    return;
+  num_masks = (int)__kmp_affinity_num_masks;
+  // first_place > last_place denotes a partition that wraps around the end.
+  num_places = last_place - first_place + 1 +
+               (first_place <= last_place ? 0 : num_masks);
+  for (i = 0, place_num = first_place; i < num_places; ++i) {
+    place_nums[i] = place_num;
+    place_num = (place_num + 1 < num_masks) ? place_num + 1 : 0;
+  }
+#endif
+}
+
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_TEAMS)(void) {
+#ifdef KMP_STUB
+  return 1;
+#else
+  return __kmp_aux_get_num_teams();
+#endif
+}
+
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_TEAM_NUM)(void) {
+#ifdef KMP_STUB
+  return 0;
+#else
+  return __kmp_aux_get_team_num();
+#endif
+}
+
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_DEFAULT_DEVICE)(void) {
+#if KMP_MIC || KMP_OS_DARWIN || defined(KMP_STUB)
+  return 0;
+#else
+  return __kmp_entry_thread()->th.th_current_task->td_icvs.default_device;
+#endif
+}
+
+void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_DEFAULT_DEVICE)(int KMP_DEREF arg) {
+#if KMP_MIC || KMP_OS_DARWIN || defined(KMP_STUB)
+// Nothing.
+#else
+  __kmp_entry_thread()->th.th_current_task->td_icvs.default_device =
+      KMP_DEREF arg;
+#endif
+}
+
+// Get number of NON-HOST devices.
+// libomptarget, if loaded, provides this function in api.cpp.
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(void) KMP_WEAK_ATTRIBUTE;
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_NUM_DEVICES)(void) {
+#if KMP_MIC || KMP_OS_DARWIN || KMP_OS_WINDOWS || defined(KMP_STUB)
+  return 0;
+#else
+  int (*fptr)();
+  if ((*(void **)(&fptr) = dlsym(RTLD_DEFAULT, "_Offload_number_of_devices"))) {
+    return (*fptr)();
+  } else if ((*(void **)(&fptr) = dlsym(RTLD_NEXT, "omp_get_num_devices"))) {
+    return (*fptr)();
+  } else { // liboffload & libomptarget don't exist
+    return 0;
+  }
+#endif // KMP_MIC || KMP_OS_DARWIN || KMP_OS_WINDOWS || defined(KMP_STUB)
+}
+
+// This function always returns true when called on host device.
+// Compiler/libomptarget should handle when it is called inside target region.
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_IS_INITIAL_DEVICE)(void) KMP_WEAK_ATTRIBUTE;
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_IS_INITIAL_DEVICE)(void) {
+  return 1; // This is the host
+}
+
+// libomptarget, if loaded, provides this function
+int FTN_STDCALL FTN_GET_INITIAL_DEVICE(void) KMP_WEAK_ATTRIBUTE;
+int FTN_STDCALL FTN_GET_INITIAL_DEVICE(void) {
+#if KMP_MIC || KMP_OS_DARWIN || KMP_OS_WINDOWS || defined(KMP_STUB)
+  return KMP_HOST_DEVICE;
+#else
+  // Forward to the next omp_get_initial_device in symbol lookup order, if any.
+  int (*next_impl)();
+  *(void **)(&next_impl) = dlsym(RTLD_NEXT, "omp_get_initial_device");
+  if (next_impl == NULL) // liboffload & libomptarget don't exist
+    return KMP_HOST_DEVICE;
+  return (*next_impl)();
+#endif
+}
+
+#if defined(KMP_STUB)
+// Entries for stubs library
+// All *target* functions are C-only, so parameters are always passed by value
+void *FTN_STDCALL FTN_TARGET_ALLOC(size_t size, int device_num) { return 0; }
+
+void FTN_STDCALL FTN_TARGET_FREE(void *device_ptr, int device_num) {}
+
+int FTN_STDCALL FTN_TARGET_IS_PRESENT(void *ptr, int device_num) { return 0; }
+
+int FTN_STDCALL FTN_TARGET_MEMCPY(void *dst, void *src, size_t length,
+                                  size_t dst_offset, size_t src_offset,
+                                  int dst_device, int src_device) {
+  return -1;
+}
+
+int FTN_STDCALL FTN_TARGET_MEMCPY_RECT(
+    void *dst, void *src, size_t element_size, int num_dims,
+    const size_t *volume, const size_t *dst_offsets, const size_t *src_offsets,
+    const size_t *dst_dimensions, const size_t *src_dimensions, int dst_device,
+    int src_device) {
+  return -1;
+}
+
+int FTN_STDCALL FTN_TARGET_ASSOCIATE_PTR(void *host_ptr, void *device_ptr,
+                                         size_t size, size_t device_offset,
+                                         int device_num) {
+  return -1;
+}
+
+int FTN_STDCALL FTN_TARGET_DISASSOCIATE_PTR(void *host_ptr, int device_num) {
+  return -1;
+}
+#endif // defined(KMP_STUB)
+
+#ifdef KMP_STUB
+// State that the stub lock routines in this file store directly in the user's
+// lock variable. The nestable stub routines access the same storage through an
+// int pointer and increment/decrement it as a counter.
+typedef enum { UNINIT = -1, UNLOCKED, LOCKED } kmp_stub_lock_t;
+#endif /* KMP_STUB */
+
+#if KMP_USE_DYNAMIC_LOCK
+// Initializes a simple lock, handing the caller's hint to
+// __kmpc_init_lock_with_hint(). The stub library ignores the hint and only
+// marks the lock UNLOCKED.
+void FTN_STDCALL FTN_INIT_LOCK_WITH_HINT(void **user_lock,
+                                         uintptr_t KMP_DEREF hint) {
+#ifndef KMP_STUB
+  int gtid = __kmp_entry_gtid();
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmpc_init_lock_with_hint(NULL, gtid, user_lock, KMP_DEREF hint);
+#else
+  kmp_stub_lock_t *stub_lock = (kmp_stub_lock_t *)user_lock;
+  *stub_lock = UNLOCKED;
+#endif // KMP_STUB
+}
+
+// Initializes a nestable lock, handing the caller's hint to
+// __kmpc_init_nest_lock_with_hint(). The stub library ignores the hint and
+// only marks the lock UNLOCKED.
+void FTN_STDCALL FTN_INIT_NEST_LOCK_WITH_HINT(void **user_lock,
+                                              uintptr_t KMP_DEREF hint) {
+#ifndef KMP_STUB
+  int gtid = __kmp_entry_gtid();
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmpc_init_nest_lock_with_hint(NULL, gtid, user_lock, KMP_DEREF hint);
+#else
+  kmp_stub_lock_t *stub_lock = (kmp_stub_lock_t *)user_lock;
+  *stub_lock = UNLOCKED;
+#endif // KMP_STUB
+}
+#endif // KMP_USE_DYNAMIC_LOCK
+
+/* Initialize a simple lock: forwards to __kmpc_init_lock(); the stub library
+   just marks the lock UNLOCKED. */
+void FTN_STDCALL KMP_EXPAND_NAME(FTN_INIT_LOCK)(void **user_lock) {
+#ifndef KMP_STUB
+  int gtid = __kmp_entry_gtid();
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmpc_init_lock(NULL, gtid, user_lock);
+#else
+  kmp_stub_lock_t *stub_lock = (kmp_stub_lock_t *)user_lock;
+  *stub_lock = UNLOCKED;
+#endif // KMP_STUB
+}
+
+/* Initialize a nestable lock: forwards to __kmpc_init_nest_lock(); the stub
+   library just marks the lock UNLOCKED. */
+void FTN_STDCALL KMP_EXPAND_NAME(FTN_INIT_NEST_LOCK)(void **user_lock) {
+#ifndef KMP_STUB
+  int gtid = __kmp_entry_gtid();
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmpc_init_nest_lock(NULL, gtid, user_lock);
+#else
+  kmp_stub_lock_t *stub_lock = (kmp_stub_lock_t *)user_lock;
+  *stub_lock = UNLOCKED;
+#endif // KMP_STUB
+}
+
+/* Destroy a simple lock: forwards to __kmpc_destroy_lock(); the stub library
+   resets the lock to UNINIT. */
+void FTN_STDCALL KMP_EXPAND_NAME(FTN_DESTROY_LOCK)(void **user_lock) {
+#ifndef KMP_STUB
+  int gtid = __kmp_entry_gtid();
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmpc_destroy_lock(NULL, gtid, user_lock);
+#else
+  kmp_stub_lock_t *stub_lock = (kmp_stub_lock_t *)user_lock;
+  *stub_lock = UNINIT;
+#endif // KMP_STUB
+}
+
+/* Destroy a nestable lock: forwards to __kmpc_destroy_nest_lock(); the stub
+   library resets the lock to UNINIT. */
+void FTN_STDCALL KMP_EXPAND_NAME(FTN_DESTROY_NEST_LOCK)(void **user_lock) {
+#ifndef KMP_STUB
+  int gtid = __kmp_entry_gtid();
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmpc_destroy_nest_lock(NULL, gtid, user_lock);
+#else
+  kmp_stub_lock_t *stub_lock = (kmp_stub_lock_t *)user_lock;
+  *stub_lock = UNINIT;
+#endif // KMP_STUB
+}
+
+/* Acquire a simple lock: forwards to __kmpc_set_lock(). The stub library
+   marks the lock LOCKED; its misuse checks are still placeholders. */
+void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_LOCK)(void **user_lock) {
+#ifndef KMP_STUB
+  int gtid = __kmp_entry_gtid();
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmpc_set_lock(NULL, gtid, user_lock);
+#else
+  kmp_stub_lock_t *stub_lock = (kmp_stub_lock_t *)user_lock;
+  if (*stub_lock == UNINIT) {
+    // TODO: Issue an error.
+  }
+  if (*stub_lock != UNLOCKED) {
+    // TODO: Issue an error.
+  }
+  *stub_lock = LOCKED;
+#endif // KMP_STUB
+}
+
+/* Acquire a nestable lock: forwards to __kmpc_set_nest_lock(). The stub
+   library treats the lock storage as an int and increments it. */
+void FTN_STDCALL KMP_EXPAND_NAME(FTN_SET_NEST_LOCK)(void **user_lock) {
+#ifndef KMP_STUB
+  int gtid = __kmp_entry_gtid();
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmpc_set_nest_lock(NULL, gtid, user_lock);
+#else
+  int *nest_count = (int *)user_lock;
+  if (*((kmp_stub_lock_t *)user_lock) == UNINIT) {
+    // TODO: Issue an error.
+  }
+  ++(*nest_count);
+#endif // KMP_STUB
+}
+
+/* Release a simple lock: forwards to __kmpc_unset_lock(). The stub library
+   marks the lock UNLOCKED; its misuse checks are still placeholders. */
+void FTN_STDCALL KMP_EXPAND_NAME(FTN_UNSET_LOCK)(void **user_lock) {
+#ifndef KMP_STUB
+  int gtid = __kmp_entry_gtid();
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmpc_unset_lock(NULL, gtid, user_lock);
+#else
+  kmp_stub_lock_t *stub_lock = (kmp_stub_lock_t *)user_lock;
+  if (*stub_lock == UNINIT) {
+    // TODO: Issue an error.
+  }
+  if (*stub_lock == UNLOCKED) {
+    // TODO: Issue an error.
+  }
+  *stub_lock = UNLOCKED;
+#endif // KMP_STUB
+}
+
+/* Release one level of a nestable lock: forwards to __kmpc_unset_nest_lock().
+   The stub library treats the lock storage as an int and decrements it; its
+   misuse checks are still placeholders. */
+void FTN_STDCALL KMP_EXPAND_NAME(FTN_UNSET_NEST_LOCK)(void **user_lock) {
+#ifndef KMP_STUB
+  int gtid = __kmp_entry_gtid();
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  __kmpc_unset_nest_lock(NULL, gtid, user_lock);
+#else
+  kmp_stub_lock_t *stub_lock = (kmp_stub_lock_t *)user_lock;
+  int *nest_count = (int *)user_lock;
+  if (*stub_lock == UNINIT) {
+    // TODO: Issue an error.
+  }
+  if (*stub_lock == UNLOCKED) {
+    // TODO: Issue an error.
+  }
+  --(*nest_count);
+#endif // KMP_STUB
+}
+
+/* Try to acquire a simple lock without blocking; returns the result of
+   __kmpc_test_lock(). The stub library returns 0 when the lock is already
+   LOCKED, otherwise marks it LOCKED and returns 1. */
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_TEST_LOCK)(void **user_lock) {
+#ifndef KMP_STUB
+  int gtid = __kmp_entry_gtid();
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  return __kmpc_test_lock(NULL, gtid, user_lock);
+#else
+  kmp_stub_lock_t *stub_lock = (kmp_stub_lock_t *)user_lock;
+  if (*stub_lock == UNINIT) {
+    // TODO: Issue an error.
+  }
+  if (*stub_lock != LOCKED) {
+    *stub_lock = LOCKED;
+    return 1;
+  }
+  return 0;
+#endif // KMP_STUB
+}
+
+/* Try to acquire a nestable lock without blocking; returns the result of
+   __kmpc_test_nest_lock(). The stub library increments the int stored in the
+   lock and returns the incremented value. */
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_TEST_NEST_LOCK)(void **user_lock) {
+#ifndef KMP_STUB
+  int gtid = __kmp_entry_gtid();
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+  return __kmpc_test_nest_lock(NULL, gtid, user_lock);
+#else
+  int *nest_count = (int *)user_lock;
+  if (*((kmp_stub_lock_t *)user_lock) == UNINIT) {
+    // TODO: Issue an error.
+  }
+  *nest_count += 1;
+  return *nest_count;
+#endif // KMP_STUB
+}
+
+/* Returns the value produced by __kmp_elapsed() (__kmps_get_wtime() in the
+   stub library). */
+double FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_WTIME)(void) {
+#ifndef KMP_STUB
+#if !KMP_OS_LINUX
+  // We don't need library initialization to get the time on Linux* OS. The
+  // routine can be used to measure library initialization time on Linux* OS now
+  if (!__kmp_init_serial) {
+    __kmp_serial_initialize();
+  }
+#endif // !KMP_OS_LINUX
+  double now;
+  __kmp_elapsed(&now);
+  return now;
+#else
+  return __kmps_get_wtime();
+#endif // KMP_STUB
+}
+
+/* Returns the value produced by __kmp_elapsed_tick() (__kmps_get_wtick() in
+   the stub library). Unlike the wtime entry, the library is serially
+   initialized first on every OS. */
+double FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_WTICK)(void) {
+#ifndef KMP_STUB
+  if (!__kmp_init_serial) {
+    __kmp_serial_initialize();
+  }
+  double tick;
+  __kmp_elapsed_tick(&tick);
+  return tick;
+#else
+  return __kmps_get_wtick();
+#endif // KMP_STUB
+}
+
+/* ------------------------------------------------------------------------ */
+
+// Fortran-callable wrapper around kmpc_malloc().
+// NOTE(review): KMP_DEREF presumably turns each parameter into a pointer that
+// is dereferenced when arguments are passed by reference -- confirm against
+// its definition, which is not visible here.
+void *FTN_STDCALL FTN_MALLOC(size_t KMP_DEREF size) {
+  // kmpc_malloc initializes the library if needed
+  return kmpc_malloc(KMP_DEREF size);
+}
+
+// Fortran-callable wrapper around kmpc_aligned_malloc(); size and alignment
+// are forwarded unchanged.
+void *FTN_STDCALL FTN_ALIGNED_MALLOC(size_t KMP_DEREF size,
+                                     size_t KMP_DEREF alignment) {
+  // kmpc_aligned_malloc initializes the library if needed
+  return kmpc_aligned_malloc(KMP_DEREF size, KMP_DEREF alignment);
+}
+
+// Fortran-callable wrapper around kmpc_calloc(); element count and element
+// size are forwarded unchanged.
+void *FTN_STDCALL FTN_CALLOC(size_t KMP_DEREF nelem, size_t KMP_DEREF elsize) {
+  // kmpc_calloc initializes the library if needed
+  return kmpc_calloc(KMP_DEREF nelem, KMP_DEREF elsize);
+}
+
+// Fortran-callable wrapper around kmpc_realloc().
+void *FTN_STDCALL FTN_REALLOC(void *KMP_DEREF ptr, size_t KMP_DEREF size) {
+  // kmpc_realloc initializes the library if needed
+  return kmpc_realloc(KMP_DEREF ptr, KMP_DEREF size);
+}
+
+// Fortran-callable wrapper around kmpc_free().
+void FTN_STDCALL FTN_KFREE(void *KMP_DEREF ptr) {
+  // does nothing if the library is not initialized
+  kmpc_free(KMP_DEREF ptr);
+}
+
+// Sets __kmp_generate_warnings to kmp_warnings_explicit. No-op in the stub
+// library.
+void FTN_STDCALL FTN_SET_WARNINGS_ON(void) {
+#ifndef KMP_STUB
+  __kmp_generate_warnings = kmp_warnings_explicit;
+#endif
+}
+
+// Sets __kmp_generate_warnings to FALSE. No-op in the stub library.
+void FTN_STDCALL FTN_SET_WARNINGS_OFF(void) {
+#ifndef KMP_STUB
+  __kmp_generate_warnings = FALSE;
+#endif
+}
+
+// Hands a settings string to __kmp_aux_set_defaults(). The signature depends
+// on PASS_ARGS_BY_VALUE: when it is not defined the caller supplies the string
+// length as a second `len` argument; when it is defined the length is computed
+// here with KMP_STRLEN. The stub library ignores the call.
+void FTN_STDCALL FTN_SET_DEFAULTS(char const *str
+#ifndef PASS_ARGS_BY_VALUE
+                                  ,
+                                  int len
+#endif
+                                  ) {
+#ifndef KMP_STUB
+#ifdef PASS_ARGS_BY_VALUE
+  // No length parameter in this configuration: measure the string instead.
+  int len = (int)KMP_STRLEN(str);
+#endif
+  __kmp_aux_set_defaults(str, len);
+#endif
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* Returns the status of cancellation: the value of __kmp_omp_cancellation, or
+   0 (false) in the stub library. */
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_CANCELLATION)(void) {
+#ifndef KMP_STUB
+  // initialize the library if needed
+  if (!__kmp_init_serial) {
+    __kmp_serial_initialize();
+  }
+  return __kmp_omp_cancellation;
+#else
+  return 0 /* false */;
+#endif // KMP_STUB
+}
+
+// Returns the result of __kmp_get_cancellation_status() for the given
+// cancel_kind (passed by value); always 0 (false) in the stub library.
+int FTN_STDCALL FTN_GET_CANCELLATION_STATUS(int cancel_kind) {
+#ifdef KMP_STUB
+  return 0 /* false */;
+#else
+  return __kmp_get_cancellation_status(cancel_kind);
+#endif
+}
+
+/* Returns the maximum allowed task priority: the value of
+   __kmp_max_task_priority, or 0 in the stub library. */
+int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_MAX_TASK_PRIORITY)(void) {
+#ifndef KMP_STUB
+  // initialize the library if needed
+  if (!__kmp_init_serial) {
+    __kmp_serial_initialize();
+  }
+  return __kmp_max_task_priority;
+#else
+  return 0;
+#endif // KMP_STUB
+}
+
+// This function will be defined in libomptarget. When libomptarget is not
+// loaded, we assume we are on the host and return KMP_HOST_DEVICE.
+// Compiler/libomptarget will handle this if called inside target.
+// This definition is declared with KMP_WEAK_ATTRIBUTE and performs no lookup
+// of its own: it unconditionally answers KMP_HOST_DEVICE.
+int FTN_STDCALL FTN_GET_DEVICE_NUM(void) KMP_WEAK_ATTRIBUTE;
+int FTN_STDCALL FTN_GET_DEVICE_NUM(void) { return KMP_HOST_DEVICE; }
+
+// Pauses resources on one device and returns the status of the pause call.
+// The host device is handled by __kmpc_pause_resource(); any other device is
+// forwarded to tgt_pause_resource when dlsym() can find it. Every other case
+// (stub library, Windows, no libomptarget) reports failure by returning 1.
+// Compiler will ensure that this is only called from host in sequential region
+int FTN_STDCALL FTN_PAUSE_RESOURCE(kmp_pause_status_t kind, int device_num) {
+#ifndef KMP_STUB
+  if (device_num == KMP_HOST_DEVICE) {
+    return __kmpc_pause_resource(kind);
+  }
+#if !KMP_OS_WINDOWS
+  int (*tgt_pause)(kmp_pause_status_t, int);
+  *(void **)(&tgt_pause) = dlsym(RTLD_DEFAULT, "tgt_pause_resource");
+  if (tgt_pause) {
+    return (*tgt_pause)(kind, device_num);
+  }
+#endif // !KMP_OS_WINDOWS
+#endif // !KMP_STUB
+  return 1; // just fail (stub library, or there is no libomptarget)
+}
+
+// Pauses resources on all devices and then on the host, returning the sum of
+// the two status values. The device step is skipped on Windows and when
+// tgt_pause_resource cannot be found. The stub library always returns 1.
+// Compiler will ensure that this is only called from host in sequential region
+int FTN_STDCALL FTN_PAUSE_RESOURCE_ALL(kmp_pause_status_t kind) {
+#ifndef KMP_STUB
+  int failures = 0;
+#if !KMP_OS_WINDOWS
+  int (*tgt_pause)(kmp_pause_status_t, int);
+  *(void **)(&tgt_pause) = dlsym(RTLD_DEFAULT, "tgt_pause_resource");
+  if (tgt_pause) {
+    failures = (*tgt_pause)(kind, KMP_DEVICE_ALL); // pause devices
+  }
+#endif // !KMP_OS_WINDOWS
+  failures += __kmpc_pause_resource(kind); // pause host
+  return failures;
+#else
+  return 1; // just fail
+#endif // KMP_STUB
+}
+
+// Returns the maximum number of nesting levels supported by implementation:
+// the compile-time constant KMP_MAX_ACTIVE_LEVELS_LIMIT, or 1 in the stub
+// library.
+int FTN_STDCALL FTN_GET_SUPPORTED_ACTIVE_LEVELS(void) {
+#ifdef KMP_STUB
+  return 1;
+#else
+  return KMP_MAX_ACTIVE_LEVELS_LIMIT;
+#endif
+}
+
+// Forwards the event handle to __kmp_fulfill_event(). The pointer is passed by
+// value (no KMP_DEREF). No-op in the stub library.
+void FTN_STDCALL FTN_FULFILL_EVENT(kmp_event_t *event) {
+#ifndef KMP_STUB
+  __kmp_fulfill_event(event);
+#endif
+}
+
+// GCC compatibility (versioned symbols)
+#ifdef KMP_USE_VERSION_SYMBOLS
+
+/* The following sections create versioned symbols for the
+   omp_* routines. The KMP_VERSION_SYMBOL macro expands the API name and
+   then maps it to a versioned symbol.
+   libgomp ``versions'' its symbols (OMP_1.0, OMP_2.0, OMP_3.0, ...) while also
+   retaining the default version which libomp uses: VERSION (defined in
+   exports_so.txt). If you want to see the versioned symbols for libgomp.so.1
+   then just type:
+
+   objdump -T /path/to/libgomp.so.1 | grep omp_
+
+   Example:
+   Step 1) Create __kmp_api_omp_set_num_threads_10_alias which is alias of
+     __kmp_api_omp_set_num_threads
+   Step 2) Set __kmp_api_omp_set_num_threads_10_alias to version:
+     omp_set_num_threads@OMP_1.0
+   Step 2B) Set __kmp_api_omp_set_num_threads to default version:
+     omp_set_num_threads@@VERSION
+
+   Each invocation below takes the FTN_* entry name, a numeric suffix for the
+   version (10, 20, 30, ...) and the matching version string ("OMP_1.0", ...).
+*/
+
+// OMP_1.0 versioned symbols
+KMP_VERSION_SYMBOL(FTN_SET_NUM_THREADS, 10, "OMP_1.0");
+KMP_VERSION_SYMBOL(FTN_GET_NUM_THREADS, 10, "OMP_1.0");
+KMP_VERSION_SYMBOL(FTN_GET_MAX_THREADS, 10, "OMP_1.0");
+KMP_VERSION_SYMBOL(FTN_GET_THREAD_NUM, 10, "OMP_1.0");
+KMP_VERSION_SYMBOL(FTN_GET_NUM_PROCS, 10, "OMP_1.0");
+KMP_VERSION_SYMBOL(FTN_IN_PARALLEL, 10, "OMP_1.0");
+KMP_VERSION_SYMBOL(FTN_SET_DYNAMIC, 10, "OMP_1.0");
+KMP_VERSION_SYMBOL(FTN_GET_DYNAMIC, 10, "OMP_1.0");
+KMP_VERSION_SYMBOL(FTN_SET_NESTED, 10, "OMP_1.0");
+KMP_VERSION_SYMBOL(FTN_GET_NESTED, 10, "OMP_1.0");
+KMP_VERSION_SYMBOL(FTN_INIT_LOCK, 10, "OMP_1.0");
+KMP_VERSION_SYMBOL(FTN_INIT_NEST_LOCK, 10, "OMP_1.0");
+KMP_VERSION_SYMBOL(FTN_DESTROY_LOCK, 10, "OMP_1.0");
+KMP_VERSION_SYMBOL(FTN_DESTROY_NEST_LOCK, 10, "OMP_1.0");
+KMP_VERSION_SYMBOL(FTN_SET_LOCK, 10, "OMP_1.0");
+KMP_VERSION_SYMBOL(FTN_SET_NEST_LOCK, 10, "OMP_1.0");
+KMP_VERSION_SYMBOL(FTN_UNSET_LOCK, 10, "OMP_1.0");
+KMP_VERSION_SYMBOL(FTN_UNSET_NEST_LOCK, 10, "OMP_1.0");
+KMP_VERSION_SYMBOL(FTN_TEST_LOCK, 10, "OMP_1.0");
+KMP_VERSION_SYMBOL(FTN_TEST_NEST_LOCK, 10, "OMP_1.0");
+
+// OMP_2.0 versioned symbols
+KMP_VERSION_SYMBOL(FTN_GET_WTICK, 20, "OMP_2.0");
+KMP_VERSION_SYMBOL(FTN_GET_WTIME, 20, "OMP_2.0");
+
+// OMP_3.0 versioned symbols
+KMP_VERSION_SYMBOL(FTN_SET_SCHEDULE, 30, "OMP_3.0");
+KMP_VERSION_SYMBOL(FTN_GET_SCHEDULE, 30, "OMP_3.0");
+KMP_VERSION_SYMBOL(FTN_GET_THREAD_LIMIT, 30, "OMP_3.0");
+KMP_VERSION_SYMBOL(FTN_SET_MAX_ACTIVE_LEVELS, 30, "OMP_3.0");
+KMP_VERSION_SYMBOL(FTN_GET_MAX_ACTIVE_LEVELS, 30, "OMP_3.0");
+KMP_VERSION_SYMBOL(FTN_GET_ANCESTOR_THREAD_NUM, 30, "OMP_3.0");
+KMP_VERSION_SYMBOL(FTN_GET_LEVEL, 30, "OMP_3.0");
+KMP_VERSION_SYMBOL(FTN_GET_TEAM_SIZE, 30, "OMP_3.0");
+KMP_VERSION_SYMBOL(FTN_GET_ACTIVE_LEVEL, 30, "OMP_3.0");
+
+// the lock routines have a 1.0 and 3.0 version
+KMP_VERSION_SYMBOL(FTN_INIT_LOCK, 30, "OMP_3.0");
+KMP_VERSION_SYMBOL(FTN_INIT_NEST_LOCK, 30, "OMP_3.0");
+KMP_VERSION_SYMBOL(FTN_DESTROY_LOCK, 30, "OMP_3.0");
+KMP_VERSION_SYMBOL(FTN_DESTROY_NEST_LOCK, 30, "OMP_3.0");
+KMP_VERSION_SYMBOL(FTN_SET_LOCK, 30, "OMP_3.0");
+KMP_VERSION_SYMBOL(FTN_SET_NEST_LOCK, 30, "OMP_3.0");
+KMP_VERSION_SYMBOL(FTN_UNSET_LOCK, 30, "OMP_3.0");
+KMP_VERSION_SYMBOL(FTN_UNSET_NEST_LOCK, 30, "OMP_3.0");
+KMP_VERSION_SYMBOL(FTN_TEST_LOCK, 30, "OMP_3.0");
+KMP_VERSION_SYMBOL(FTN_TEST_NEST_LOCK, 30, "OMP_3.0");
+
+// OMP_3.1 versioned symbol
+KMP_VERSION_SYMBOL(FTN_IN_FINAL, 31, "OMP_3.1");
+
+// OMP_4.0 versioned symbols
+KMP_VERSION_SYMBOL(FTN_GET_PROC_BIND, 40, "OMP_4.0");
+KMP_VERSION_SYMBOL(FTN_GET_NUM_TEAMS, 40, "OMP_4.0");
+KMP_VERSION_SYMBOL(FTN_GET_TEAM_NUM, 40, "OMP_4.0");
+KMP_VERSION_SYMBOL(FTN_GET_CANCELLATION, 40, "OMP_4.0");
+KMP_VERSION_SYMBOL(FTN_GET_DEFAULT_DEVICE, 40, "OMP_4.0");
+KMP_VERSION_SYMBOL(FTN_SET_DEFAULT_DEVICE, 40, "OMP_4.0");
+KMP_VERSION_SYMBOL(FTN_IS_INITIAL_DEVICE, 40, "OMP_4.0");
+KMP_VERSION_SYMBOL(FTN_GET_NUM_DEVICES, 40, "OMP_4.0");
+
+// OMP_4.5 versioned symbols
+KMP_VERSION_SYMBOL(FTN_GET_MAX_TASK_PRIORITY, 45, "OMP_4.5");
+KMP_VERSION_SYMBOL(FTN_GET_NUM_PLACES, 45, "OMP_4.5");
+KMP_VERSION_SYMBOL(FTN_GET_PLACE_NUM_PROCS, 45, "OMP_4.5");
+KMP_VERSION_SYMBOL(FTN_GET_PLACE_PROC_IDS, 45, "OMP_4.5");
+KMP_VERSION_SYMBOL(FTN_GET_PLACE_NUM, 45, "OMP_4.5");
+KMP_VERSION_SYMBOL(FTN_GET_PARTITION_NUM_PLACES, 45, "OMP_4.5");
+KMP_VERSION_SYMBOL(FTN_GET_PARTITION_PLACE_NUMS, 45, "OMP_4.5");
+// NOTE(review): the entry below and all OMP_5.0 entries are disabled in the
+// source. The affected routines visible in this file are defined without
+// KMP_EXPAND_NAME, which may be the reason -- confirm before enabling them.
+// KMP_VERSION_SYMBOL(FTN_GET_INITIAL_DEVICE, 45, "OMP_4.5");
+
+// OMP_5.0 versioned symbols
+// KMP_VERSION_SYMBOL(FTN_GET_DEVICE_NUM, 50, "OMP_5.0");
+// KMP_VERSION_SYMBOL(FTN_PAUSE_RESOURCE, 50, "OMP_5.0");
+// KMP_VERSION_SYMBOL(FTN_PAUSE_RESOURCE_ALL, 50, "OMP_5.0");
+// KMP_VERSION_SYMBOL(FTN_GET_SUPPORTED_ACTIVE_LEVELS, 50, "OMP_5.0");
+// KMP_VERSION_SYMBOL(FTN_FULFILL_EVENT, 50, "OMP_5.0");
+
+#endif // KMP_USE_VERSION_SYMBOLS
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+// end of file //
diff --git a/final/runtime/src/kmp_ftn_extra.cpp b/final/runtime/src/kmp_ftn_extra.cpp
new file mode 100644
index 0000000..74b3e96
--- /dev/null
+++ b/final/runtime/src/kmp_ftn_extra.cpp
@@ -0,0 +1,32 @@
+/*
+ * kmp_ftn_extra.cpp -- Fortran 'extra' linkage support for OpenMP.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+#include "kmp_affinity.h"
+
+// Choose the entry-name flavor for this translation unit: KMP_FTN_PLAIN on
+// Windows, KMP_FTN_APPEND on Unix. On any other OS KMP_FTN_ENTRIES stays
+// undefined and the entry headers below are not included.
+#if KMP_OS_WINDOWS
+#define KMP_FTN_ENTRIES KMP_FTN_PLAIN
+#elif KMP_OS_UNIX
+#define KMP_FTN_ENTRIES KMP_FTN_APPEND
+#endif
+
+// Note: This string is not printed when KMP_VERSION=1.
+// The initializer is assembled from adjacent string literals: the prefix and
+// label come first, and the #ifdef below appends either "yes" or "no" and
+// terminates the declaration.
+char const __kmp_version_ftnextra[] =
+    KMP_VERSION_PREFIX "Fortran \"extra\" OMP support: "
+#ifdef KMP_FTN_ENTRIES
+                       "yes";
+// FTN_STDCALL is defined to nothing before the entry definitions are included.
+#define FTN_STDCALL /* nothing to do */
+#include "kmp_ftn_os.h"
+#include "kmp_ftn_entry.h"
+#else
+                       "no";
+#endif /* KMP_FTN_ENTRIES */
diff --git a/final/runtime/src/kmp_ftn_os.h b/final/runtime/src/kmp_ftn_os.h
new file mode 100644
index 0000000..856479c
--- /dev/null
+++ b/final/runtime/src/kmp_ftn_os.h
@@ -0,0 +1,637 @@
+/*
+ * kmp_ftn_os.h -- KPTS Fortran defines header file.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_FTN_OS_H
+#define KMP_FTN_OS_H
+
+// KMP_FTN_ENTRIES may be one of: KMP_FTN_PLAIN, KMP_FTN_UPPER, KMP_FTN_APPEND,
+// KMP_FTN_UAPPEND.
+
+/* -------------------------- External definitions ------------------------ */
+
+#if KMP_FTN_ENTRIES == KMP_FTN_PLAIN
+
+// Plain flavor: each FTN_* name maps to the lowercase kmp_*/omp_* symbol with
+// no decoration.
+// NOTE(review): unlike the KMP_FTN_APPEND section of this header, this section
+// defines no FTN_ALLOC / FTN_FREE names -- confirm whether that is intended.
+
+// kmp_* extension entry points
+#define FTN_SET_STACKSIZE kmp_set_stacksize
+#define FTN_SET_STACKSIZE_S kmp_set_stacksize_s
+#define FTN_GET_STACKSIZE kmp_get_stacksize
+#define FTN_GET_STACKSIZE_S kmp_get_stacksize_s
+#define FTN_SET_BLOCKTIME kmp_set_blocktime
+#define FTN_GET_BLOCKTIME kmp_get_blocktime
+#define FTN_SET_LIBRARY_SERIAL kmp_set_library_serial
+#define FTN_SET_LIBRARY_TURNAROUND kmp_set_library_turnaround
+#define FTN_SET_LIBRARY_THROUGHPUT kmp_set_library_throughput
+#define FTN_SET_LIBRARY kmp_set_library
+#define FTN_GET_LIBRARY kmp_get_library
+#define FTN_SET_DEFAULTS kmp_set_defaults
+#define FTN_SET_DISP_NUM_BUFFERS kmp_set_disp_num_buffers
+#define FTN_SET_AFFINITY kmp_set_affinity
+#define FTN_GET_AFFINITY kmp_get_affinity
+#define FTN_GET_AFFINITY_MAX_PROC kmp_get_affinity_max_proc
+#define FTN_CREATE_AFFINITY_MASK kmp_create_affinity_mask
+#define FTN_DESTROY_AFFINITY_MASK kmp_destroy_affinity_mask
+#define FTN_SET_AFFINITY_MASK_PROC kmp_set_affinity_mask_proc
+#define FTN_UNSET_AFFINITY_MASK_PROC kmp_unset_affinity_mask_proc
+#define FTN_GET_AFFINITY_MASK_PROC kmp_get_affinity_mask_proc
+
+// kmp_* memory allocation entry points
+#define FTN_MALLOC kmp_malloc
+#define FTN_ALIGNED_MALLOC kmp_aligned_malloc
+#define FTN_CALLOC kmp_calloc
+#define FTN_REALLOC kmp_realloc
+#define FTN_KFREE kmp_free
+
+#define FTN_GET_NUM_KNOWN_THREADS kmp_get_num_known_threads
+
+// omp_* API entry points, including the lock routines
+#define FTN_SET_NUM_THREADS omp_set_num_threads
+#define FTN_GET_NUM_THREADS omp_get_num_threads
+#define FTN_GET_MAX_THREADS omp_get_max_threads
+#define FTN_GET_THREAD_NUM omp_get_thread_num
+#define FTN_GET_NUM_PROCS omp_get_num_procs
+#define FTN_SET_DYNAMIC omp_set_dynamic
+#define FTN_GET_DYNAMIC omp_get_dynamic
+#define FTN_SET_NESTED omp_set_nested
+#define FTN_GET_NESTED omp_get_nested
+#define FTN_IN_PARALLEL omp_in_parallel
+#define FTN_GET_THREAD_LIMIT omp_get_thread_limit
+#define FTN_SET_SCHEDULE omp_set_schedule
+#define FTN_GET_SCHEDULE omp_get_schedule
+#define FTN_SET_MAX_ACTIVE_LEVELS omp_set_max_active_levels
+#define FTN_GET_MAX_ACTIVE_LEVELS omp_get_max_active_levels
+#define FTN_GET_ACTIVE_LEVEL omp_get_active_level
+#define FTN_GET_LEVEL omp_get_level
+#define FTN_GET_ANCESTOR_THREAD_NUM omp_get_ancestor_thread_num
+#define FTN_GET_TEAM_SIZE omp_get_team_size
+#define FTN_IN_FINAL omp_in_final
+#define FTN_GET_PROC_BIND omp_get_proc_bind
+#define FTN_GET_NUM_TEAMS omp_get_num_teams
+#define FTN_GET_TEAM_NUM omp_get_team_num
+#define FTN_INIT_LOCK omp_init_lock
+#if KMP_USE_DYNAMIC_LOCK
+#define FTN_INIT_LOCK_WITH_HINT omp_init_lock_with_hint
+#define FTN_INIT_NEST_LOCK_WITH_HINT omp_init_nest_lock_with_hint
+#endif
+#define FTN_DESTROY_LOCK omp_destroy_lock
+#define FTN_SET_LOCK omp_set_lock
+#define FTN_UNSET_LOCK omp_unset_lock
+#define FTN_TEST_LOCK omp_test_lock
+#define FTN_INIT_NEST_LOCK omp_init_nest_lock
+#define FTN_DESTROY_NEST_LOCK omp_destroy_nest_lock
+#define FTN_SET_NEST_LOCK omp_set_nest_lock
+#define FTN_UNSET_NEST_LOCK omp_unset_nest_lock
+#define FTN_TEST_NEST_LOCK omp_test_nest_lock
+
+#define FTN_SET_WARNINGS_ON kmp_set_warnings_on
+#define FTN_SET_WARNINGS_OFF kmp_set_warnings_off
+
+#define FTN_GET_WTIME omp_get_wtime
+#define FTN_GET_WTICK omp_get_wtick
+
+#define FTN_GET_NUM_DEVICES omp_get_num_devices
+#define FTN_GET_DEFAULT_DEVICE omp_get_default_device
+#define FTN_SET_DEFAULT_DEVICE omp_set_default_device
+#define FTN_IS_INITIAL_DEVICE omp_is_initial_device
+
+#define FTN_GET_CANCELLATION omp_get_cancellation
+#define FTN_GET_CANCELLATION_STATUS kmp_get_cancellation_status
+
+#define FTN_GET_MAX_TASK_PRIORITY omp_get_max_task_priority
+#define FTN_GET_NUM_PLACES omp_get_num_places
+#define FTN_GET_PLACE_NUM_PROCS omp_get_place_num_procs
+#define FTN_GET_PLACE_PROC_IDS omp_get_place_proc_ids
+#define FTN_GET_PLACE_NUM omp_get_place_num
+#define FTN_GET_PARTITION_NUM_PLACES omp_get_partition_num_places
+#define FTN_GET_PARTITION_PLACE_NUMS omp_get_partition_place_nums
+#define FTN_GET_INITIAL_DEVICE omp_get_initial_device
+// The omp_target_* names are only mapped when building the stub library.
+#ifdef KMP_STUB
+#define FTN_TARGET_ALLOC omp_target_alloc
+#define FTN_TARGET_FREE omp_target_free
+#define FTN_TARGET_IS_PRESENT omp_target_is_present
+#define FTN_TARGET_MEMCPY omp_target_memcpy
+#define FTN_TARGET_MEMCPY_RECT omp_target_memcpy_rect
+#define FTN_TARGET_ASSOCIATE_PTR omp_target_associate_ptr
+#define FTN_TARGET_DISASSOCIATE_PTR omp_target_disassociate_ptr
+#endif
+
+#define FTN_CONTROL_TOOL omp_control_tool
+#define FTN_INIT_ALLOCATOR omp_init_allocator
+#define FTN_DESTROY_ALLOCATOR omp_destroy_allocator
+#define FTN_SET_DEFAULT_ALLOCATOR omp_set_default_allocator
+#define FTN_GET_DEFAULT_ALLOCATOR omp_get_default_allocator
+#define FTN_GET_DEVICE_NUM omp_get_device_num
+#define FTN_SET_AFFINITY_FORMAT omp_set_affinity_format
+#define FTN_GET_AFFINITY_FORMAT omp_get_affinity_format
+#define FTN_DISPLAY_AFFINITY omp_display_affinity
+#define FTN_CAPTURE_AFFINITY omp_capture_affinity
+#define FTN_PAUSE_RESOURCE omp_pause_resource
+#define FTN_PAUSE_RESOURCE_ALL omp_pause_resource_all
+#define FTN_GET_SUPPORTED_ACTIVE_LEVELS omp_get_supported_active_levels
+#define FTN_FULFILL_EVENT omp_fulfill_event
+
+#endif /* KMP_FTN_PLAIN */
+
+/* ------------------------------------------------------------------------ */
+
+#if KMP_FTN_ENTRIES == KMP_FTN_APPEND
+
+// Append flavor: each FTN_* name maps to the lowercase kmp_*/omp_* symbol with
+// a single trailing underscore.
+
+// kmp_* extension entry points
+#define FTN_SET_STACKSIZE kmp_set_stacksize_
+#define FTN_SET_STACKSIZE_S kmp_set_stacksize_s_
+#define FTN_GET_STACKSIZE kmp_get_stacksize_
+#define FTN_GET_STACKSIZE_S kmp_get_stacksize_s_
+#define FTN_SET_BLOCKTIME kmp_set_blocktime_
+#define FTN_GET_BLOCKTIME kmp_get_blocktime_
+#define FTN_SET_LIBRARY_SERIAL kmp_set_library_serial_
+#define FTN_SET_LIBRARY_TURNAROUND kmp_set_library_turnaround_
+#define FTN_SET_LIBRARY_THROUGHPUT kmp_set_library_throughput_
+#define FTN_SET_LIBRARY kmp_set_library_
+#define FTN_GET_LIBRARY kmp_get_library_
+#define FTN_SET_DEFAULTS kmp_set_defaults_
+#define FTN_SET_DISP_NUM_BUFFERS kmp_set_disp_num_buffers_
+#define FTN_SET_AFFINITY kmp_set_affinity_
+#define FTN_GET_AFFINITY kmp_get_affinity_
+#define FTN_GET_AFFINITY_MAX_PROC kmp_get_affinity_max_proc_
+#define FTN_CREATE_AFFINITY_MASK kmp_create_affinity_mask_
+#define FTN_DESTROY_AFFINITY_MASK kmp_destroy_affinity_mask_
+#define FTN_SET_AFFINITY_MASK_PROC kmp_set_affinity_mask_proc_
+#define FTN_UNSET_AFFINITY_MASK_PROC kmp_unset_affinity_mask_proc_
+#define FTN_GET_AFFINITY_MASK_PROC kmp_get_affinity_mask_proc_
+
+// kmp_* memory allocation entry points
+#define FTN_MALLOC kmp_malloc_
+#define FTN_ALIGNED_MALLOC kmp_aligned_malloc_
+#define FTN_CALLOC kmp_calloc_
+#define FTN_REALLOC kmp_realloc_
+#define FTN_KFREE kmp_free_
+
+#define FTN_GET_NUM_KNOWN_THREADS kmp_get_num_known_threads_
+
+// omp_* API entry points, including the lock routines
+#define FTN_SET_NUM_THREADS omp_set_num_threads_
+#define FTN_GET_NUM_THREADS omp_get_num_threads_
+#define FTN_GET_MAX_THREADS omp_get_max_threads_
+#define FTN_GET_THREAD_NUM omp_get_thread_num_
+#define FTN_GET_NUM_PROCS omp_get_num_procs_
+#define FTN_SET_DYNAMIC omp_set_dynamic_
+#define FTN_GET_DYNAMIC omp_get_dynamic_
+#define FTN_SET_NESTED omp_set_nested_
+#define FTN_GET_NESTED omp_get_nested_
+#define FTN_IN_PARALLEL omp_in_parallel_
+#define FTN_GET_THREAD_LIMIT omp_get_thread_limit_
+#define FTN_SET_SCHEDULE omp_set_schedule_
+#define FTN_GET_SCHEDULE omp_get_schedule_
+#define FTN_SET_MAX_ACTIVE_LEVELS omp_set_max_active_levels_
+#define FTN_GET_MAX_ACTIVE_LEVELS omp_get_max_active_levels_
+#define FTN_GET_ACTIVE_LEVEL omp_get_active_level_
+#define FTN_GET_LEVEL omp_get_level_
+#define FTN_GET_ANCESTOR_THREAD_NUM omp_get_ancestor_thread_num_
+#define FTN_GET_TEAM_SIZE omp_get_team_size_
+#define FTN_IN_FINAL omp_in_final_
+#define FTN_GET_PROC_BIND omp_get_proc_bind_
+#define FTN_GET_NUM_TEAMS omp_get_num_teams_
+#define FTN_GET_TEAM_NUM omp_get_team_num_
+#define FTN_INIT_LOCK omp_init_lock_
+#if KMP_USE_DYNAMIC_LOCK
+#define FTN_INIT_LOCK_WITH_HINT omp_init_lock_with_hint_
+#define FTN_INIT_NEST_LOCK_WITH_HINT omp_init_nest_lock_with_hint_
+#endif
+#define FTN_DESTROY_LOCK omp_destroy_lock_
+#define FTN_SET_LOCK omp_set_lock_
+#define FTN_UNSET_LOCK omp_unset_lock_
+#define FTN_TEST_LOCK omp_test_lock_
+#define FTN_INIT_NEST_LOCK omp_init_nest_lock_
+#define FTN_DESTROY_NEST_LOCK omp_destroy_nest_lock_
+#define FTN_SET_NEST_LOCK omp_set_nest_lock_
+#define FTN_UNSET_NEST_LOCK omp_unset_nest_lock_
+#define FTN_TEST_NEST_LOCK omp_test_nest_lock_
+
+#define FTN_SET_WARNINGS_ON kmp_set_warnings_on_
+#define FTN_SET_WARNINGS_OFF kmp_set_warnings_off_
+
+#define FTN_GET_WTIME omp_get_wtime_
+#define FTN_GET_WTICK omp_get_wtick_
+
+#define FTN_GET_NUM_DEVICES omp_get_num_devices_
+#define FTN_GET_DEFAULT_DEVICE omp_get_default_device_
+#define FTN_SET_DEFAULT_DEVICE omp_set_default_device_
+#define FTN_IS_INITIAL_DEVICE omp_is_initial_device_
+
+#define FTN_GET_CANCELLATION omp_get_cancellation_
+#define FTN_GET_CANCELLATION_STATUS kmp_get_cancellation_status_
+
+#define FTN_GET_MAX_TASK_PRIORITY omp_get_max_task_priority_
+#define FTN_GET_NUM_PLACES omp_get_num_places_
+#define FTN_GET_PLACE_NUM_PROCS omp_get_place_num_procs_
+#define FTN_GET_PLACE_PROC_IDS omp_get_place_proc_ids_
+#define FTN_GET_PLACE_NUM omp_get_place_num_
+#define FTN_GET_PARTITION_NUM_PLACES omp_get_partition_num_places_
+#define FTN_GET_PARTITION_PLACE_NUMS omp_get_partition_place_nums_
+#define FTN_GET_INITIAL_DEVICE omp_get_initial_device_
+// The omp_target_* names are only mapped when building the stub library.
+#ifdef KMP_STUB
+#define FTN_TARGET_ALLOC omp_target_alloc_
+#define FTN_TARGET_FREE omp_target_free_
+#define FTN_TARGET_IS_PRESENT omp_target_is_present_
+#define FTN_TARGET_MEMCPY omp_target_memcpy_
+#define FTN_TARGET_MEMCPY_RECT omp_target_memcpy_rect_
+#define FTN_TARGET_ASSOCIATE_PTR omp_target_associate_ptr_
+#define FTN_TARGET_DISASSOCIATE_PTR omp_target_disassociate_ptr_
+#endif
+
+#define FTN_CONTROL_TOOL omp_control_tool_
+#define FTN_INIT_ALLOCATOR omp_init_allocator_
+#define FTN_DESTROY_ALLOCATOR omp_destroy_allocator_
+#define FTN_SET_DEFAULT_ALLOCATOR omp_set_default_allocator_
+#define FTN_GET_DEFAULT_ALLOCATOR omp_get_default_allocator_
+// NOTE(review): FTN_ALLOC and FTN_FREE are defined in this section but not in
+// the KMP_FTN_PLAIN or KMP_FTN_UPPER sections of this header -- confirm
+// whether the other flavors should define them too.
+#define FTN_ALLOC omp_alloc_
+#define FTN_FREE omp_free_
+#define FTN_GET_DEVICE_NUM omp_get_device_num_
+#define FTN_SET_AFFINITY_FORMAT omp_set_affinity_format_
+#define FTN_GET_AFFINITY_FORMAT omp_get_affinity_format_
+#define FTN_DISPLAY_AFFINITY omp_display_affinity_
+#define FTN_CAPTURE_AFFINITY omp_capture_affinity_
+#define FTN_PAUSE_RESOURCE omp_pause_resource_
+#define FTN_PAUSE_RESOURCE_ALL omp_pause_resource_all_
+#define FTN_GET_SUPPORTED_ACTIVE_LEVELS omp_get_supported_active_levels_
+#define FTN_FULFILL_EVENT omp_fulfill_event_
+
+#endif /* KMP_FTN_APPEND */
+
+/* ------------------------------------------------------------------------ */
+
+#if KMP_FTN_ENTRIES == KMP_FTN_UPPER
+
+// Upper flavor: each FTN_* name maps to the uppercase KMP_*/OMP_* symbol with
+// no decoration.
+// NOTE(review): unlike the KMP_FTN_APPEND section of this header, this section
+// defines no FTN_ALLOC / FTN_FREE names -- confirm whether that is intended.
+
+// KMP_* extension entry points
+#define FTN_SET_STACKSIZE KMP_SET_STACKSIZE
+#define FTN_SET_STACKSIZE_S KMP_SET_STACKSIZE_S
+#define FTN_GET_STACKSIZE KMP_GET_STACKSIZE
+#define FTN_GET_STACKSIZE_S KMP_GET_STACKSIZE_S
+#define FTN_SET_BLOCKTIME KMP_SET_BLOCKTIME
+#define FTN_GET_BLOCKTIME KMP_GET_BLOCKTIME
+#define FTN_SET_LIBRARY_SERIAL KMP_SET_LIBRARY_SERIAL
+#define FTN_SET_LIBRARY_TURNAROUND KMP_SET_LIBRARY_TURNAROUND
+#define FTN_SET_LIBRARY_THROUGHPUT KMP_SET_LIBRARY_THROUGHPUT
+#define FTN_SET_LIBRARY KMP_SET_LIBRARY
+#define FTN_GET_LIBRARY KMP_GET_LIBRARY
+#define FTN_SET_DEFAULTS KMP_SET_DEFAULTS
+#define FTN_SET_DISP_NUM_BUFFERS KMP_SET_DISP_NUM_BUFFERS
+#define FTN_SET_AFFINITY KMP_SET_AFFINITY
+#define FTN_GET_AFFINITY KMP_GET_AFFINITY
+#define FTN_GET_AFFINITY_MAX_PROC KMP_GET_AFFINITY_MAX_PROC
+#define FTN_CREATE_AFFINITY_MASK KMP_CREATE_AFFINITY_MASK
+#define FTN_DESTROY_AFFINITY_MASK KMP_DESTROY_AFFINITY_MASK
+#define FTN_SET_AFFINITY_MASK_PROC KMP_SET_AFFINITY_MASK_PROC
+#define FTN_UNSET_AFFINITY_MASK_PROC KMP_UNSET_AFFINITY_MASK_PROC
+#define FTN_GET_AFFINITY_MASK_PROC KMP_GET_AFFINITY_MASK_PROC
+
+// KMP_* memory allocation entry points
+#define FTN_MALLOC KMP_MALLOC
+#define FTN_ALIGNED_MALLOC KMP_ALIGNED_MALLOC
+#define FTN_CALLOC KMP_CALLOC
+#define FTN_REALLOC KMP_REALLOC
+#define FTN_KFREE KMP_FREE
+
+#define FTN_GET_NUM_KNOWN_THREADS KMP_GET_NUM_KNOWN_THREADS
+
+// OMP_* API entry points, including the lock routines
+#define FTN_SET_NUM_THREADS OMP_SET_NUM_THREADS
+#define FTN_GET_NUM_THREADS OMP_GET_NUM_THREADS
+#define FTN_GET_MAX_THREADS OMP_GET_MAX_THREADS
+#define FTN_GET_THREAD_NUM OMP_GET_THREAD_NUM
+#define FTN_GET_NUM_PROCS OMP_GET_NUM_PROCS
+#define FTN_SET_DYNAMIC OMP_SET_DYNAMIC
+#define FTN_GET_DYNAMIC OMP_GET_DYNAMIC
+#define FTN_SET_NESTED OMP_SET_NESTED
+#define FTN_GET_NESTED OMP_GET_NESTED
+#define FTN_IN_PARALLEL OMP_IN_PARALLEL
+#define FTN_GET_THREAD_LIMIT OMP_GET_THREAD_LIMIT
+#define FTN_SET_SCHEDULE OMP_SET_SCHEDULE
+#define FTN_GET_SCHEDULE OMP_GET_SCHEDULE
+#define FTN_SET_MAX_ACTIVE_LEVELS OMP_SET_MAX_ACTIVE_LEVELS
+#define FTN_GET_MAX_ACTIVE_LEVELS OMP_GET_MAX_ACTIVE_LEVELS
+#define FTN_GET_ACTIVE_LEVEL OMP_GET_ACTIVE_LEVEL
+#define FTN_GET_LEVEL OMP_GET_LEVEL
+#define FTN_GET_ANCESTOR_THREAD_NUM OMP_GET_ANCESTOR_THREAD_NUM
+#define FTN_GET_TEAM_SIZE OMP_GET_TEAM_SIZE
+#define FTN_IN_FINAL OMP_IN_FINAL
+#define FTN_GET_PROC_BIND OMP_GET_PROC_BIND
+#define FTN_GET_NUM_TEAMS OMP_GET_NUM_TEAMS
+#define FTN_GET_TEAM_NUM OMP_GET_TEAM_NUM
+#define FTN_INIT_LOCK OMP_INIT_LOCK
+#if KMP_USE_DYNAMIC_LOCK
+#define FTN_INIT_LOCK_WITH_HINT OMP_INIT_LOCK_WITH_HINT
+#define FTN_INIT_NEST_LOCK_WITH_HINT OMP_INIT_NEST_LOCK_WITH_HINT
+#endif
+#define FTN_DESTROY_LOCK OMP_DESTROY_LOCK
+#define FTN_SET_LOCK OMP_SET_LOCK
+#define FTN_UNSET_LOCK OMP_UNSET_LOCK
+#define FTN_TEST_LOCK OMP_TEST_LOCK
+#define FTN_INIT_NEST_LOCK OMP_INIT_NEST_LOCK
+#define FTN_DESTROY_NEST_LOCK OMP_DESTROY_NEST_LOCK
+#define FTN_SET_NEST_LOCK OMP_SET_NEST_LOCK
+#define FTN_UNSET_NEST_LOCK OMP_UNSET_NEST_LOCK
+#define FTN_TEST_NEST_LOCK OMP_TEST_NEST_LOCK
+
+#define FTN_SET_WARNINGS_ON KMP_SET_WARNINGS_ON
+#define FTN_SET_WARNINGS_OFF KMP_SET_WARNINGS_OFF
+
+#define FTN_GET_WTIME OMP_GET_WTIME
+#define FTN_GET_WTICK OMP_GET_WTICK
+
+#define FTN_GET_NUM_DEVICES OMP_GET_NUM_DEVICES
+#define FTN_GET_DEFAULT_DEVICE OMP_GET_DEFAULT_DEVICE
+#define FTN_SET_DEFAULT_DEVICE OMP_SET_DEFAULT_DEVICE
+#define FTN_IS_INITIAL_DEVICE OMP_IS_INITIAL_DEVICE
+
+#define FTN_GET_CANCELLATION OMP_GET_CANCELLATION
+#define FTN_GET_CANCELLATION_STATUS KMP_GET_CANCELLATION_STATUS
+
+#define FTN_GET_MAX_TASK_PRIORITY OMP_GET_MAX_TASK_PRIORITY
+#define FTN_GET_NUM_PLACES OMP_GET_NUM_PLACES
+#define FTN_GET_PLACE_NUM_PROCS OMP_GET_PLACE_NUM_PROCS
+#define FTN_GET_PLACE_PROC_IDS OMP_GET_PLACE_PROC_IDS
+#define FTN_GET_PLACE_NUM OMP_GET_PLACE_NUM
+#define FTN_GET_PARTITION_NUM_PLACES OMP_GET_PARTITION_NUM_PLACES
+#define FTN_GET_PARTITION_PLACE_NUMS OMP_GET_PARTITION_PLACE_NUMS
+#define FTN_GET_INITIAL_DEVICE OMP_GET_INITIAL_DEVICE
+// The OMP_TARGET_* names are only mapped when building the stub library.
+#ifdef KMP_STUB
+#define FTN_TARGET_ALLOC OMP_TARGET_ALLOC
+#define FTN_TARGET_FREE OMP_TARGET_FREE
+#define FTN_TARGET_IS_PRESENT OMP_TARGET_IS_PRESENT
+#define FTN_TARGET_MEMCPY OMP_TARGET_MEMCPY
+#define FTN_TARGET_MEMCPY_RECT OMP_TARGET_MEMCPY_RECT
+#define FTN_TARGET_ASSOCIATE_PTR OMP_TARGET_ASSOCIATE_PTR
+#define FTN_TARGET_DISASSOCIATE_PTR OMP_TARGET_DISASSOCIATE_PTR
+#endif
+
+#define FTN_CONTROL_TOOL OMP_CONTROL_TOOL
+#define FTN_INIT_ALLOCATOR OMP_INIT_ALLOCATOR
+#define FTN_DESTROY_ALLOCATOR OMP_DESTROY_ALLOCATOR
+#define FTN_SET_DEFAULT_ALLOCATOR OMP_SET_DEFAULT_ALLOCATOR
+#define FTN_GET_DEFAULT_ALLOCATOR OMP_GET_DEFAULT_ALLOCATOR
+#define FTN_GET_DEVICE_NUM OMP_GET_DEVICE_NUM
+#define FTN_SET_AFFINITY_FORMAT OMP_SET_AFFINITY_FORMAT
+#define FTN_GET_AFFINITY_FORMAT OMP_GET_AFFINITY_FORMAT
+#define FTN_DISPLAY_AFFINITY OMP_DISPLAY_AFFINITY
+#define FTN_CAPTURE_AFFINITY OMP_CAPTURE_AFFINITY
+#define FTN_PAUSE_RESOURCE OMP_PAUSE_RESOURCE
+#define FTN_PAUSE_RESOURCE_ALL OMP_PAUSE_RESOURCE_ALL
+#define FTN_GET_SUPPORTED_ACTIVE_LEVELS OMP_GET_SUPPORTED_ACTIVE_LEVELS
+#define FTN_FULFILL_EVENT OMP_FULFILL_EVENT
+
+#endif /* KMP_FTN_UPPER */
+
+/* ------------------------------------------------------------------------ */
+
+#if KMP_FTN_ENTRIES == KMP_FTN_UAPPEND
+
+#define FTN_SET_STACKSIZE KMP_SET_STACKSIZE_
+#define FTN_SET_STACKSIZE_S KMP_SET_STACKSIZE_S_
+#define FTN_GET_STACKSIZE KMP_GET_STACKSIZE_
+#define FTN_GET_STACKSIZE_S KMP_GET_STACKSIZE_S_
+#define FTN_SET_BLOCKTIME KMP_SET_BLOCKTIME_
+#define FTN_GET_BLOCKTIME KMP_GET_BLOCKTIME_
+#define FTN_SET_LIBRARY_SERIAL KMP_SET_LIBRARY_SERIAL_
+#define FTN_SET_LIBRARY_TURNAROUND KMP_SET_LIBRARY_TURNAROUND_
+#define FTN_SET_LIBRARY_THROUGHPUT KMP_SET_LIBRARY_THROUGHPUT_
+#define FTN_SET_LIBRARY KMP_SET_LIBRARY_
+#define FTN_GET_LIBRARY KMP_GET_LIBRARY_
+#define FTN_SET_DEFAULTS KMP_SET_DEFAULTS_
+#define FTN_SET_DISP_NUM_BUFFERS KMP_SET_DISP_NUM_BUFFERS_
+#define FTN_SET_AFFINITY KMP_SET_AFFINITY_
+#define FTN_GET_AFFINITY KMP_GET_AFFINITY_
+#define FTN_GET_AFFINITY_MAX_PROC KMP_GET_AFFINITY_MAX_PROC_
+#define FTN_CREATE_AFFINITY_MASK KMP_CREATE_AFFINITY_MASK_
+#define FTN_DESTROY_AFFINITY_MASK KMP_DESTROY_AFFINITY_MASK_
+#define FTN_SET_AFFINITY_MASK_PROC KMP_SET_AFFINITY_MASK_PROC_
+#define FTN_UNSET_AFFINITY_MASK_PROC KMP_UNSET_AFFINITY_MASK_PROC_
+#define FTN_GET_AFFINITY_MASK_PROC KMP_GET_AFFINITY_MASK_PROC_
+
+#define FTN_MALLOC KMP_MALLOC_
+#define FTN_ALIGNED_MALLOC KMP_ALIGNED_MALLOC_
+#define FTN_CALLOC KMP_CALLOC_
+#define FTN_REALLOC KMP_REALLOC_
+#define FTN_KFREE KMP_FREE_
+
+#define FTN_GET_NUM_KNOWN_THREADS KMP_GET_NUM_KNOWN_THREADS_
+
+#define FTN_SET_NUM_THREADS OMP_SET_NUM_THREADS_
+#define FTN_GET_NUM_THREADS OMP_GET_NUM_THREADS_
+#define FTN_GET_MAX_THREADS OMP_GET_MAX_THREADS_
+#define FTN_GET_THREAD_NUM OMP_GET_THREAD_NUM_
+#define FTN_GET_NUM_PROCS OMP_GET_NUM_PROCS_
+#define FTN_SET_DYNAMIC OMP_SET_DYNAMIC_
+#define FTN_GET_DYNAMIC OMP_GET_DYNAMIC_
+#define FTN_SET_NESTED OMP_SET_NESTED_
+#define FTN_GET_NESTED OMP_GET_NESTED_
+#define FTN_IN_PARALLEL OMP_IN_PARALLEL_
+#define FTN_GET_THREAD_LIMIT OMP_GET_THREAD_LIMIT_
+#define FTN_SET_SCHEDULE OMP_SET_SCHEDULE_
+#define FTN_GET_SCHEDULE OMP_GET_SCHEDULE_
+#define FTN_SET_MAX_ACTIVE_LEVELS OMP_SET_MAX_ACTIVE_LEVELS_
+#define FTN_GET_MAX_ACTIVE_LEVELS OMP_GET_MAX_ACTIVE_LEVELS_
+#define FTN_GET_ACTIVE_LEVEL OMP_GET_ACTIVE_LEVEL_
+#define FTN_GET_LEVEL OMP_GET_LEVEL_
+#define FTN_GET_ANCESTOR_THREAD_NUM OMP_GET_ANCESTOR_THREAD_NUM_
+#define FTN_GET_TEAM_SIZE OMP_GET_TEAM_SIZE_
+#define FTN_IN_FINAL OMP_IN_FINAL_
+#define FTN_GET_PROC_BIND OMP_GET_PROC_BIND_
+#define FTN_GET_NUM_TEAMS OMP_GET_NUM_TEAMS_
+#define FTN_GET_TEAM_NUM OMP_GET_TEAM_NUM_
+#define FTN_INIT_LOCK OMP_INIT_LOCK_
+#if KMP_USE_DYNAMIC_LOCK
+#define FTN_INIT_LOCK_WITH_HINT OMP_INIT_LOCK_WITH_HINT_
+#define FTN_INIT_NEST_LOCK_WITH_HINT OMP_INIT_NEST_LOCK_WITH_HINT_
+#endif
+#define FTN_DESTROY_LOCK OMP_DESTROY_LOCK_
+#define FTN_SET_LOCK OMP_SET_LOCK_
+#define FTN_UNSET_LOCK OMP_UNSET_LOCK_
+#define FTN_TEST_LOCK OMP_TEST_LOCK_
+#define FTN_INIT_NEST_LOCK OMP_INIT_NEST_LOCK_
+#define FTN_DESTROY_NEST_LOCK OMP_DESTROY_NEST_LOCK_
+#define FTN_SET_NEST_LOCK OMP_SET_NEST_LOCK_
+#define FTN_UNSET_NEST_LOCK OMP_UNSET_NEST_LOCK_
+#define FTN_TEST_NEST_LOCK OMP_TEST_NEST_LOCK_
+
+#define FTN_SET_WARNINGS_ON KMP_SET_WARNINGS_ON_
+#define FTN_SET_WARNINGS_OFF KMP_SET_WARNINGS_OFF_
+
+#define FTN_GET_WTIME OMP_GET_WTIME_
+#define FTN_GET_WTICK OMP_GET_WTICK_
+
+#define FTN_GET_NUM_DEVICES OMP_GET_NUM_DEVICES_
+#define FTN_GET_DEFAULT_DEVICE OMP_GET_DEFAULT_DEVICE_
+#define FTN_SET_DEFAULT_DEVICE OMP_SET_DEFAULT_DEVICE_
+#define FTN_IS_INITIAL_DEVICE OMP_IS_INITIAL_DEVICE_
+
+#define FTN_GET_CANCELLATION OMP_GET_CANCELLATION_
+#define FTN_GET_CANCELLATION_STATUS KMP_GET_CANCELLATION_STATUS_
+
+#define FTN_GET_MAX_TASK_PRIORITY OMP_GET_MAX_TASK_PRIORITY_
+#define FTN_GET_NUM_PLACES OMP_GET_NUM_PLACES_
+#define FTN_GET_PLACE_NUM_PROCS OMP_GET_PLACE_NUM_PROCS_
+#define FTN_GET_PLACE_PROC_IDS OMP_GET_PLACE_PROC_IDS_
+#define FTN_GET_PLACE_NUM OMP_GET_PLACE_NUM_
+#define FTN_GET_PARTITION_NUM_PLACES OMP_GET_PARTITION_NUM_PLACES_
+#define FTN_GET_PARTITION_PLACE_NUMS OMP_GET_PARTITION_PLACE_NUMS_
+#define FTN_GET_INITIAL_DEVICE OMP_GET_INITIAL_DEVICE_
+#ifdef KMP_STUB
+#define FTN_TARGET_ALLOC OMP_TARGET_ALLOC_
+#define FTN_TARGET_FREE OMP_TARGET_FREE_
+#define FTN_TARGET_IS_PRESENT OMP_TARGET_IS_PRESENT_
+#define FTN_TARGET_MEMCPY OMP_TARGET_MEMCPY_
+#define FTN_TARGET_MEMCPY_RECT OMP_TARGET_MEMCPY_RECT_
+#define FTN_TARGET_ASSOCIATE_PTR OMP_TARGET_ASSOCIATE_PTR_
+#define FTN_TARGET_DISASSOCIATE_PTR OMP_TARGET_DISASSOCIATE_PTR_
+#endif
+
+#define FTN_CONTROL_TOOL OMP_CONTROL_TOOL_
+#define FTN_INIT_ALLOCATOR OMP_INIT_ALLOCATOR_
+#define FTN_DESTROY_ALLOCATOR OMP_DESTROY_ALLOCATOR_
+#define FTN_SET_DEFAULT_ALLOCATOR OMP_SET_DEFAULT_ALLOCATOR_
+#define FTN_GET_DEFAULT_ALLOCATOR OMP_GET_DEFAULT_ALLOCATOR_
+#define FTN_ALLOC OMP_ALLOC_
+#define FTN_FREE OMP_FREE_
+#define FTN_GET_DEVICE_NUM OMP_GET_DEVICE_NUM_
+#define FTN_SET_AFFINITY_FORMAT OMP_SET_AFFINITY_FORMAT_
+#define FTN_GET_AFFINITY_FORMAT OMP_GET_AFFINITY_FORMAT_
+#define FTN_DISPLAY_AFFINITY OMP_DISPLAY_AFFINITY_
+#define FTN_CAPTURE_AFFINITY OMP_CAPTURE_AFFINITY_
+#define FTN_PAUSE_RESOURCE OMP_PAUSE_RESOURCE_
+#define FTN_PAUSE_RESOURCE_ALL OMP_PAUSE_RESOURCE_ALL_
+#define FTN_GET_SUPPORTED_ACTIVE_LEVELS OMP_GET_SUPPORTED_ACTIVE_LEVELS_
+#define FTN_FULFILL_EVENT OMP_FULFILL_EVENT_
+
+#endif /* KMP_FTN_UAPPEND */
+
+/* -------------------------- GOMP API NAMES ------------------------ */
+// All GOMP_1.0 symbols
+#define KMP_API_NAME_GOMP_ATOMIC_END GOMP_atomic_end
+#define KMP_API_NAME_GOMP_ATOMIC_START GOMP_atomic_start
+#define KMP_API_NAME_GOMP_BARRIER GOMP_barrier
+#define KMP_API_NAME_GOMP_CRITICAL_END GOMP_critical_end
+#define KMP_API_NAME_GOMP_CRITICAL_NAME_END GOMP_critical_name_end
+#define KMP_API_NAME_GOMP_CRITICAL_NAME_START GOMP_critical_name_start
+#define KMP_API_NAME_GOMP_CRITICAL_START GOMP_critical_start
+#define KMP_API_NAME_GOMP_LOOP_DYNAMIC_NEXT GOMP_loop_dynamic_next
+#define KMP_API_NAME_GOMP_LOOP_DYNAMIC_START GOMP_loop_dynamic_start
+#define KMP_API_NAME_GOMP_LOOP_END GOMP_loop_end
+#define KMP_API_NAME_GOMP_LOOP_END_NOWAIT GOMP_loop_end_nowait
+#define KMP_API_NAME_GOMP_LOOP_GUIDED_NEXT GOMP_loop_guided_next
+#define KMP_API_NAME_GOMP_LOOP_GUIDED_START GOMP_loop_guided_start
+#define KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_NEXT                            \
+  GOMP_loop_ordered_dynamic_next
+#define KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_START                           \
+  GOMP_loop_ordered_dynamic_start
+#define KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_NEXT GOMP_loop_ordered_guided_next
+#define KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_START                            \
+  GOMP_loop_ordered_guided_start
+#define KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_NEXT                            \
+  GOMP_loop_ordered_runtime_next
+#define KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_START                           \
+  GOMP_loop_ordered_runtime_start
+#define KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_NEXT GOMP_loop_ordered_static_next
+#define KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_START                            \
+  GOMP_loop_ordered_static_start
+#define KMP_API_NAME_GOMP_LOOP_RUNTIME_NEXT GOMP_loop_runtime_next
+#define KMP_API_NAME_GOMP_LOOP_RUNTIME_START GOMP_loop_runtime_start
+#define KMP_API_NAME_GOMP_LOOP_STATIC_NEXT GOMP_loop_static_next
+#define KMP_API_NAME_GOMP_LOOP_STATIC_START GOMP_loop_static_start
+#define KMP_API_NAME_GOMP_ORDERED_END GOMP_ordered_end
+#define KMP_API_NAME_GOMP_ORDERED_START GOMP_ordered_start
+#define KMP_API_NAME_GOMP_PARALLEL_END GOMP_parallel_end
+#define KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC_START                          \
+  GOMP_parallel_loop_dynamic_start
+#define KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED_START                           \
+  GOMP_parallel_loop_guided_start
+#define KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME_START                          \
+  GOMP_parallel_loop_runtime_start
+#define KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC_START                           \
+  GOMP_parallel_loop_static_start
+#define KMP_API_NAME_GOMP_PARALLEL_SECTIONS_START GOMP_parallel_sections_start
+#define KMP_API_NAME_GOMP_PARALLEL_START GOMP_parallel_start
+#define KMP_API_NAME_GOMP_SECTIONS_END GOMP_sections_end
+#define KMP_API_NAME_GOMP_SECTIONS_END_NOWAIT GOMP_sections_end_nowait
+#define KMP_API_NAME_GOMP_SECTIONS_NEXT GOMP_sections_next
+#define KMP_API_NAME_GOMP_SECTIONS_START GOMP_sections_start
+#define KMP_API_NAME_GOMP_SINGLE_COPY_END GOMP_single_copy_end
+#define KMP_API_NAME_GOMP_SINGLE_COPY_START GOMP_single_copy_start
+#define KMP_API_NAME_GOMP_SINGLE_START GOMP_single_start
+
+// All GOMP_2.0 symbols
+#define KMP_API_NAME_GOMP_TASK GOMP_task
+#define KMP_API_NAME_GOMP_TASKWAIT GOMP_taskwait
+#define KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_NEXT GOMP_loop_ull_dynamic_next
+#define KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_START GOMP_loop_ull_dynamic_start
+#define KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_NEXT GOMP_loop_ull_guided_next
+#define KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_START GOMP_loop_ull_guided_start
+#define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_NEXT                        \
+  GOMP_loop_ull_ordered_dynamic_next
+#define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_START                       \
+  GOMP_loop_ull_ordered_dynamic_start
+#define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_NEXT                         \
+  GOMP_loop_ull_ordered_guided_next
+#define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_START                        \
+  GOMP_loop_ull_ordered_guided_start
+#define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_NEXT                        \
+  GOMP_loop_ull_ordered_runtime_next
+#define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_START                       \
+  GOMP_loop_ull_ordered_runtime_start
+#define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_NEXT                         \
+  GOMP_loop_ull_ordered_static_next
+#define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_START                        \
+  GOMP_loop_ull_ordered_static_start
+#define KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_NEXT GOMP_loop_ull_runtime_next
+#define KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_START GOMP_loop_ull_runtime_start
+#define KMP_API_NAME_GOMP_LOOP_ULL_STATIC_NEXT GOMP_loop_ull_static_next
+#define KMP_API_NAME_GOMP_LOOP_ULL_STATIC_START GOMP_loop_ull_static_start
+
+// All GOMP_3.0 symbols
+#define KMP_API_NAME_GOMP_TASKYIELD GOMP_taskyield
+
+// All GOMP_4.0 symbols
+#define KMP_API_NAME_GOMP_BARRIER_CANCEL GOMP_barrier_cancel
+#define KMP_API_NAME_GOMP_CANCEL GOMP_cancel
+#define KMP_API_NAME_GOMP_CANCELLATION_POINT GOMP_cancellation_point
+#define KMP_API_NAME_GOMP_LOOP_END_CANCEL GOMP_loop_end_cancel
+#define KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC GOMP_parallel_loop_dynamic
+#define KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED GOMP_parallel_loop_guided
+#define KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME GOMP_parallel_loop_runtime
+#define KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC GOMP_parallel_loop_static
+#define KMP_API_NAME_GOMP_PARALLEL_SECTIONS GOMP_parallel_sections
+#define KMP_API_NAME_GOMP_PARALLEL GOMP_parallel
+#define KMP_API_NAME_GOMP_SECTIONS_END_CANCEL GOMP_sections_end_cancel
+#define KMP_API_NAME_GOMP_TASKGROUP_START GOMP_taskgroup_start
+#define KMP_API_NAME_GOMP_TASKGROUP_END GOMP_taskgroup_end
+/* Target functions should be taken care of by liboffload */
+#define KMP_API_NAME_GOMP_TARGET GOMP_target
+#define KMP_API_NAME_GOMP_TARGET_DATA GOMP_target_data
+#define KMP_API_NAME_GOMP_TARGET_END_DATA GOMP_target_end_data
+#define KMP_API_NAME_GOMP_TARGET_UPDATE GOMP_target_update
+#define KMP_API_NAME_GOMP_TEAMS GOMP_teams
+
+// All GOMP_4.5 symbols
+#define KMP_API_NAME_GOMP_TASKLOOP GOMP_taskloop
+#define KMP_API_NAME_GOMP_TASKLOOP_ULL GOMP_taskloop_ull
+#define KMP_API_NAME_GOMP_DOACROSS_POST GOMP_doacross_post
+#define KMP_API_NAME_GOMP_DOACROSS_WAIT GOMP_doacross_wait
+#define KMP_API_NAME_GOMP_LOOP_DOACROSS_STATIC_START                           \
+  GOMP_loop_doacross_static_start
+#define KMP_API_NAME_GOMP_LOOP_DOACROSS_DYNAMIC_START                          \
+  GOMP_loop_doacross_dynamic_start
+#define KMP_API_NAME_GOMP_LOOP_DOACROSS_GUIDED_START                           \
+  GOMP_loop_doacross_guided_start
+#define KMP_API_NAME_GOMP_LOOP_DOACROSS_RUNTIME_START                          \
+  GOMP_loop_doacross_runtime_start
+#define KMP_API_NAME_GOMP_DOACROSS_ULL_POST GOMP_doacross_ull_post
+#define KMP_API_NAME_GOMP_DOACROSS_ULL_WAIT GOMP_doacross_ull_wait
+#define KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_STATIC_START                       \
+  GOMP_loop_ull_doacross_static_start
+#define KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_DYNAMIC_START                      \
+  GOMP_loop_ull_doacross_dynamic_start
+#define KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_GUIDED_START                       \
+  GOMP_loop_ull_doacross_guided_start
+#define KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_RUNTIME_START                      \
+  GOMP_loop_ull_doacross_runtime_start
+
+#endif /* KMP_FTN_OS_H */
diff --git a/final/runtime/src/kmp_ftn_stdcall.cpp b/final/runtime/src/kmp_ftn_stdcall.cpp
new file mode 100644
index 0000000..174c219
--- /dev/null
+++ b/final/runtime/src/kmp_ftn_stdcall.cpp
@@ -0,0 +1,32 @@
+/*
+ * kmp_ftn_stdcall.cpp -- Fortran __stdcall linkage support for OpenMP.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+
+// Note: This string is not printed when KMP_VERSION=1.
+char const __kmp_version_ftnstdcall[] =
+    KMP_VERSION_PREFIX "Fortran __stdcall OMP support: "
+#ifdef USE_FTN_STDCALL
+                       "yes";
+#else
+                       "no";
+#endif
+
+#ifdef USE_FTN_STDCALL
+
+#define FTN_STDCALL KMP_STDCALL
+#define KMP_FTN_ENTRIES USE_FTN_STDCALL
+
+#include "kmp_ftn_entry.h"
+#include "kmp_ftn_os.h"
+
+#endif /* USE_FTN_STDCALL */
diff --git a/final/runtime/src/kmp_global.cpp b/final/runtime/src/kmp_global.cpp
new file mode 100644
index 0000000..1ec73b8
--- /dev/null
+++ b/final/runtime/src/kmp_global.cpp
@@ -0,0 +1,534 @@
+/*
+ * kmp_global.cpp -- KPTS global variables for runtime support library
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+#include "kmp_affinity.h"
+#if KMP_USE_HIER_SCHED
+#include "kmp_dispatch_hier.h"
+#endif
+
+kmp_key_t __kmp_gtid_threadprivate_key;
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+kmp_cpuinfo_t __kmp_cpuinfo = {0}; // Not initialized
+#endif
+
+#if KMP_STATS_ENABLED
+#include "kmp_stats.h"
+// lock for modifying the global __kmp_stats_list
+kmp_tas_lock_t __kmp_stats_lock;
+
+// global list of per thread stats, the head is a sentinel node which
+// accumulates all stats produced before __kmp_create_worker is called.
+kmp_stats_list *__kmp_stats_list;
+
+// thread local pointer to stats node within list
+KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr = NULL;
+
+// gives reference tick for all events (considered the 0 tick)
+tsc_tick_count __kmp_stats_start_time;
+#endif
+
+/* ----------------------------------------------------- */
+/* INITIALIZATION VARIABLES */
+/* they are synchronized to write during init, but read anytime */
+volatile int __kmp_init_serial = FALSE;
+volatile int __kmp_init_gtid = FALSE;
+volatile int __kmp_init_common = FALSE;
+volatile int __kmp_init_middle = FALSE;
+volatile int __kmp_init_parallel = FALSE;
+#if KMP_USE_MONITOR
+volatile int __kmp_init_monitor =
+    0; /* 1 - launched, 2 - actually started (Windows* OS only) */
+#endif
+volatile int __kmp_init_user_locks = FALSE;
+
+/* list of address of allocated caches for commons */
+kmp_cached_addr_t *__kmp_threadpriv_cache_list = NULL;
+
+int __kmp_init_counter = 0;
+int __kmp_root_counter = 0;
+int __kmp_version = 0;
+
+std::atomic<kmp_int32> __kmp_team_counter = ATOMIC_VAR_INIT(0);
+std::atomic<kmp_int32> __kmp_task_counter = ATOMIC_VAR_INIT(0);
+
+size_t __kmp_stksize = KMP_DEFAULT_STKSIZE;
+#if KMP_USE_MONITOR
+size_t __kmp_monitor_stksize = 0; // auto adjust
+#endif
+size_t __kmp_stkoffset = KMP_DEFAULT_STKOFFSET;
+int __kmp_stkpadding = KMP_MIN_STKPADDING;
+
+size_t __kmp_malloc_pool_incr = KMP_DEFAULT_MALLOC_POOL_INCR;
+
+// Barrier method defaults, settings, and strings.
+// branch factor = 2^branch_bits (only relevant for tree & hyper barrier types)
+kmp_uint32 __kmp_barrier_gather_bb_dflt = 2;
+/* branch_factor = 4 */ /* hyper2: C78980 */
+kmp_uint32 __kmp_barrier_release_bb_dflt = 2;
+/* branch_factor = 4 */ /* hyper2: C78980 */
+
+kmp_bar_pat_e __kmp_barrier_gather_pat_dflt = bp_hyper_bar;
+/* hyper2: C78980 */
+kmp_bar_pat_e __kmp_barrier_release_pat_dflt = bp_hyper_bar;
+/* hyper2: C78980 */
+
+kmp_uint32 __kmp_barrier_gather_branch_bits[bs_last_barrier] = {0};
+kmp_uint32 __kmp_barrier_release_branch_bits[bs_last_barrier] = {0};
+kmp_bar_pat_e __kmp_barrier_gather_pattern[bs_last_barrier] = {bp_linear_bar};
+kmp_bar_pat_e __kmp_barrier_release_pattern[bs_last_barrier] = {bp_linear_bar};
+char const *__kmp_barrier_branch_bit_env_name[bs_last_barrier] = {
+    "KMP_PLAIN_BARRIER", "KMP_FORKJOIN_BARRIER"
+#if KMP_FAST_REDUCTION_BARRIER
+    ,
+    "KMP_REDUCTION_BARRIER"
+#endif // KMP_FAST_REDUCTION_BARRIER
+};
+char const *__kmp_barrier_pattern_env_name[bs_last_barrier] = {
+    "KMP_PLAIN_BARRIER_PATTERN", "KMP_FORKJOIN_BARRIER_PATTERN"
+#if KMP_FAST_REDUCTION_BARRIER
+    ,
+    "KMP_REDUCTION_BARRIER_PATTERN"
+#endif // KMP_FAST_REDUCTION_BARRIER
+};
+char const *__kmp_barrier_type_name[bs_last_barrier] = {"plain", "forkjoin"
+#if KMP_FAST_REDUCTION_BARRIER
+                                                        ,
+                                                        "reduction"
+#endif // KMP_FAST_REDUCTION_BARRIER
+};
+char const *__kmp_barrier_pattern_name[bp_last_bar] = {"linear", "tree",
+                                                       "hyper", "hierarchical"};
+
+int __kmp_allThreadsSpecified = 0;
+size_t __kmp_align_alloc = CACHE_LINE;
+
+int __kmp_generate_warnings = kmp_warnings_low;
+int __kmp_reserve_warn = 0;
+int __kmp_xproc = 0;
+int __kmp_avail_proc = 0;
+size_t __kmp_sys_min_stksize = KMP_MIN_STKSIZE;
+int __kmp_sys_max_nth = KMP_MAX_NTH;
+int __kmp_max_nth = 0;
+int __kmp_cg_max_nth = 0;
+int __kmp_teams_max_nth = 0;
+int __kmp_threads_capacity = 0;
+int __kmp_dflt_team_nth = 0;
+int __kmp_dflt_team_nth_ub = 0;
+int __kmp_tp_capacity = 0;
+int __kmp_tp_cached = 0;
+int __kmp_dispatch_num_buffers = KMP_DFLT_DISP_NUM_BUFF;
+int __kmp_dflt_max_active_levels = 1; // Nesting off by default
+bool __kmp_dflt_max_active_levels_set = false; // Don't override set value
+#if KMP_NESTED_HOT_TEAMS
+int __kmp_hot_teams_mode = 0; /* 0 - free extra threads when reduced */
+/* 1 - keep extra threads when reduced */
+int __kmp_hot_teams_max_level = 1; /* nesting level of hot teams */
+#endif
+enum library_type __kmp_library = library_none;
+enum sched_type __kmp_sched =
+    kmp_sch_default; /* scheduling method for runtime scheduling */
+enum sched_type __kmp_static =
+    kmp_sch_static_greedy; /* default static scheduling method */
+enum sched_type __kmp_guided =
+    kmp_sch_guided_iterative_chunked; /* default guided scheduling method */
+enum sched_type __kmp_auto =
+    kmp_sch_guided_analytical_chunked; /* default auto scheduling method */
+#if KMP_USE_HIER_SCHED
+int __kmp_dispatch_hand_threading = 0;
+int __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LAST + 1];
+int __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LAST + 1];
+kmp_hier_sched_env_t __kmp_hier_scheds = {0, 0, NULL, NULL, NULL};
+#endif
+int __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
+#if KMP_USE_MONITOR
+int __kmp_monitor_wakeups = KMP_MIN_MONITOR_WAKEUPS;
+int __kmp_bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(KMP_DEFAULT_BLOCKTIME,
+                                                      KMP_MIN_MONITOR_WAKEUPS);
+#endif
+#ifdef KMP_ADJUST_BLOCKTIME
+int __kmp_zero_bt = FALSE;
+#endif /* KMP_ADJUST_BLOCKTIME */
+#ifdef KMP_DFLT_NTH_CORES
+int __kmp_ncores = 0;
+#endif
+int __kmp_chunk = 0;
+int __kmp_abort_delay = 0;
+#if KMP_OS_LINUX && defined(KMP_TDATA_GTID)
+int __kmp_gtid_mode = 3; /* use __declspec(thread) TLS to store gtid */
+int __kmp_adjust_gtid_mode = FALSE;
+#elif KMP_OS_WINDOWS
+int __kmp_gtid_mode = 2; /* use TLS functions to store gtid */
+int __kmp_adjust_gtid_mode = FALSE;
+#else
+int __kmp_gtid_mode = 0; /* select method to get gtid based on #threads */
+int __kmp_adjust_gtid_mode = TRUE;
+#endif /* KMP_OS_LINUX && defined(KMP_TDATA_GTID) */
+#ifdef KMP_TDATA_GTID
+KMP_THREAD_LOCAL int __kmp_gtid = KMP_GTID_DNE;
+#endif /* KMP_TDATA_GTID */
+int __kmp_tls_gtid_min = INT_MAX;
+int __kmp_foreign_tp = TRUE;
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+int __kmp_inherit_fp_control = TRUE;
+kmp_int16 __kmp_init_x87_fpu_control_word = 0;
+kmp_uint32 __kmp_init_mxcsr = 0;
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+#ifdef USE_LOAD_BALANCE
+double __kmp_load_balance_interval = 1.0;
+#endif /* USE_LOAD_BALANCE */
+
+kmp_nested_nthreads_t __kmp_nested_nth = {NULL, 0, 0};
+
+#if KMP_USE_ADAPTIVE_LOCKS
+
+kmp_adaptive_backoff_params_t __kmp_adaptive_backoff_params = {
+    1, 1024}; // TODO: tune it!
+
+#if KMP_DEBUG_ADAPTIVE_LOCKS
+const char *__kmp_speculative_statsfile = "-";
+#endif
+
+#endif // KMP_USE_ADAPTIVE_LOCKS
+
+int __kmp_display_env = FALSE;
+int __kmp_display_env_verbose = FALSE;
+int __kmp_omp_cancellation = FALSE;
+
+/* map OMP 3.0 schedule types with our internal schedule types */
+enum sched_type __kmp_sch_map[kmp_sched_upper - kmp_sched_lower_ext +
+                              kmp_sched_upper_std - kmp_sched_lower - 2] = {
+    kmp_sch_static_chunked, // ==> kmp_sched_static            = 1
+    kmp_sch_dynamic_chunked, // ==> kmp_sched_dynamic           = 2
+    kmp_sch_guided_chunked, // ==> kmp_sched_guided            = 3
+    kmp_sch_auto, // ==> kmp_sched_auto              = 4
+    kmp_sch_trapezoidal // ==> kmp_sched_trapezoidal       = 101
+    // will likely not be used, introduced here just to debug the code
+    // of public intel extension schedules
+};
+
+#if KMP_OS_LINUX
+enum clock_function_type __kmp_clock_function;
+int __kmp_clock_function_param;
+#endif /* KMP_OS_LINUX */
+
+#if KMP_MIC_SUPPORTED
+enum mic_type __kmp_mic_type = non_mic;
+#endif
+
+#if KMP_AFFINITY_SUPPORTED
+
+KMPAffinity *__kmp_affinity_dispatch = NULL;
+
+#if KMP_USE_HWLOC
+int __kmp_hwloc_error = FALSE;
+hwloc_topology_t __kmp_hwloc_topology = NULL;
+int __kmp_numa_detected = FALSE;
+int __kmp_tile_depth = 0;
+#endif
+
+#if KMP_OS_WINDOWS
+#if KMP_GROUP_AFFINITY
+int __kmp_num_proc_groups = 1;
+#endif /* KMP_GROUP_AFFINITY */
+kmp_GetActiveProcessorCount_t __kmp_GetActiveProcessorCount = NULL;
+kmp_GetActiveProcessorGroupCount_t __kmp_GetActiveProcessorGroupCount = NULL;
+kmp_GetThreadGroupAffinity_t __kmp_GetThreadGroupAffinity = NULL;
+kmp_SetThreadGroupAffinity_t __kmp_SetThreadGroupAffinity = NULL;
+#endif /* KMP_OS_WINDOWS */
+
+size_t __kmp_affin_mask_size = 0;
+enum affinity_type __kmp_affinity_type = affinity_default;
+enum affinity_gran __kmp_affinity_gran = affinity_gran_default;
+int __kmp_affinity_gran_levels = -1;
+int __kmp_affinity_dups = TRUE;
+enum affinity_top_method __kmp_affinity_top_method =
+    affinity_top_method_default;
+int __kmp_affinity_compact = 0;
+int __kmp_affinity_offset = 0;
+int __kmp_affinity_verbose = FALSE;
+int __kmp_affinity_warnings = TRUE;
+int __kmp_affinity_respect_mask = affinity_respect_mask_default;
+char *__kmp_affinity_proclist = NULL;
+kmp_affin_mask_t *__kmp_affinity_masks = NULL;
+unsigned __kmp_affinity_num_masks = 0;
+
+char *__kmp_cpuinfo_file = NULL;
+
+#endif /* KMP_AFFINITY_SUPPORTED */
+
+kmp_nested_proc_bind_t __kmp_nested_proc_bind = {NULL, 0, 0};
+int __kmp_affinity_num_places = 0;
+int __kmp_display_affinity = FALSE;
+char *__kmp_affinity_format = NULL;
+
+kmp_hws_item_t __kmp_hws_socket = {0, 0};
+kmp_hws_item_t __kmp_hws_node = {0, 0};
+kmp_hws_item_t __kmp_hws_tile = {0, 0};
+kmp_hws_item_t __kmp_hws_core = {0, 0};
+kmp_hws_item_t __kmp_hws_proc = {0, 0};
+int __kmp_hws_requested = 0;
+int __kmp_hws_abs_flag = 0; // absolute or per-item number requested
+
+kmp_int32 __kmp_default_device = 0;
+
+kmp_tasking_mode_t __kmp_tasking_mode = tskm_task_teams;
+kmp_int32 __kmp_max_task_priority = 0;
+kmp_uint64 __kmp_taskloop_min_tasks = 0;
+
+int __kmp_memkind_available = 0;
+omp_allocator_handle_t const omp_null_allocator = NULL;
+omp_allocator_handle_t const omp_default_mem_alloc =
+    (omp_allocator_handle_t const)1;
+omp_allocator_handle_t const omp_large_cap_mem_alloc =
+    (omp_allocator_handle_t const)2;
+omp_allocator_handle_t const omp_const_mem_alloc =
+    (omp_allocator_handle_t const)3;
+omp_allocator_handle_t const omp_high_bw_mem_alloc =
+    (omp_allocator_handle_t const)4;
+omp_allocator_handle_t const omp_low_lat_mem_alloc =
+    (omp_allocator_handle_t const)5;
+omp_allocator_handle_t const omp_cgroup_mem_alloc =
+    (omp_allocator_handle_t const)6;
+omp_allocator_handle_t const omp_pteam_mem_alloc =
+    (omp_allocator_handle_t const)7;
+omp_allocator_handle_t const omp_thread_mem_alloc =
+    (omp_allocator_handle_t const)8;
+omp_allocator_handle_t const kmp_max_mem_alloc =
+    (omp_allocator_handle_t const)1024;
+omp_allocator_handle_t __kmp_def_allocator = omp_default_mem_alloc;
+
+omp_memspace_handle_t const omp_default_mem_space =
+    (omp_memspace_handle_t const)0;
+omp_memspace_handle_t const omp_large_cap_mem_space =
+    (omp_memspace_handle_t const)1;
+omp_memspace_handle_t const omp_const_mem_space =
+    (omp_memspace_handle_t const)2;
+omp_memspace_handle_t const omp_high_bw_mem_space =
+    (omp_memspace_handle_t const)3;
+omp_memspace_handle_t const omp_low_lat_mem_space =
+    (omp_memspace_handle_t const)4;
+
+/* This check ensures that the compiler is passing the correct data type for the
+   flags formal parameter of the function kmpc_omp_task_alloc(). If the type is
+   not a 4-byte type, then give an error message about a non-positive length
+   array pointing here.  If that happens, the kmp_tasking_flags_t structure must
+   be redefined to have exactly 32 bits. */
+KMP_BUILD_ASSERT(sizeof(kmp_tasking_flags_t) == 4);
+
+int __kmp_task_stealing_constraint = 1; /* Constrain task stealing by default */
+int __kmp_enable_task_throttling = 1;
+
+#ifdef DEBUG_SUSPEND
+int __kmp_suspend_count = 0;
+#endif
+
+int __kmp_settings = FALSE;
+int __kmp_duplicate_library_ok = 0;
+#if USE_ITT_BUILD
+int __kmp_forkjoin_frames = 1;
+int __kmp_forkjoin_frames_mode = 3;
+#endif
+PACKED_REDUCTION_METHOD_T __kmp_force_reduction_method =
+    reduction_method_not_defined;
+int __kmp_determ_red = FALSE;
+
+#ifdef KMP_DEBUG
+int kmp_a_debug = 0;
+int kmp_b_debug = 0;
+int kmp_c_debug = 0;
+int kmp_d_debug = 0;
+int kmp_e_debug = 0;
+int kmp_f_debug = 0;
+int kmp_diag = 0;
+#endif
+
+/* For debug information logging using rotating buffer */
+int __kmp_debug_buf =
+    FALSE; /* TRUE means use buffer, FALSE means print to stderr */
+int __kmp_debug_buf_lines =
+    KMP_DEBUG_BUF_LINES_INIT; /* Lines of debug stored in buffer */
+int __kmp_debug_buf_chars =
+    KMP_DEBUG_BUF_CHARS_INIT; /* Characters allowed per line in buffer */
+int __kmp_debug_buf_atomic =
+    FALSE; /* TRUE means use atomic update of buffer entry pointer */
+
+char *__kmp_debug_buffer = NULL; /* Debug buffer itself */
+std::atomic<int> __kmp_debug_count =
+    ATOMIC_VAR_INIT(0); /* number of lines printed in buffer so far */
+int __kmp_debug_buf_warn_chars =
+    0; /* Keep track of char increase recommended in warnings */
+/* end rotating debug buffer */
+
+#ifdef KMP_DEBUG
+int __kmp_par_range; /* +1 => only go par for constructs in range */
+/* -1 => only go par for constructs outside range */
+char __kmp_par_range_routine[KMP_PAR_RANGE_ROUTINE_LEN] = {'\0'};
+char __kmp_par_range_filename[KMP_PAR_RANGE_FILENAME_LEN] = {'\0'};
+int __kmp_par_range_lb = 0;
+int __kmp_par_range_ub = INT_MAX;
+#endif /* KMP_DEBUG */
+
+/* For printing out dynamic storage map for threads and teams */
+int __kmp_storage_map =
+    FALSE; /* True means print storage map for threads and teams */
+int __kmp_storage_map_verbose =
+    FALSE; /* True means storage map includes placement info */
+int __kmp_storage_map_verbose_specified = FALSE;
+/* Initialize the library data structures when we fork a child process, defaults
+ * to TRUE */
+int __kmp_need_register_atfork =
+    TRUE; /* At initialization, call pthread_atfork to install fork handler */
+int __kmp_need_register_atfork_specified = TRUE;
+
+int __kmp_env_stksize = FALSE; /* KMP_STACKSIZE specified? */
+int __kmp_env_blocktime = FALSE; /* KMP_BLOCKTIME specified? */
+int __kmp_env_checks = FALSE; /* KMP_CHECKS specified?    */
+int __kmp_env_consistency_check = FALSE; /* KMP_CONSISTENCY_CHECK specified? */
+
+// From KMP_USE_YIELD:
+// 0 = never yield;
+// 1 = always yield (default);
+// 2 = yield only if oversubscribed
+kmp_int32 __kmp_use_yield = 1;
+// This will be 1 if KMP_USE_YIELD environment variable was set explicitly
+kmp_int32 __kmp_use_yield_exp_set = 0;
+
+kmp_uint32 __kmp_yield_init = KMP_INIT_WAIT;
+kmp_uint32 __kmp_yield_next = KMP_NEXT_WAIT;
+
+/* ------------------------------------------------------ */
+/* STATE mostly synchronized with global lock */
+/* data written to rarely by masters, read often by workers */
+/* TODO: None of this global padding stuff works consistently because the order
+   of declaration is not necessarily correlated to storage order. To fix this,
+   all the important globals must be put in a big structure instead. */
+KMP_ALIGN_CACHE
+kmp_info_t **__kmp_threads = NULL;
+kmp_root_t **__kmp_root = NULL;
+
+/* data read/written to often by masters */
+KMP_ALIGN_CACHE
+volatile int __kmp_nth = 0;
+volatile int __kmp_all_nth = 0;
+volatile kmp_info_t *__kmp_thread_pool = NULL;
+volatile kmp_team_t *__kmp_team_pool = NULL;
+
+KMP_ALIGN_CACHE
+std::atomic<int> __kmp_thread_pool_active_nth = ATOMIC_VAR_INIT(0);
+
+/* -------------------------------------------------
+ * GLOBAL/ROOT STATE */
+KMP_ALIGN_CACHE
+kmp_global_t __kmp_global = {{0}};
+
+/* ----------------------------------------------- */
+/* GLOBAL SYNCHRONIZATION LOCKS */
+/* TODO verify the need for these locks and if they need to be global */
+
+#if KMP_USE_INTERNODE_ALIGNMENT
+/* Multinode systems have larger cache line granularity which can cause
+ * false sharing if the alignment is not large enough for these locks */
+KMP_ALIGN_CACHE_INTERNODE
+
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_initz_lock); /* Control initializations */
+KMP_ALIGN_CACHE_INTERNODE
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_forkjoin_lock); /* control fork/join access */
+KMP_ALIGN_CACHE_INTERNODE
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_exit_lock); /* exit() is not always thread-safe */
+#if KMP_USE_MONITOR
+/* control monitor thread creation */
+KMP_ALIGN_CACHE_INTERNODE
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_monitor_lock);
+#endif
+/* used for the hack to allow threadprivate cache and __kmp_threads expansion
+   to co-exist */
+KMP_ALIGN_CACHE_INTERNODE
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_tp_cached_lock);
+
+KMP_ALIGN_CACHE_INTERNODE
+KMP_LOCK_INIT(__kmp_global_lock); /* Control OS/global access */
+KMP_ALIGN_CACHE_INTERNODE
+kmp_queuing_lock_t __kmp_dispatch_lock; /* Control dispatch access  */
+KMP_ALIGN_CACHE_INTERNODE
+KMP_LOCK_INIT(__kmp_debug_lock); /* Control I/O access for KMP_DEBUG */
+#else
+KMP_ALIGN_CACHE
+
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_initz_lock); /* Control initializations */
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_forkjoin_lock); /* control fork/join access */
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_exit_lock); /* exit() is not always thread-safe */
+#if KMP_USE_MONITOR
+/* control monitor thread creation */
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_monitor_lock);
+#endif
+/* used for the hack to allow threadprivate cache and __kmp_threads expansion
+   to co-exist */
+KMP_BOOTSTRAP_LOCK_INIT(__kmp_tp_cached_lock);
+
+KMP_ALIGN(128)
+KMP_LOCK_INIT(__kmp_global_lock); /* Control OS/global access */
+KMP_ALIGN(128)
+kmp_queuing_lock_t __kmp_dispatch_lock; /* Control dispatch access  */
+KMP_ALIGN(128)
+KMP_LOCK_INIT(__kmp_debug_lock); /* Control I/O access for KMP_DEBUG */
+#endif
+
+/* ----------------------------------------------- */
+
+#if KMP_HANDLE_SIGNALS
+/* Signal handling is disabled by default, because it confuses users: In case of
+   sigsegv (or other trouble) in user code signal handler catches the signal,
+   which then "appears" in the monitor thread (when the monitor executes raise()
+   function). Users see signal in the monitor thread and blame OpenMP RTL.
+
+   Grant said signal handling required on some older OSes (Irix?) supported by
+   KAI, because bad applications hung but not aborted. Currently it is not a
+   problem for Linux* OS, OS X* and Windows* OS.
+
+   Grant: Found new hangs for EL4, EL5, and a Fedora Core machine.  So I'm
+   putting the default back for now to see if that fixes hangs on those
+   machines.
+
+   2010-04013 Lev: It was a bug in Fortran RTL. Fortran RTL prints a kind of
+   stack backtrace when program is aborting, but the code is not signal-safe.
+   When multiple signals are raised at the same time (which occurs in dynamic
+   negative tests because all the worker threads detect the same error),
+   Fortran RTL may hang. The bug was finally fixed in the Fortran RTL library
+   provided by Steve R., and will be available soon. */
+int __kmp_handle_signals = FALSE;
+#endif
+
+#ifdef DEBUG_SUSPEND
+int get_suspend_count_(void) { // read-and-reset accessor for the counter
+  int count = __kmp_suspend_count; // value handed back to the caller
+  __kmp_suspend_count = 0; // every read also clears the counter
+  return count;
+}
+void set_suspend_count_(int *value) { __kmp_suspend_count = *value; } // by ptr
+#endif
+
+// Symbols for MS mutual detection.
+int _You_must_link_with_exactly_one_OpenMP_library = 1;
+int _You_must_link_with_Intel_OpenMP_library = 1;
+#if KMP_OS_WINDOWS && (KMP_VERSION_MAJOR > 4)
+int _You_must_link_with_Microsoft_OpenMP_library = 1;
+#endif
+
+kmp_target_offload_kind_t __kmp_target_offload = tgt_default;
+
+// OMP Pause Resources
+kmp_pause_status_t __kmp_pause_status = kmp_not_paused;
+
+// end of file //
diff --git a/final/runtime/src/kmp_gsupport.cpp b/final/runtime/src/kmp_gsupport.cpp
new file mode 100644
index 0000000..d41e027
--- /dev/null
+++ b/final/runtime/src/kmp_gsupport.cpp
@@ -0,0 +1,1950 @@
+/*
+ * kmp_gsupport.cpp
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+#include "kmp_atomic.h"
+
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+#define MKLOC(loc, routine)                                                    \
+  static ident_t(loc) = {0, KMP_IDENT_KMPC, 0, 0, ";unknown;unknown;0;0;;"};
+
+#include "kmp_ftn_os.h"
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_BARRIER)(void) {
+  int gtid = __kmp_entry_gtid();
+  MKLOC(loc, "GOMP_barrier"); // static ident_t; MKLOC ignores the name string
+  KA_TRACE(20, ("GOMP_barrier: T#%d\n", gtid));
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  ompt_frame_t *ompt_frame; // only assigned when ompt_enabled.enabled is set
+  if (ompt_enabled.enabled) {
+    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+    ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+  }
+#endif
+  __kmpc_barrier(&loc, gtid); // the GOMP entry point defers to the KMP barrier
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    ompt_frame->enter_frame = ompt_data_none; // undo the enter_frame set above
+  }
+#endif
+}
+
+// Mutual exclusion
+
+// The symbol that icc/ifort generates for unnamed critical sections
+// - .gomp_critical_user_ - is defined using .comm in any object referencing it.
+// We can't reference it directly here in C code, as the symbol contains a ".".
+//
+// The RTL contains an assembly language definition of .gomp_critical_user_
+// with another symbol __kmp_unnamed_critical_addr initialized with its
+// address.
+extern kmp_critical_name *__kmp_unnamed_critical_addr;
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_CRITICAL_START)(void) {
+  int gtid = __kmp_entry_gtid();
+  MKLOC(loc, "GOMP_critical_start"); // MKLOC ignores the routine-name string
+  KA_TRACE(20, ("GOMP_critical_start: T#%d\n", gtid));
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif // OMPT_SUPPORT && OMPT_OPTIONAL
+  __kmpc_critical(&loc, gtid, __kmp_unnamed_critical_addr); // unnamed variant
+}
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_CRITICAL_END)(void) {
+  int gtid = __kmp_get_gtid(); // the *_start entry uses __kmp_entry_gtid()
+  MKLOC(loc, "GOMP_critical_end"); // MKLOC ignores the routine-name string
+  KA_TRACE(20, ("GOMP_critical_end: T#%d\n", gtid));
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif // OMPT_SUPPORT && OMPT_OPTIONAL
+  __kmpc_end_critical(&loc, gtid, __kmp_unnamed_critical_addr); // same name obj
+}
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_CRITICAL_NAME_START)(void **pptr) {
+  int gtid = __kmp_entry_gtid();
+  MKLOC(loc, "GOMP_critical_name_start"); // MKLOC ignores the name string
+  KA_TRACE(20, ("GOMP_critical_name_start: T#%d\n", gtid));
+  // NOTE(review): no OMPT_STORE_RETURN_ADDRESS here, unlike the unnamed start.
+  __kmpc_critical(&loc, gtid, (kmp_critical_name *)pptr); // pptr is the name
+}
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_CRITICAL_NAME_END)(void **pptr) {
+  int gtid = __kmp_get_gtid(); // the *_start entry uses __kmp_entry_gtid()
+  MKLOC(loc, "GOMP_critical_name_end"); // MKLOC ignores the name string
+  KA_TRACE(20, ("GOMP_critical_name_end: T#%d\n", gtid));
+  __kmpc_end_critical(&loc, gtid, (kmp_critical_name *)pptr); // pptr is the name
+}
+
+// The Gnu codegen tries to use locked operations to perform atomic updates
+// inline.  If it can't, then it calls GOMP_atomic_start() before performing
+// the update and GOMP_atomic_end() afterward, regardless of the data type.
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_ATOMIC_START)(void) {
+  int gtid = __kmp_entry_gtid();
+  KA_TRACE(20, ("GOMP_atomic_start: T#%d\n", gtid));
+
+#if OMPT_SUPPORT
+  __ompt_thread_assign_wait_id(0);
+#endif // OMPT_SUPPORT
+
+  __kmp_acquire_atomic_lock(&__kmp_atomic_lock, gtid); // one lock for all types
+}
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_ATOMIC_END)(void) {
+  int gtid = __kmp_get_gtid(); // the *_start entry uses __kmp_entry_gtid()
+  KA_TRACE(20, ("GOMP_atomic_end: T#%d\n", gtid));
+  __kmp_release_atomic_lock(&__kmp_atomic_lock, gtid); // pairs with *_start
+}
+
+int KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SINGLE_START)(void) {
+  int gtid = __kmp_entry_gtid();
+  MKLOC(loc, "GOMP_single_start"); // MKLOC ignores the routine-name string
+  KA_TRACE(20, ("GOMP_single_start: T#%d\n", gtid));
+
+  if (!TCR_4(__kmp_init_parallel)) // lazily initialize the parallel runtime
+    __kmp_parallel_initialize();
+  __kmp_resume_if_soft_paused();
+
+  // 3rd parameter == FALSE prevents __kmp_enter_single from pushing a
+  // workshare when USE_CHECKS is defined.  We need to avoid the push,
+  // as there is no corresponding GOMP_single_end() call.
+  kmp_int32 rc = __kmp_enter_single(gtid, &loc, FALSE);
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  kmp_info_t *this_thr = __kmp_threads[gtid];
+  kmp_team_t *team = this_thr->th.th_team;
+  int tid = __kmp_tid_from_gtid(gtid);
+
+  if (ompt_enabled.enabled) {
+    if (rc) { // nonzero rc: report this thread as the single executor
+      if (ompt_enabled.ompt_callback_work) {
+        ompt_callbacks.ompt_callback(ompt_callback_work)(
+            ompt_work_single_executor, ompt_scope_begin,
+            &(team->t.ompt_team_info.parallel_data),
+            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
+            1, OMPT_GET_RETURN_ADDRESS(0));
+      }
+    } else { // zero rc: report single_other, begin and end back to back
+      if (ompt_enabled.ompt_callback_work) {
+        ompt_callbacks.ompt_callback(ompt_callback_work)(
+            ompt_work_single_other, ompt_scope_begin,
+            &(team->t.ompt_team_info.parallel_data),
+            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
+            1, OMPT_GET_RETURN_ADDRESS(0));
+        ompt_callbacks.ompt_callback(ompt_callback_work)(
+            ompt_work_single_other, ompt_scope_end,
+            &(team->t.ompt_team_info.parallel_data),
+            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
+            1, OMPT_GET_RETURN_ADDRESS(0));
+      }
+    }
+  }
+#endif
+
+  return rc; // the result of __kmp_enter_single is returned unchanged
+}
+
+void *KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SINGLE_COPY_START)(void) {
+  void *retval; // copyprivate pointer read from the team between the barriers
+  int gtid = __kmp_entry_gtid();
+  MKLOC(loc, "GOMP_single_copy_start"); // MKLOC ignores the name string
+  KA_TRACE(20, ("GOMP_single_copy_start: T#%d\n", gtid));
+
+  if (!TCR_4(__kmp_init_parallel)) // lazily initialize the parallel runtime
+    __kmp_parallel_initialize();
+  __kmp_resume_if_soft_paused();
+
+  // If this is the first thread to enter, return NULL.  The generated code will
+  // then call GOMP_single_copy_end() for this thread only, with the
+  // copyprivate data pointer as an argument.
+  if (__kmp_enter_single(gtid, &loc, FALSE))
+    return NULL;
+
+  // Wait for the first thread to set the copyprivate data pointer,
+  // and for all other threads to reach this point.
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  ompt_frame_t *ompt_frame; // only assigned when ompt_enabled.enabled is set
+  if (ompt_enabled.enabled) {
+    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+    ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+  }
+#endif
+  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+
+  // Retrieve the value of the copyprivate data pointer, and wait for all
+  // threads to do likewise, then return.
+  retval = __kmp_team_from_gtid(gtid)->t.t_copypriv_data;
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+  }
+#endif
+  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    ompt_frame->enter_frame = ompt_data_none; // undo the enter_frame set above
+  }
+#endif
+  return retval;
+}
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SINGLE_COPY_END)(void *data) {
+  int gtid = __kmp_get_gtid();
+  KA_TRACE(20, ("GOMP_single_copy_end: T#%d\n", gtid));
+
+  // Set the copyprivate data pointer for the team, then hit the barrier so that
+  // the other threads will continue on and read it.  Hit another barrier before
+  // continuing, so that we know that the copyprivate data pointer has been
+  // propagated to all threads before trying to reuse the t_copypriv_data field.
+  __kmp_team_from_gtid(gtid)->t.t_copypriv_data = data;
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  ompt_frame_t *ompt_frame; // only assigned when ompt_enabled.enabled is set
+  if (ompt_enabled.enabled) {
+    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+    ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+  }
+#endif
+  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); // 1st barrier
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+  }
+#endif
+  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); // 2nd barrier
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    ompt_frame->enter_frame = ompt_data_none; // undo the enter_frame set above
+  }
+#endif
+}
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_ORDERED_START)(void) {
+  int gtid = __kmp_entry_gtid();
+  MKLOC(loc, "GOMP_ordered_start"); // MKLOC ignores the routine-name string
+  KA_TRACE(20, ("GOMP_ordered_start: T#%d\n", gtid));
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif // OMPT_SUPPORT && OMPT_OPTIONAL
+  __kmpc_ordered(&loc, gtid); // defers to the KMP ordered entry point
+}
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_ORDERED_END)(void) {
+  int gtid = __kmp_get_gtid(); // the *_start entry uses __kmp_entry_gtid()
+  MKLOC(loc, "GOMP_ordered_end"); // MKLOC ignores the routine-name string
+  KA_TRACE(20, ("GOMP_ordered_end: T#%d\n", gtid)); // trace names this routine
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif // OMPT_SUPPORT && OMPT_OPTIONAL
+  __kmpc_end_ordered(&loc, gtid); // defers to the KMP end-ordered entry point
+}
+
+// Dispatch macro defs
+//
+// They come in two flavors: 64-bit unsigned, and either 32-bit signed
+// (KMP_ARCH_X86, KMP_ARCH_ARM, KMP_ARCH_MIPS) or 64-bit signed (otherwise).
+
+#if KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS
+#define KMP_DISPATCH_INIT __kmp_aux_dispatch_init_4
+#define KMP_DISPATCH_FINI_CHUNK __kmp_aux_dispatch_fini_chunk_4
+#define KMP_DISPATCH_NEXT __kmpc_dispatch_next_4
+#else
+#define KMP_DISPATCH_INIT __kmp_aux_dispatch_init_8
+#define KMP_DISPATCH_FINI_CHUNK __kmp_aux_dispatch_fini_chunk_8
+#define KMP_DISPATCH_NEXT __kmpc_dispatch_next_8
+#endif /* KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS */
+
+#define KMP_DISPATCH_INIT_ULL __kmp_aux_dispatch_init_8u
+#define KMP_DISPATCH_FINI_CHUNK_ULL __kmp_aux_dispatch_fini_chunk_8u
+#define KMP_DISPATCH_NEXT_ULL __kmpc_dispatch_next_8u
+
+// The parallel construct
+
+#ifndef KMP_DEBUG
+static
+#endif /* KMP_DEBUG */
+    void
+    __kmp_GOMP_microtask_wrapper(int *gtid, int *npr, void (*task)(void *),
+                                 void *data) { // npr unused; gtid OMPT-only
+#if OMPT_SUPPORT
+  kmp_info_t *thr;
+  ompt_frame_t *ompt_frame;
+  ompt_state_t enclosing_state;
+
+  if (ompt_enabled.enabled) {
+    // get pointer to thread data structure
+    thr = __kmp_threads[*gtid];
+
+    // save enclosing task state; set current state for task
+    enclosing_state = thr->th.ompt_thread_info.state;
+    thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+
+    // set task frame
+    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+    ompt_frame->exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+  }
+#endif
+
+  task(data); // invoke the wrapped task with its single argument
+
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+    // clear task frame
+    ompt_frame->exit_frame = ompt_data_none;
+
+    // restore enclosing state
+    thr->th.ompt_thread_info.state = enclosing_state;
+  }
+#endif
+}
+
+#ifndef KMP_DEBUG
+static
+#endif /* KMP_DEBUG */
+    void
+    __kmp_GOMP_parallel_microtask_wrapper(int *gtid, int *npr,
+                                          void (*task)(void *), void *data,
+                                          unsigned num_threads, ident_t *loc,
+                                          enum sched_type schedule, long start,
+                                          long end, long incr,
+                                          long chunk_size) {
+  // Initialize the loop worksharing construct (npr, num_threads are unused).
+
+  KMP_DISPATCH_INIT(loc, *gtid, schedule, start, end, incr, chunk_size,
+                    schedule != kmp_sch_static);
+
+#if OMPT_SUPPORT
+  kmp_info_t *thr;
+  ompt_frame_t *ompt_frame;
+  ompt_state_t enclosing_state;
+
+  if (ompt_enabled.enabled) {
+    thr = __kmp_threads[*gtid];
+    // save enclosing task state; set current state for task
+    enclosing_state = thr->th.ompt_thread_info.state;
+    thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+
+    // set task frame
+    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+    ompt_frame->exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+  }
+#endif
+
+  // Now invoke the microtask.
+  task(data);
+
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+    // clear task frame
+    ompt_frame->exit_frame = ompt_data_none;
+
+    // reset enclosing state
+    thr->th.ompt_thread_info.state = enclosing_state;
+  }
+#endif
+}
+
+#ifndef KMP_DEBUG
+static
+#endif /* KMP_DEBUG */
+    void
+    __kmp_GOMP_fork_call(ident_t *loc, int gtid, void (*unwrapped_task)(void *),
+                         microtask_t wrapper, int argc, ...) {
+  int rc; // result of __kmp_fork_call; unwrapped_task is not used in this body
+  kmp_info_t *thr = __kmp_threads[gtid];
+  kmp_team_t *team = thr->th.th_team; // read before __kmp_fork_call runs
+  int tid = __kmp_tid_from_gtid(gtid); // read before __kmp_fork_call runs
+
+  va_list ap;
+  va_start(ap, argc); // the variadic tail is forwarded to __kmp_fork_call
+
+  rc = __kmp_fork_call(loc, gtid, fork_context_gnu, argc, wrapper,
+                       __kmp_invoke_task_func,
+#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
+                       &ap
+#else
+                       ap
+#endif
+                       );
+
+  va_end(ap);
+
+  if (rc) { // only a nonzero fork result triggers the pre-invoke hook
+    __kmp_run_before_invoked_task(gtid, tid, thr, team);
+  }
+
+#if OMPT_SUPPORT
+  int ompt_team_size;
+  if (ompt_enabled.enabled) {
+    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
+    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
+
+    // implicit task callback
+    if (ompt_enabled.ompt_callback_implicit_task) {
+      ompt_team_size = __kmp_team_from_gtid(gtid)->t.t_nproc;
+      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+          ompt_scope_begin, &(team_info->parallel_data),
+          &(task_info->task_data), ompt_team_size, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
+      task_info->thread_num = __kmp_tid_from_gtid(gtid);
+    }
+    thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+  }
+#endif
+}
+
+static void __kmp_GOMP_serialized_parallel(ident_t *loc, kmp_int32 gtid,
+                                           void (*task)(void *)) { // task unused
+#if OMPT_SUPPORT
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif // OMPT_SUPPORT
+  __kmp_serialized_parallel(loc, gtid); // task is not invoked from here
+}
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_START)(void (*task)(void *),
+                                                       void *data,
+                                                       unsigned num_threads) {
+  int gtid = __kmp_entry_gtid();
+
+#if OMPT_SUPPORT
+  ompt_frame_t *parent_frame, *frame; // frame is looked up after the fork
+
+  if (ompt_enabled.enabled) {
+    __ompt_get_task_info_internal(0, NULL, NULL, &parent_frame, NULL, NULL);
+    parent_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+  }
+#endif
+
+  MKLOC(loc, "GOMP_parallel_start"); // MKLOC ignores the routine-name string
+  KA_TRACE(20, ("GOMP_parallel_start: T#%d\n", gtid));
+
+  if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) { // 1 => serialized path
+    if (num_threads != 0) { // 0 => no explicit thread count is pushed
+      __kmp_push_num_threads(&loc, gtid, num_threads);
+    }
+    __kmp_GOMP_fork_call(&loc, gtid, task,
+                         (microtask_t)__kmp_GOMP_microtask_wrapper, 2, task,
+                         data);
+  } else {
+    __kmp_GOMP_serialized_parallel(&loc, gtid, task);
+  }
+
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+    __ompt_get_task_info_internal(0, NULL, NULL, &frame, NULL, NULL);
+    frame->exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+  }
+#endif
+}
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)(void) {
+  int gtid = __kmp_get_gtid();
+  kmp_info_t *thr;
+
+  thr = __kmp_threads[gtid];
+
+  MKLOC(loc, "GOMP_parallel_end"); // MKLOC ignores the routine-name string
+  KA_TRACE(20, ("GOMP_parallel_end: T#%d\n", gtid));
+
+  if (!thr->th.th_team->t.t_serialized) { // team was really forked: join it
+    __kmp_run_after_invoked_task(gtid, __kmp_tid_from_gtid(gtid), thr,
+                                 thr->th.th_team);
+
+#if OMPT_SUPPORT
+    if (ompt_enabled.enabled) {
+      // The implicit task is finished here. In the barrier we might
+      // schedule deferred tasks, and these do not see the implicit task
+      // on the stack.
+      OMPT_CUR_TASK_INFO(thr)->frame.exit_frame = ompt_data_none;
+    }
+#endif
+
+    __kmp_join_call(&loc, gtid
+#if OMPT_SUPPORT
+                    ,
+                    fork_context_gnu
+#endif
+                    );
+  } else { // serialized team: end the serialized parallel region instead
+    __kmpc_end_serialized_parallel(&loc, gtid);
+  }
+}
+
+// Loop worksharing constructs
+
+// The Gnu codegen passes in an exclusive upper bound for the overall range,
+// but the libguide dispatch code expects an inclusive upper bound, hence the
+// "end - incr" 5th argument to KMP_DISPATCH_INIT (and the " ub - str" 11th
+// argument to __kmp_GOMP_fork_call).
+//
+// Conversely, KMP_DISPATCH_NEXT returns an inclusive upper bound in *p_ub,
+// but the Gnu codegen expects an exclusive upper bound, so the adjustment
+// "*p_ub += stride" compensates for the discrepancy.
+//
+// Correction: the gnu codegen always adjusts the upper bound by +-1, not the
+// stride value.  We adjust the dispatch parameters accordingly (by +-1), but
+// we still adjust p_ub by the actual stride value.
+//
+// The "runtime" versions do not take a chunk_sz parameter.
+//
+// The profile lib cannot support construct checking of unordered loops that
+// are predetermined by the compiler to be statically scheduled, as the gcc
+// codegen will not always emit calls to GOMP_loop_static_next() to get the
+// next iteration.  Instead, it emits inline code to call omp_get_thread_num()
+// and calculate the iteration space using the result.  It doesn't do this
+// with ordered static loop, so they can be checked.
+
+#if OMPT_SUPPORT
+#define IF_OMPT_SUPPORT(code) code
+#else
+#define IF_OMPT_SUPPORT(code)
+#endif
+// LOOP_START(func, schedule): defines loop-start entry point func; 0 = no work.
+#define LOOP_START(func, schedule)                                             \
+  int func(long lb, long ub, long str, long chunk_sz, long *p_lb,              \
+           long *p_ub) {                                                       \
+    int status;                                                                \
+    long stride;                                                               \
+    int gtid = __kmp_entry_gtid();                                             \
+    MKLOC(loc, KMP_STR(func));                                                 \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(                                                              \
+             func) ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz 0x%lx\n",  \
+         gtid, lb, ub, str, chunk_sz));                                        \
+                                                                               \
+    if ((str > 0) ? (lb < ub) : (lb > ub)) {                                   \
+      IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);)                        \
+      KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb,                            \
+                        (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz,        \
+                        (schedule) != kmp_sch_static);                         \
+      IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);)                        \
+      status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb,            \
+                                 (kmp_int *)p_ub, (kmp_int *)&stride);         \
+      if (status) {                                                            \
+        KMP_DEBUG_ASSERT(stride == str);                                       \
+        *p_ub += (str > 0) ? 1 : -1;                                           \
+      }                                                                        \
+    } else {                                                                   \
+      status = 0;                                                              \
+    }                                                                          \
+                                                                               \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(                                                              \
+             func) " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, returning %d\n",    \
+         gtid, *p_lb, *p_ub, status));                                         \
+    return status;                                                             \
+  }
+// LOOP_RUNTIME_START(func, schedule): LOOP_START without chunk_sz (fixed at 0).
+#define LOOP_RUNTIME_START(func, schedule)                                     \
+  int func(long lb, long ub, long str, long *p_lb, long *p_ub) {               \
+    int status;                                                                \
+    long stride;                                                               \
+    long chunk_sz = 0;                                                         \
+    int gtid = __kmp_entry_gtid();                                             \
+    MKLOC(loc, KMP_STR(func));                                                 \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(func) ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz %ld\n", \
+         gtid, lb, ub, str, chunk_sz));                                        \
+                                                                               \
+    if ((str > 0) ? (lb < ub) : (lb > ub)) {                                   \
+      IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);)                        \
+      KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb,                            \
+                        (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz, TRUE); \
+      IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);)                        \
+      status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb,            \
+                                 (kmp_int *)p_ub, (kmp_int *)&stride);         \
+      if (status) {                                                            \
+        KMP_DEBUG_ASSERT(stride == str);                                       \
+        *p_ub += (str > 0) ? 1 : -1;                                           \
+      }                                                                        \
+    } else {                                                                   \
+      status = 0;                                                              \
+    }                                                                          \
+                                                                               \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(                                                              \
+             func) " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, returning %d\n",    \
+         gtid, *p_lb, *p_ub, status));                                         \
+    return status;                                                             \
+  }
+// KMP_DOACROSS_FINI: bare if-statement; finalizes doacross once status is 0.
+#define KMP_DOACROSS_FINI(status, gtid)                                        \
+  if (!status && __kmp_threads[gtid]->th.th_dispatch->th_doacross_flags) {     \
+    __kmpc_doacross_fini(NULL, gtid);                                          \
+  }
+// LOOP_NEXT(func, fini_code): fini_code is pasted in; it may use loc and gtid.
+#define LOOP_NEXT(func, fini_code)                                             \
+  int func(long *p_lb, long *p_ub) {                                           \
+    int status;                                                                \
+    long stride;                                                               \
+    int gtid = __kmp_get_gtid();                                               \
+    MKLOC(loc, KMP_STR(func));                                                 \
+    KA_TRACE(20, (KMP_STR(func) ": T#%d\n", gtid));                            \
+                                                                               \
+    IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);)                          \
+    fini_code status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb,    \
+                                         (kmp_int *)p_ub, (kmp_int *)&stride); \
+    if (status) {                                                              \
+      *p_ub += (stride > 0) ? 1 : -1;                                          \
+    }                                                                          \
+    KMP_DOACROSS_FINI(status, gtid)                                            \
+                                                                               \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(func) " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, stride 0x%lx, " \
+                       "returning %d\n",                                       \
+         gtid, *p_lb, *p_ub, stride, status));                                 \
+    return status;                                                             \
+  }
+
+LOOP_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_STATIC_START), kmp_sch_static)
+LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_STATIC_NEXT), {})
+LOOP_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DYNAMIC_START),
+           kmp_sch_dynamic_chunked)
+LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DYNAMIC_NEXT), {})
+LOOP_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_GUIDED_START),
+           kmp_sch_guided_chunked)
+LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_GUIDED_NEXT), {})
+LOOP_RUNTIME_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_RUNTIME_START),
+                   kmp_sch_runtime)
+LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_RUNTIME_NEXT), {})
+
+LOOP_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_START),
+           kmp_ord_static)
+LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_NEXT),
+          { KMP_DISPATCH_FINI_CHUNK(&loc, gtid); })
+LOOP_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_START),
+           kmp_ord_dynamic_chunked)
+LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_NEXT),
+          { KMP_DISPATCH_FINI_CHUNK(&loc, gtid); })
+LOOP_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_START),
+           kmp_ord_guided_chunked)
+LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_NEXT),
+          { KMP_DISPATCH_FINI_CHUNK(&loc, gtid); })
+LOOP_RUNTIME_START(
+    KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_START),
+    kmp_ord_runtime)
+LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_NEXT),
+          { KMP_DISPATCH_FINI_CHUNK(&loc, gtid); })
+
+// LOOP_DOACROSS_START(func, schedule) defines
+//   bool func(unsigned ncounts, long *counts, long chunk_sz,
+//             long *p_lb, long *p_ub)
+// which starts a doacross loop with an explicit chunk size:
+//  - builds an array of ncounts kmp_dim entries, each {lo 0, up counts[i] - 1,
+//    st 1}, and hands it to __kmpc_doacross_init();
+//  - if counts[0] > 0, initialises a dispatch loop over the first dimension
+//    (lb 0, stride 1; the dispatcher is given ub - 1 as its upper bound) with
+//    the given schedule, and fetches the first chunk into *p_lb / *p_ub;
+//    on success *p_ub is incremented by one before returning;
+//  - otherwise status is 0 and *p_lb / *p_ub are not written;
+//  - invokes KMP_DOACROSS_FINI(status, gtid) (defined elsewhere), frees the
+//    dims array and returns status.
+// Note that counts[0] is read unconditionally, so ncounts is expected to be at
+// least 1.
+#define LOOP_DOACROSS_START(func, schedule)                                    \
+  bool func(unsigned ncounts, long *counts, long chunk_sz, long *p_lb,         \
+            long *p_ub) {                                                      \
+    int status;                                                                \
+    long stride, lb, ub, str;                                                  \
+    int gtid = __kmp_entry_gtid();                                             \
+    struct kmp_dim *dims =                                                     \
+        (struct kmp_dim *)__kmp_allocate(sizeof(struct kmp_dim) * ncounts);    \
+    MKLOC(loc, KMP_STR(func));                                                 \
+    for (unsigned i = 0; i < ncounts; ++i) {                                   \
+      dims[i].lo = 0;                                                          \
+      dims[i].up = counts[i] - 1;                                              \
+      dims[i].st = 1;                                                          \
+    }                                                                          \
+    __kmpc_doacross_init(&loc, gtid, (int)ncounts, dims);                      \
+    lb = 0;                                                                    \
+    ub = counts[0];                                                            \
+    str = 1;                                                                   \
+    KA_TRACE(20, (KMP_STR(func) ": T#%d, ncounts %u, lb 0x%lx, ub 0x%lx, str " \
+                                "0x%lx, chunk_sz "                             \
+                                "0x%lx\n",                                     \
+                  gtid, ncounts, lb, ub, str, chunk_sz));                      \
+                                                                               \
+    if ((str > 0) ? (lb < ub) : (lb > ub)) {                                   \
+      KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb,                            \
+                        (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz,        \
+                        (schedule) != kmp_sch_static);                         \
+      status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb,            \
+                                 (kmp_int *)p_ub, (kmp_int *)&stride);         \
+      if (status) {                                                            \
+        KMP_DEBUG_ASSERT(stride == str);                                       \
+        *p_ub += (str > 0) ? 1 : -1;                                           \
+      }                                                                        \
+    } else {                                                                   \
+      status = 0;                                                              \
+    }                                                                          \
+    KMP_DOACROSS_FINI(status, gtid);                                           \
+                                                                               \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(                                                              \
+             func) " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, returning %d\n",    \
+         gtid, *p_lb, *p_ub, status));                                         \
+    __kmp_free(dims);                                                          \
+    return status;                                                             \
+  }
+
+// LOOP_DOACROSS_RUNTIME_START(func, schedule) defines
+//   int func(unsigned ncounts, long *counts, long *p_lb, long *p_ub)
+// the doacross loop-start entry point that takes no chunk-size argument
+// (chunk_sz is a local fixed at 0):
+//  - builds an array of ncounts kmp_dim entries, each {lo 0, up counts[i] - 1,
+//    st 1}, and hands it to __kmpc_doacross_init();
+//  - if counts[0] > 0, initialises a dispatch loop over the first dimension
+//    (lb 0, stride 1; the dispatcher is given ub - 1 as its upper bound) and
+//    fetches the first chunk into *p_lb / *p_ub; on success *p_ub is
+//    incremented by one before returning;
+//  - otherwise status is 0 and *p_lb / *p_ub are not written;
+//  - invokes KMP_DOACROSS_FINI(status, gtid) (defined elsewhere), frees the
+//    dims array and returns status.
+// Note that counts[0] is read unconditionally, so ncounts is expected to be at
+// least 1. chunk_sz is a long and is therefore traced with a long conversion
+// specifier.
+#define LOOP_DOACROSS_RUNTIME_START(func, schedule)                            \
+  int func(unsigned ncounts, long *counts, long *p_lb, long *p_ub) {           \
+    int status;                                                                \
+    long stride, lb, ub, str;                                                  \
+    long chunk_sz = 0;                                                         \
+    int gtid = __kmp_entry_gtid();                                             \
+    struct kmp_dim *dims =                                                     \
+        (struct kmp_dim *)__kmp_allocate(sizeof(struct kmp_dim) * ncounts);    \
+    MKLOC(loc, KMP_STR(func));                                                 \
+    for (unsigned i = 0; i < ncounts; ++i) {                                   \
+      dims[i].lo = 0;                                                          \
+      dims[i].up = counts[i] - 1;                                              \
+      dims[i].st = 1;                                                          \
+    }                                                                          \
+    __kmpc_doacross_init(&loc, gtid, (int)ncounts, dims);                      \
+    lb = 0;                                                                    \
+    ub = counts[0];                                                            \
+    str = 1;                                                                   \
+    KA_TRACE(20, (KMP_STR(func) ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, "      \
+                                "chunk_sz 0x%lx\n",                            \
+                  gtid, lb, ub, str, chunk_sz));                               \
+                                                                               \
+    if ((str > 0) ? (lb < ub) : (lb > ub)) {                                   \
+      KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb,                            \
+                        (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz, TRUE); \
+      status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb,            \
+                                 (kmp_int *)p_ub, (kmp_int *)&stride);         \
+      if (status) {                                                            \
+        KMP_DEBUG_ASSERT(stride == str);                                       \
+        *p_ub += (str > 0) ? 1 : -1;                                           \
+      }                                                                        \
+    } else {                                                                   \
+      status = 0;                                                              \
+    }                                                                          \
+    KMP_DOACROSS_FINI(status, gtid);                                           \
+                                                                               \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(                                                              \
+             func) " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, returning %d\n",    \
+         gtid, *p_lb, *p_ub, status));                                         \
+    __kmp_free(dims);                                                          \
+    return status;                                                             \
+  }
+
+// Instantiate the doacross loop-start entry points. The static, dynamic and
+// guided variants take an explicit chunk size (LOOP_DOACROSS_START); the
+// runtime variant does not (LOOP_DOACROSS_RUNTIME_START).
+LOOP_DOACROSS_START(
+    KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_STATIC_START),
+    kmp_sch_static)
+LOOP_DOACROSS_START(
+    KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_DYNAMIC_START),
+    kmp_sch_dynamic_chunked)
+LOOP_DOACROSS_START(
+    KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_GUIDED_START),
+    kmp_sch_guided_chunked)
+LOOP_DOACROSS_RUNTIME_START(
+    KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_RUNTIME_START),
+    kmp_sch_runtime)
+
+// GOMP_loop_end entry point: looks up the calling thread's gtid and runs a
+// plain barrier through __kmp_barrier().
+//
+// When OMPT support is compiled in and enabled, the task frame returned by
+// __ompt_get_task_info_internal(0, ...) has its enter_frame set to this
+// function's frame address for the duration of the barrier, the return
+// address is recorded, and enter_frame is reset to ompt_data_none afterwards.
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_END)(void) {
+  int gtid = __kmp_get_gtid();
+  // Terminate the trace macro like every other statement so the code does not
+  // depend on how KA_TRACE happens to expand.
+  KA_TRACE(20, ("GOMP_loop_end: T#%d\n", gtid));
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  ompt_frame_t *ompt_frame;
+  if (ompt_enabled.enabled) {
+    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+    ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+  }
+#endif
+  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.enabled) {
+    // ompt_frame was filled in by the matching block above.
+    ompt_frame->enter_frame = ompt_data_none;
+  }
+#endif
+
+  KA_TRACE(20, ("GOMP_loop_end exit: T#%d\n", gtid));
+}
+
+// GOMP_loop_end_nowait entry point: performs no synchronisation; it only
+// emits a trace message with the calling thread's gtid.
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_END_NOWAIT)(void) {
+  KA_TRACE(20, ("GOMP_loop_end_nowait: T#%d\n", __kmp_get_gtid()));
+}
+
+// Unsigned long long loop worksharing constructs
+//
+// These are new with gcc 4.4
+
+// LOOP_START_ULL(func, schedule) defines
+//   int func(int up, unsigned long long lb, unsigned long long ub,
+//            unsigned long long str, unsigned long long chunk_sz,
+//            unsigned long long *p_lb, unsigned long long *p_ub)
+// which starts an unsigned-long-long worksharing loop with an explicit chunk
+// size and fetches the first chunk:
+//  - str2 is the signed stride handed to the dispatcher: str when up is
+//    non-zero, otherwise -str;
+//  - when the range test passes, the loop is initialised with
+//    KMP_DISPATCH_INIT_ULL (upper bound ub - 1 or ub + 1 depending on the sign
+//    of str2) and the first chunk is requested with KMP_DISPATCH_NEXT_ULL;
+//    on success *p_ub is adjusted by one;
+//  - otherwise status is 0 and *p_lb / *p_ub are not written.
+// Returns the dispatcher status (non-zero when a chunk was assigned).
+//
+// NOTE(review): the range test and the *p_ub adjustment use the unsigned
+// parameter str, so `str > 0` holds for every non-zero value, while the
+// dispatcher is configured from the signed str2. Whether this is intended for
+// loops with up == 0 depends on how the caller encodes the stride -- confirm.
+#define LOOP_START_ULL(func, schedule)                                         \
+  int func(int up, unsigned long long lb, unsigned long long ub,               \
+           unsigned long long str, unsigned long long chunk_sz,                \
+           unsigned long long *p_lb, unsigned long long *p_ub) {               \
+    int status;                                                                \
+    long long str2 = up ? ((long long)str) : -((long long)str);                \
+    long long stride;                                                          \
+    int gtid = __kmp_entry_gtid();                                             \
+    MKLOC(loc, KMP_STR(func));                                                 \
+                                                                               \
+    KA_TRACE(20, (KMP_STR(func) ": T#%d, up %d, lb 0x%llx, ub 0x%llx, str "    \
+                                "0x%llx, chunk_sz 0x%llx\n",                   \
+                  gtid, up, lb, ub, str, chunk_sz));                           \
+                                                                               \
+    if ((str > 0) ? (lb < ub) : (lb > ub)) {                                   \
+      KMP_DISPATCH_INIT_ULL(&loc, gtid, (schedule), lb,                        \
+                            (str2 > 0) ? (ub - 1) : (ub + 1), str2, chunk_sz,  \
+                            (schedule) != kmp_sch_static);                     \
+      status =                                                                 \
+          KMP_DISPATCH_NEXT_ULL(&loc, gtid, NULL, (kmp_uint64 *)p_lb,          \
+                                (kmp_uint64 *)p_ub, (kmp_int64 *)&stride);     \
+      if (status) {                                                            \
+        KMP_DEBUG_ASSERT(stride == str2);                                      \
+        *p_ub += (str > 0) ? 1 : -1;                                           \
+      }                                                                        \
+    } else {                                                                   \
+      status = 0;                                                              \
+    }                                                                          \
+                                                                               \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(                                                              \
+             func) " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, returning %d\n",  \
+         gtid, *p_lb, *p_ub, status));                                         \
+    return status;                                                             \
+  }
+
+// LOOP_RUNTIME_START_ULL(func, schedule) defines
+//   int func(int up, unsigned long long lb, unsigned long long ub,
+//            unsigned long long str, unsigned long long *p_lb,
+//            unsigned long long *p_ub)
+// the unsigned-long-long loop-start entry point that takes no chunk-size
+// argument (chunk_sz is a local fixed at 0). It mirrors LOOP_START_ULL except
+// that the last argument of KMP_DISPATCH_INIT_ULL is always TRUE and the local
+// stride is declared unsigned (hence the cast in the debug assertion).
+// Returns the dispatcher status (non-zero when a chunk was assigned); when the
+// range test fails, status is 0 and *p_lb / *p_ub are not written.
+//
+// NOTE(review): as in LOOP_START_ULL, the range test and the *p_ub adjustment
+// use the unsigned parameter str rather than the signed str2 -- confirm that
+// this is intended for loops with up == 0.
+#define LOOP_RUNTIME_START_ULL(func, schedule)                                 \
+  int func(int up, unsigned long long lb, unsigned long long ub,               \
+           unsigned long long str, unsigned long long *p_lb,                   \
+           unsigned long long *p_ub) {                                         \
+    int status;                                                                \
+    long long str2 = up ? ((long long)str) : -((long long)str);                \
+    unsigned long long stride;                                                 \
+    unsigned long long chunk_sz = 0;                                           \
+    int gtid = __kmp_entry_gtid();                                             \
+    MKLOC(loc, KMP_STR(func));                                                 \
+                                                                               \
+    KA_TRACE(20, (KMP_STR(func) ": T#%d, up %d, lb 0x%llx, ub 0x%llx, str "    \
+                                "0x%llx, chunk_sz 0x%llx\n",                   \
+                  gtid, up, lb, ub, str, chunk_sz));                           \
+                                                                               \
+    if ((str > 0) ? (lb < ub) : (lb > ub)) {                                   \
+      KMP_DISPATCH_INIT_ULL(&loc, gtid, (schedule), lb,                        \
+                            (str2 > 0) ? (ub - 1) : (ub + 1), str2, chunk_sz,  \
+                            TRUE);                                             \
+      status =                                                                 \
+          KMP_DISPATCH_NEXT_ULL(&loc, gtid, NULL, (kmp_uint64 *)p_lb,          \
+                                (kmp_uint64 *)p_ub, (kmp_int64 *)&stride);     \
+      if (status) {                                                            \
+        KMP_DEBUG_ASSERT((long long)stride == str2);                           \
+        *p_ub += (str > 0) ? 1 : -1;                                           \
+      }                                                                        \
+    } else {                                                                   \
+      status = 0;                                                              \
+    }                                                                          \
+                                                                               \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(                                                              \
+             func) " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, returning %d\n",  \
+         gtid, *p_lb, *p_ub, status));                                         \
+    return status;                                                             \
+  }
+
+// LOOP_NEXT_ULL(func, fini_code) defines
+//   int func(unsigned long long *p_lb, unsigned long long *p_ub)
+// which requests the next chunk of an unsigned-long-long loop from
+// KMP_DISPATCH_NEXT_ULL. fini_code is pasted verbatim immediately before the
+// dispatch call and may refer to the locals `loc` and `gtid`. When a chunk is
+// returned, *p_ub is moved one step further in the direction given by the sign
+// of the stride reported by the dispatcher. Returns the dispatcher status.
+// The exit trace reads `stride` even when no chunk was returned.
+#define LOOP_NEXT_ULL(func, fini_code)                                         \
+  int func(unsigned long long *p_lb, unsigned long long *p_ub) {               \
+    int status;                                                                \
+    long long stride;                                                          \
+    int gtid = __kmp_get_gtid();                                               \
+    MKLOC(loc, KMP_STR(func));                                                 \
+    KA_TRACE(20, (KMP_STR(func) ": T#%d\n", gtid));                            \
+                                                                               \
+    fini_code status =                                                         \
+        KMP_DISPATCH_NEXT_ULL(&loc, gtid, NULL, (kmp_uint64 *)p_lb,            \
+                              (kmp_uint64 *)p_ub, (kmp_int64 *)&stride);       \
+    if (status) {                                                              \
+      *p_ub += (stride > 0) ? 1 : -1;                                          \
+    }                                                                          \
+                                                                               \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(                                                              \
+             func) " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, stride 0x%llx, "  \
+                   "returning %d\n",                                           \
+         gtid, *p_lb, *p_ub, stride, status));                                 \
+    return status;                                                             \
+  }
+
+// Instantiate the unsigned-long-long loop entry points: a *_START / *_NEXT
+// pair for each of the static, dynamic, guided and runtime schedules. The
+// non-ordered *_NEXT variants pass an empty block as fini_code.
+LOOP_START_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_STATIC_START),
+               kmp_sch_static)
+LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_STATIC_NEXT), {})
+LOOP_START_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_START),
+               kmp_sch_dynamic_chunked)
+LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_NEXT), {})
+LOOP_START_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_START),
+               kmp_sch_guided_chunked)
+LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_NEXT), {})
+LOOP_RUNTIME_START_ULL(
+    KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_START), kmp_sch_runtime)
+LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_NEXT), {})
+
+// Ordered unsigned-long-long variants: kmp_ord_* schedules, and each *_NEXT
+// runs KMP_DISPATCH_FINI_CHUNK_ULL(&loc, gtid) before asking for the next
+// chunk.
+LOOP_START_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_START),
+               kmp_ord_static)
+LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_NEXT),
+              { KMP_DISPATCH_FINI_CHUNK_ULL(&loc, gtid); })
+LOOP_START_ULL(
+    KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_START),
+    kmp_ord_dynamic_chunked)
+LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_NEXT),
+              { KMP_DISPATCH_FINI_CHUNK_ULL(&loc, gtid); })
+LOOP_START_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_START),
+               kmp_ord_guided_chunked)
+LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_NEXT),
+              { KMP_DISPATCH_FINI_CHUNK_ULL(&loc, gtid); })
+LOOP_RUNTIME_START_ULL(
+    KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_START),
+    kmp_ord_runtime)
+LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_NEXT),
+              { KMP_DISPATCH_FINI_CHUNK_ULL(&loc, gtid); })
+
+// LOOP_DOACROSS_START_ULL(func, schedule) defines
+//   int func(unsigned ncounts, unsigned long long *counts,
+//            unsigned long long chunk_sz, unsigned long long *p_lb,
+//            unsigned long long *p_ub)
+// the unsigned-long-long doacross loop-start entry point with an explicit
+// chunk size:
+//  - builds an array of ncounts kmp_dim entries, each {lo 0, up counts[i] - 1,
+//    st 1}, and hands it to __kmpc_doacross_init();
+//  - if the first dimension is non-empty, initialises a dispatch loop over it
+//    (lb 0, stride 1; the dispatcher is given ub - 1 as its upper bound) and
+//    fetches the first chunk into *p_lb / *p_ub; on success *p_ub is
+//    incremented by one before returning;
+//  - otherwise status is 0 and *p_lb / *p_ub are not written;
+//  - invokes KMP_DOACROSS_FINI(status, gtid) (defined elsewhere), frees the
+//    dims array and returns status.
+// The locals lb, ub, str and stride are signed long long here, so counts[0]
+// is converted to a signed value before the emptiness test.
+#define LOOP_DOACROSS_START_ULL(func, schedule)                                \
+  int func(unsigned ncounts, unsigned long long *counts,                       \
+           unsigned long long chunk_sz, unsigned long long *p_lb,              \
+           unsigned long long *p_ub) {                                         \
+    int status;                                                                \
+    long long stride, str, lb, ub;                                             \
+    int gtid = __kmp_entry_gtid();                                             \
+    struct kmp_dim *dims =                                                     \
+        (struct kmp_dim *)__kmp_allocate(sizeof(struct kmp_dim) * ncounts);    \
+    MKLOC(loc, KMP_STR(func));                                                 \
+    for (unsigned i = 0; i < ncounts; ++i) {                                   \
+      dims[i].lo = 0;                                                          \
+      dims[i].up = counts[i] - 1;                                              \
+      dims[i].st = 1;                                                          \
+    }                                                                          \
+    __kmpc_doacross_init(&loc, gtid, (int)ncounts, dims);                      \
+    lb = 0;                                                                    \
+    ub = counts[0];                                                            \
+    str = 1;                                                                   \
+                                                                               \
+    KA_TRACE(20, (KMP_STR(func) ": T#%d, lb 0x%llx, ub 0x%llx, str "           \
+                                "0x%llx, chunk_sz 0x%llx\n",                   \
+                  gtid, lb, ub, str, chunk_sz));                               \
+                                                                               \
+    if ((str > 0) ? (lb < ub) : (lb > ub)) {                                   \
+      KMP_DISPATCH_INIT_ULL(&loc, gtid, (schedule), lb,                        \
+                            (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz,    \
+                            (schedule) != kmp_sch_static);                     \
+      status =                                                                 \
+          KMP_DISPATCH_NEXT_ULL(&loc, gtid, NULL, (kmp_uint64 *)p_lb,          \
+                                (kmp_uint64 *)p_ub, (kmp_int64 *)&stride);     \
+      if (status) {                                                            \
+        KMP_DEBUG_ASSERT(stride == str);                                       \
+        *p_ub += (str > 0) ? 1 : -1;                                           \
+      }                                                                        \
+    } else {                                                                   \
+      status = 0;                                                              \
+    }                                                                          \
+    KMP_DOACROSS_FINI(status, gtid);                                           \
+                                                                               \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(                                                              \
+             func) " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, returning %d\n",  \
+         gtid, *p_lb, *p_ub, status));                                         \
+    __kmp_free(dims);                                                          \
+    return status;                                                             \
+  }
+
+// LOOP_DOACROSS_RUNTIME_START_ULL(func, schedule) defines
+//   int func(unsigned ncounts, unsigned long long *counts,
+//            unsigned long long *p_lb, unsigned long long *p_ub)
+// the unsigned-long-long doacross loop-start entry point that takes no
+// chunk-size argument (chunk_sz is a local fixed at 0). It follows the same
+// steps as LOOP_DOACROSS_START_ULL -- doacross init from counts[], dispatch
+// init over [0, counts[0]) with stride 1, first chunk fetched into
+// *p_lb / *p_ub with *p_ub incremented on success, KMP_DOACROSS_FINI, free of
+// the dims array -- except that the last argument of KMP_DISPATCH_INIT_ULL is
+// always TRUE and the locals lb, ub, str and stride are unsigned.
+// Returns the dispatcher status, or 0 when the first dimension is empty.
+#define LOOP_DOACROSS_RUNTIME_START_ULL(func, schedule)                        \
+  int func(unsigned ncounts, unsigned long long *counts,                       \
+           unsigned long long *p_lb, unsigned long long *p_ub) {               \
+    int status;                                                                \
+    unsigned long long stride, str, lb, ub;                                    \
+    unsigned long long chunk_sz = 0;                                           \
+    int gtid = __kmp_entry_gtid();                                             \
+    struct kmp_dim *dims =                                                     \
+        (struct kmp_dim *)__kmp_allocate(sizeof(struct kmp_dim) * ncounts);    \
+    MKLOC(loc, KMP_STR(func));                                                 \
+    for (unsigned i = 0; i < ncounts; ++i) {                                   \
+      dims[i].lo = 0;                                                          \
+      dims[i].up = counts[i] - 1;                                              \
+      dims[i].st = 1;                                                          \
+    }                                                                          \
+    __kmpc_doacross_init(&loc, gtid, (int)ncounts, dims);                      \
+    lb = 0;                                                                    \
+    ub = counts[0];                                                            \
+    str = 1;                                                                   \
+    KA_TRACE(20, (KMP_STR(func) ": T#%d, lb 0x%llx, ub 0x%llx, str "           \
+                                "0x%llx, chunk_sz 0x%llx\n",                   \
+                  gtid, lb, ub, str, chunk_sz));                               \
+                                                                               \
+    if ((str > 0) ? (lb < ub) : (lb > ub)) {                                   \
+      KMP_DISPATCH_INIT_ULL(&loc, gtid, (schedule), lb,                        \
+                            (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz,    \
+                            TRUE);                                             \
+      status =                                                                 \
+          KMP_DISPATCH_NEXT_ULL(&loc, gtid, NULL, (kmp_uint64 *)p_lb,          \
+                                (kmp_uint64 *)p_ub, (kmp_int64 *)&stride);     \
+      if (status) {                                                            \
+        KMP_DEBUG_ASSERT(stride == str);                                       \
+        *p_ub += (str > 0) ? 1 : -1;                                           \
+      }                                                                        \
+    } else {                                                                   \
+      status = 0;                                                              \
+    }                                                                          \
+    KMP_DOACROSS_FINI(status, gtid);                                           \
+                                                                               \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(                                                              \
+             func) " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, returning %d\n",  \
+         gtid, *p_lb, *p_ub, status));                                         \
+    __kmp_free(dims);                                                          \
+    return status;                                                             \
+  }
+
+// Instantiate the unsigned-long-long doacross loop-start entry points. The
+// static, dynamic and guided variants take an explicit chunk size; the runtime
+// variant does not.
+LOOP_DOACROSS_START_ULL(
+    KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_STATIC_START),
+    kmp_sch_static)
+LOOP_DOACROSS_START_ULL(
+    KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_DYNAMIC_START),
+    kmp_sch_dynamic_chunked)
+LOOP_DOACROSS_START_ULL(
+    KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_GUIDED_START),
+    kmp_sch_guided_chunked)
+LOOP_DOACROSS_RUNTIME_START_ULL(
+    KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_RUNTIME_START),
+    kmp_sch_runtime)
+
+// Combined parallel / loop worksharing constructs
+//
+// There are no ull versions (yet).
+
+// PARALLEL_LOOP_START(func, schedule, ompt_pre, ompt_post) defines
+//   void func(void (*task)(void *), void *data, unsigned num_threads,
+//             long lb, long ub, long str, long chunk_sz)
+// the combined "parallel + loop" start entry point:
+//  - ompt_pre and ompt_post are names of function-like macros that are
+//    invoked with an empty argument list before the fork and after the
+//    dispatch initialisation; they may use the local `gtid`;
+//  - if __kmpc_ok_to_fork() allows it and num_threads != 1, the requested
+//    thread count is pushed (when non-zero) and the team is forked through
+//    __kmp_GOMP_fork_call() with __kmp_GOMP_parallel_microtask_wrapper and
+//    nine trailing arguments (task, data, num_threads, &loc, schedule and the
+//    loop bounds / stride / chunk size);
+//  - otherwise a serialized parallel region is entered through
+//    __kmp_GOMP_serialized_parallel();
+//  - in both cases the calling thread then initialises its own dispatch loop
+//    with KMP_DISPATCH_INIT. The upper bound passed on is ub - 1 for a
+//    positive stride and ub + 1 otherwise.
+#define PARALLEL_LOOP_START(func, schedule, ompt_pre, ompt_post)               \
+  void func(void (*task)(void *), void *data, unsigned num_threads, long lb,   \
+            long ub, long str, long chunk_sz) {                                \
+    int gtid = __kmp_entry_gtid();                                             \
+    MKLOC(loc, KMP_STR(func));                                                 \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(                                                              \
+             func) ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz 0x%lx\n",  \
+         gtid, lb, ub, str, chunk_sz));                                        \
+                                                                               \
+    ompt_pre();                                                                \
+                                                                               \
+    if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) {                       \
+      if (num_threads != 0) {                                                  \
+        __kmp_push_num_threads(&loc, gtid, num_threads);                       \
+      }                                                                        \
+      __kmp_GOMP_fork_call(&loc, gtid, task,                                   \
+                           (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, \
+                           9, task, data, num_threads, &loc, (schedule), lb,   \
+                           (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz);    \
+      IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid));                        \
+    } else {                                                                   \
+      __kmp_GOMP_serialized_parallel(&loc, gtid, task);                        \
+      IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid));                        \
+    }                                                                          \
+                                                                               \
+    KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb,                              \
+                      (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz,          \
+                      (schedule) != kmp_sch_static);                           \
+                                                                               \
+    ompt_post();                                                               \
+                                                                               \
+    KA_TRACE(20, (KMP_STR(func) " exit: T#%d\n", gtid));                       \
+  }
+
+// OMPT hooks passed to PARALLEL_LOOP_START as ompt_pre / ompt_post.
+//
+// With OMPT support compiled in, OMPT_LOOP_PRE() declares the local
+// `parent_frame` and, when OMPT is enabled, points its enter_frame at the
+// current frame address and records the return address; it relies on a local
+// `gtid` being in scope. OMPT_LOOP_POST() resets that enter_frame to
+// ompt_data_none and must therefore be expanded in the same scope, after
+// OMPT_LOOP_PRE(). Without OMPT support both macros expand to nothing.
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+
+#define OMPT_LOOP_PRE()                                                        \
+  ompt_frame_t *parent_frame;                                                  \
+  if (ompt_enabled.enabled) {                                                  \
+    __ompt_get_task_info_internal(0, NULL, NULL, &parent_frame, NULL, NULL);   \
+    parent_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);                 \
+    OMPT_STORE_RETURN_ADDRESS(gtid);                                           \
+  }
+
+#define OMPT_LOOP_POST()                                                       \
+  if (ompt_enabled.enabled) {                                                  \
+    parent_frame->enter_frame = ompt_data_none;                                \
+  }
+
+#else
+
+#define OMPT_LOOP_PRE()
+
+#define OMPT_LOOP_POST()
+
+#endif
+
+// Instantiate the combined parallel-loop start entry points, one per
+// schedule kind, all using the OMPT_LOOP_PRE / OMPT_LOOP_POST hooks.
+PARALLEL_LOOP_START(
+    KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC_START),
+    kmp_sch_static, OMPT_LOOP_PRE, OMPT_LOOP_POST)
+PARALLEL_LOOP_START(
+    KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC_START),
+    kmp_sch_dynamic_chunked, OMPT_LOOP_PRE, OMPT_LOOP_POST)
+PARALLEL_LOOP_START(
+    KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED_START),
+    kmp_sch_guided_chunked, OMPT_LOOP_PRE, OMPT_LOOP_POST)
+PARALLEL_LOOP_START(
+    KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME_START),
+    kmp_sch_runtime, OMPT_LOOP_PRE, OMPT_LOOP_POST)
+
+// Tasking constructs
+
+// GOMP_task entry point: allocates a runtime task for `func` and either
+// submits it to the tasking layer or, when if_cond is false, runs it
+// immediately on the calling thread.
+//
+//   func       task body
+//   data       caller's argument block
+//   copy_func  if non-NULL, called as copy_func(dst, data) to populate the
+//              task's private argument block; otherwise arg_size bytes are
+//              copied with KMP_MEMCPY
+//   arg_size   size of the argument block in bytes
+//   arg_align  alignment requested for the task's copy of the argument block
+//   if_cond    when false, no private copy is made (arg_size is forced to 0)
+//              and func(data) is called directly between
+//              __kmpc_omp_task_begin_if0() and __kmpc_omp_task_complete_if0()
+//   gomp_flags bit 0 = untied, bit 1 = final, value 8 = `depend` is valid
+//   depend     when used: depend[0] is the number of dependences, depend[1]
+//              the number of leading entries that are also "out" dependences,
+//              and depend[2 ...] the dependence addresses
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASK)(void (*func)(void *), void *data,
+                                             void (*copy_func)(void *, void *),
+                                             long arg_size, long arg_align,
+                                             bool if_cond, unsigned gomp_flags,
+                                             void **depend) {
+  MKLOC(loc, "GOMP_task");
+  int gtid = __kmp_entry_gtid();
+  kmp_int32 flags = 0;
+  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
+
+  KA_TRACE(20, ("GOMP_task: T#%d\n", gtid));
+
+  // The low-order bit is the "untied" flag
+  if (!(gomp_flags & 1)) {
+    input_flags->tiedness = 1;
+  }
+  // The second low-order bit is the "final" flag
+  if (gomp_flags & 2) {
+    input_flags->final = 1;
+  }
+  input_flags->native = 1;
+  // __kmp_task_alloc() sets up all other flags
+
+  if (!if_cond) {
+    arg_size = 0;
+  }
+
+  // Extra bytes that let the argument block be rounded up to arg_align.
+  // A non-positive alignment gets no padding; without this guard the request
+  // would shrink below arg_size while arg_size bytes are still copied below.
+  const long align_pad = (arg_align > 0) ? (arg_align - 1) : 0;
+
+  kmp_task_t *task = __kmp_task_alloc(
+      &loc, gtid, input_flags, sizeof(kmp_task_t),
+      arg_size ? arg_size + align_pad : 0, (kmp_routine_entry_t)func);
+
+  if (arg_size > 0) {
+    if (arg_align > 0) {
+      // Round the shareds pointer up to the next multiple of arg_align.
+      task->shareds = (void *)((((size_t)task->shareds) + arg_align - 1) /
+                               arg_align * arg_align);
+    }
+    // A non-positive arg_align leaves the shareds pointer as allocated.
+
+    if (copy_func) {
+      (*copy_func)(task->shareds, data);
+    } else {
+      KMP_MEMCPY(task->shareds, data, arg_size);
+    }
+  }
+
+#if OMPT_SUPPORT
+  kmp_taskdata_t *current_task;
+  if (ompt_enabled.enabled) {
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+    current_task = __kmp_threads[gtid]->th.th_current_task;
+    current_task->ompt_task_info.frame.enter_frame.ptr =
+        OMPT_GET_FRAME_ADDRESS(0);
+  }
+#endif
+
+  if (if_cond) {
+    if (gomp_flags & 8) {
+      KMP_ASSERT(depend);
+      const size_t ndeps = (kmp_intptr_t)depend[0];
+      const size_t nout = (kmp_intptr_t)depend[1];
+      // NOTE(review): variable-length array sized by caller-supplied data;
+      // relies on a compiler extension in C++ and lives on the stack.
+      kmp_depend_info_t dep_list[ndeps];
+
+      // Every entry is an "in" dependence; the first nout are "out" as well.
+      for (size_t i = 0U; i < ndeps; i++) {
+        dep_list[i].base_addr = (kmp_intptr_t)depend[2U + i];
+        dep_list[i].len = 0U;
+        dep_list[i].flags.in = 1;
+        dep_list[i].flags.out = (i < nout);
+      }
+      __kmpc_omp_task_with_deps(&loc, gtid, task, ndeps, dep_list, 0, NULL);
+    } else {
+      __kmpc_omp_task(&loc, gtid, task);
+    }
+  } else {
+#if OMPT_SUPPORT
+    ompt_thread_info_t oldInfo;
+    kmp_info_t *thread;
+    kmp_taskdata_t *taskdata;
+    if (ompt_enabled.enabled) {
+      // Store the threads states and restore them after the task
+      thread = __kmp_threads[gtid];
+      taskdata = KMP_TASK_TO_TASKDATA(task);
+      oldInfo = thread->th.ompt_thread_info;
+      thread->th.ompt_thread_info.wait_id = 0;
+      thread->th.ompt_thread_info.state = ompt_state_work_parallel;
+      taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+      OMPT_STORE_RETURN_ADDRESS(gtid);
+    }
+#endif
+
+    // Undeferred execution: run the body on the caller's own data block.
+    __kmpc_omp_task_begin_if0(&loc, gtid, task);
+    func(data);
+    __kmpc_omp_task_complete_if0(&loc, gtid, task);
+
+#if OMPT_SUPPORT
+    if (ompt_enabled.enabled) {
+      thread->th.ompt_thread_info = oldInfo;
+      taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
+    }
+#endif
+  }
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+    // current_task was set by the matching OMPT block above.
+    current_task->ompt_task_info.frame.enter_frame = ompt_data_none;
+  }
+#endif
+
+  KA_TRACE(20, ("GOMP_task exit: T#%d\n", gtid));
+}
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKWAIT)(void) {
+  // GOMP_taskwait entry point: forwards to __kmpc_omp_taskwait for the
+  // calling thread.
+  int gtid = __kmp_entry_gtid();
+  MKLOC(loc, "GOMP_taskwait");
+
+#if OMPT_SUPPORT
+  // Record the caller's return address when OMPT is enabled.
+  if (ompt_enabled.enabled) {
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+  }
+#endif
+
+  KA_TRACE(20, ("GOMP_taskwait: T#%d\n", gtid));
+  __kmpc_omp_taskwait(&loc, gtid);
+  KA_TRACE(20, ("GOMP_taskwait exit: T#%d\n", gtid));
+}
+
+// Sections worksharing constructs
+//
+// For the sections construct, we initialize a dynamically scheduled loop
+// worksharing construct with lb 1 and stride 1, and use the iteration numbers
+// that it returns as section ids.
+//
+// There are no special entry points for ordered sections, so we always use
+// the dynamically scheduled workshare, even if the sections aren't ordered.
+
+unsigned KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SECTIONS_START)(unsigned count) {
+  // Initialize a dynamically scheduled loop over [1, count] and fetch the
+  // first iteration for this thread; its number is returned as the section
+  // id, or 0 when the dispatcher hands out nothing.
+  kmp_int lb, ub, stride;
+  int gtid = __kmp_entry_gtid();
+  MKLOC(loc, "GOMP_sections_start");
+  KA_TRACE(20, ("GOMP_sections_start: T#%d\n", gtid));
+
+  KMP_DISPATCH_INIT(&loc, gtid, kmp_nm_dynamic_chunked, 1, count, 1, 1, TRUE);
+
+  int got_section = KMP_DISPATCH_NEXT(&loc, gtid, NULL, &lb, &ub, &stride);
+  if (!got_section) {
+    lb = 0;
+  } else {
+    // Each chunk handed out must be exactly one iteration (one section).
+    KMP_DEBUG_ASSERT(stride == 1);
+    KMP_DEBUG_ASSERT(lb > 0);
+    KMP_ASSERT(lb == ub);
+  }
+
+  const unsigned section_id = (unsigned)lb;
+  KA_TRACE(20, ("GOMP_sections_start exit: T#%d returning %u\n", gtid,
+                section_id));
+  return section_id;
+}
+
+unsigned KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SECTIONS_NEXT)(void) {
+  // Fetch the next iteration of the sections dispatch loop for this thread;
+  // its number is returned as the section id, or 0 when nothing is handed out.
+  kmp_int lb, ub, stride;
+  int gtid = __kmp_get_gtid();
+  MKLOC(loc, "GOMP_sections_next");
+  KA_TRACE(20, ("GOMP_sections_next: T#%d\n", gtid));
+
+#if OMPT_SUPPORT
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+
+  int got_section = KMP_DISPATCH_NEXT(&loc, gtid, NULL, &lb, &ub, &stride);
+  if (!got_section) {
+    lb = 0;
+  } else {
+    // Each chunk handed out must be exactly one iteration (one section).
+    KMP_DEBUG_ASSERT(stride == 1);
+    KMP_DEBUG_ASSERT(lb > 0);
+    KMP_ASSERT(lb == ub);
+  }
+
+  const unsigned section_id = (unsigned)lb;
+  KA_TRACE(
+      20, ("GOMP_sections_next exit: T#%d returning %u\n", gtid, section_id));
+  return section_id;
+}
+
+// GOMP_parallel_sections_start entry point: forks (or serializes) a parallel
+// region for a sections construct with `count` sections, then initializes the
+// calling thread's dynamic dispatch over [1, count]. The outlined function
+// `task` is not invoked here for the calling thread.
+//
+//   task         outlined region body, called with `data`
+//   data         argument forwarded to `task`
+//   num_threads  0 = no explicit request, 1 = run serialized, otherwise the
+//                value pushed via __kmp_push_num_threads
+//   count        number of sections (upper bound of the dispatch loop)
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_SECTIONS_START)(
+    void (*task)(void *), void *data, unsigned num_threads, unsigned count) {
+  int gtid = __kmp_entry_gtid();
+
+#if OMPT_SUPPORT
+  ompt_frame_t *parent_frame;
+
+  if (ompt_enabled.enabled) {
+    // Publish this frame as the enter frame of the current task while the
+    // runtime is active; it is cleared again after the fork below.
+    __ompt_get_task_info_internal(0, NULL, NULL, &parent_frame, NULL, NULL);
+    parent_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+  }
+#endif
+
+  MKLOC(loc, "GOMP_parallel_sections_start");
+  KA_TRACE(20, ("GOMP_parallel_sections_start: T#%d\n", gtid));
+
+  if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) {
+    if (num_threads != 0) {
+      __kmp_push_num_threads(&loc, gtid, num_threads);
+    }
+    // The 9 trailing arguments are forwarded to the microtask wrapper; they
+    // carry the dispatch parameters (schedule, lb 1, ub count, stride 1,
+    // chunk 1) alongside task/data.
+    __kmp_GOMP_fork_call(&loc, gtid, task,
+                         (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, 9,
+                         task, data, num_threads, &loc, kmp_nm_dynamic_chunked,
+                         (kmp_int)1, (kmp_int)count, (kmp_int)1, (kmp_int)1);
+  } else {
+    __kmp_GOMP_serialized_parallel(&loc, gtid, task);
+  }
+
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+    parent_frame->enter_frame = ompt_data_none;
+  }
+#endif
+
+  // Set up the sections dispatch loop for the calling thread itself.
+  KMP_DISPATCH_INIT(&loc, gtid, kmp_nm_dynamic_chunked, 1, count, 1, 1, TRUE);
+
+  KA_TRACE(20, ("GOMP_parallel_sections_start exit: T#%d\n", gtid));
+}
+
+// GOMP_sections_end entry point: executes a plain barrier for the calling
+// thread. With OMPT enabled, the current task's enter frame is published for
+// the duration of the barrier and cleared afterwards.
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SECTIONS_END)(void) {
+  int gtid = __kmp_get_gtid();
+  KA_TRACE(20, ("GOMP_sections_end: T#%d\n", gtid));
+
+#if OMPT_SUPPORT
+  ompt_frame_t *ompt_frame;
+  if (ompt_enabled.enabled) {
+    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
+    ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+  }
+#endif
+  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+    ompt_frame->enter_frame = ompt_data_none;
+  }
+#endif
+
+  KA_TRACE(20, ("GOMP_sections_end exit: T#%d\n", gtid));
+}
+
+// GOMP_sections_end_nowait entry point: only emits a trace message; no
+// barrier is executed.
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SECTIONS_END_NOWAIT)(void) {
+  KA_TRACE(20, ("GOMP_sections_end_nowait: T#%d\n", __kmp_get_gtid()));
+}
+
+// libgomp has an empty function for GOMP_taskyield as of 2013-10-10, so this
+// entry point only emits a trace message.
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKYIELD)(void) {
+  KA_TRACE(20, ("GOMP_taskyield: T#%d\n", __kmp_get_gtid()));
+}
+
+// GOMP_parallel entry point: forks (or serializes) a parallel region, runs
+// `task(data)` on the calling thread, and then calls the GOMP_parallel_end
+// entry point.
+//
+//   task         outlined region body, called with `data`
+//   data         argument forwarded to `task`
+//   num_threads  0 = no explicit request, 1 = run serialized, otherwise the
+//                value pushed via __kmp_push_num_threads
+//   flags        when non-zero, cast to kmp_proc_bind_t and pushed via
+//                __kmp_push_proc_bind
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL)(void (*task)(void *),
+                                                 void *data,
+                                                 unsigned num_threads,
+                                                 unsigned int flags) {
+  int gtid = __kmp_entry_gtid();
+  MKLOC(loc, "GOMP_parallel");
+  KA_TRACE(20, ("GOMP_parallel: T#%d\n", gtid));
+
+#if OMPT_SUPPORT
+  ompt_task_info_t *parent_task_info, *task_info;
+  if (ompt_enabled.enabled) {
+    // Publish this frame as the enter frame of the encountering task; it is
+    // cleared at the end of the function.
+    parent_task_info = __ompt_get_task_info_object(0);
+    parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+  }
+#endif
+  if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) {
+    if (num_threads != 0) {
+      __kmp_push_num_threads(&loc, gtid, num_threads);
+    }
+    if (flags != 0) {
+      __kmp_push_proc_bind(&loc, gtid, (kmp_proc_bind_t)flags);
+    }
+    __kmp_GOMP_fork_call(&loc, gtid, task,
+                         (microtask_t)__kmp_GOMP_microtask_wrapper, 2, task,
+                         data);
+  } else {
+    __kmp_GOMP_serialized_parallel(&loc, gtid, task);
+  }
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+    // Task info object 0 is looked up again after the fork and its exit frame
+    // is set before the calling thread runs the region body.
+    task_info = __ompt_get_task_info_object(0);
+    task_info->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+  }
+#endif
+  // The calling thread executes the region body itself.
+  task(data);
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+  }
+#endif
+  KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)();
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+    task_info->frame.exit_frame = ompt_data_none;
+    parent_task_info->frame.enter_frame = ompt_data_none;
+  }
+#endif
+}
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_SECTIONS)(void (*task)(void *),
+                                                          void *data,
+                                                          unsigned num_threads,
+                                                          unsigned count,
+                                                          unsigned flags) {
+  // Combined parallel + sections entry point: fork (or serialize) the region,
+  // set up the calling thread's dispatch over [1, count], run task(data) on
+  // the calling thread and finish through the GOMP_parallel_end entry point.
+  MKLOC(loc, "GOMP_parallel_sections");
+  int gtid = __kmp_entry_gtid();
+  KA_TRACE(20, ("GOMP_parallel_sections: T#%d\n", gtid));
+
+#if OMPT_SUPPORT
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+
+  if (!__kmpc_ok_to_fork(&loc) || num_threads == 1) {
+    // Forking is not allowed or a single thread was requested.
+    __kmp_GOMP_serialized_parallel(&loc, gtid, task);
+  } else {
+    if (num_threads != 0) {
+      __kmp_push_num_threads(&loc, gtid, num_threads);
+    }
+    if (flags != 0) {
+      __kmp_push_proc_bind(&loc, gtid, (kmp_proc_bind_t)flags);
+    }
+    __kmp_GOMP_fork_call(&loc, gtid, task,
+                         (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, 9,
+                         task, data, num_threads, &loc, kmp_nm_dynamic_chunked,
+                         (kmp_int)1, (kmp_int)count, (kmp_int)1, (kmp_int)1);
+  }
+
+#if OMPT_SUPPORT
+  OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+
+  // Dispatch setup for the calling thread, which also executes the body.
+  KMP_DISPATCH_INIT(&loc, gtid, kmp_nm_dynamic_chunked, 1, count, 1, 1, TRUE);
+  task(data);
+
+  KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)();
+  KA_TRACE(20, ("GOMP_parallel_sections exit: T#%d\n", gtid));
+}
+
+// PARALLEL_LOOP(func, schedule, ompt_pre, ompt_post) defines the combined
+// "parallel + loop" entry point `func` for the kmp schedule kind `schedule`.
+//
+// The generated function:
+//   1. expands ompt_pre(),
+//   2. forks the region (pushing num_threads / proc-bind `flags` when they
+//      are non-zero) or serializes it when forking is not allowed or
+//      num_threads == 1,
+//   3. initializes the calling thread's dispatch for the loop,
+//   4. runs task(data) on the calling thread,
+//   5. calls the GOMP_parallel_end entry point and expands ompt_post().
+//
+// The upper bound handed to the runtime is ub - 1 for a positive stride and
+// ub + 1 otherwise. The last KMP_DISPATCH_INIT argument is false only for
+// kmp_sch_static.
+#define PARALLEL_LOOP(func, schedule, ompt_pre, ompt_post)                     \
+  void func(void (*task)(void *), void *data, unsigned num_threads, long lb,   \
+            long ub, long str, long chunk_sz, unsigned flags) {                \
+    int gtid = __kmp_entry_gtid();                                             \
+    MKLOC(loc, KMP_STR(func));                                                 \
+    KA_TRACE(                                                                  \
+        20,                                                                    \
+        (KMP_STR(                                                              \
+             func) ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz 0x%lx\n",  \
+         gtid, lb, ub, str, chunk_sz));                                        \
+                                                                               \
+    ompt_pre();                                                                \
+    if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) {                       \
+      if (num_threads != 0) {                                                  \
+        __kmp_push_num_threads(&loc, gtid, num_threads);                       \
+      }                                                                        \
+      if (flags != 0) {                                                        \
+        __kmp_push_proc_bind(&loc, gtid, (kmp_proc_bind_t)flags);              \
+      }                                                                        \
+      __kmp_GOMP_fork_call(&loc, gtid, task,                                   \
+                           (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, \
+                           9, task, data, num_threads, &loc, (schedule), lb,   \
+                           (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz);    \
+    } else {                                                                   \
+      __kmp_GOMP_serialized_parallel(&loc, gtid, task);                        \
+    }                                                                          \
+                                                                               \
+    IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);)                          \
+    KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb,                              \
+                      (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz,          \
+                      (schedule) != kmp_sch_static);                           \
+    task(data);                                                                \
+    KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)();                         \
+    ompt_post();                                                               \
+                                                                               \
+    KA_TRACE(20, (KMP_STR(func) " exit: T#%d\n", gtid));                       \
+  }
+
+// One entry point per schedule kind: static, dynamic, guided and runtime.
+PARALLEL_LOOP(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC),
+              kmp_sch_static, OMPT_LOOP_PRE, OMPT_LOOP_POST)
+PARALLEL_LOOP(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC),
+              kmp_sch_dynamic_chunked, OMPT_LOOP_PRE, OMPT_LOOP_POST)
+PARALLEL_LOOP(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED),
+              kmp_sch_guided_chunked, OMPT_LOOP_PRE, OMPT_LOOP_POST)
+PARALLEL_LOOP(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME),
+              kmp_sch_runtime, OMPT_LOOP_PRE, OMPT_LOOP_POST)
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKGROUP_START)(void) {
+  // GOMP_taskgroup_start entry point: forwards to __kmpc_taskgroup.
+  MKLOC(loc, "GOMP_taskgroup_start");
+  int gtid = __kmp_entry_gtid();
+  KA_TRACE(20, ("GOMP_taskgroup_start: T#%d\n", gtid));
+
+#if OMPT_SUPPORT
+  // Record the caller's return address when OMPT is enabled.
+  if (ompt_enabled.enabled) {
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+  }
+#endif
+
+  __kmpc_taskgroup(&loc, gtid);
+}
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKGROUP_END)(void) {
+  // GOMP_taskgroup_end entry point: forwards to __kmpc_end_taskgroup.
+  MKLOC(loc, "GOMP_taskgroup_end");
+  int gtid = __kmp_get_gtid();
+  KA_TRACE(20, ("GOMP_taskgroup_end: T#%d\n", gtid));
+
+#if OMPT_SUPPORT
+  // Record the caller's return address when OMPT is enabled.
+  if (ompt_enabled.enabled) {
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+  }
+#endif
+
+  __kmpc_end_taskgroup(&loc, gtid);
+}
+
+// Map a GOMP cancellation kind (1, 2, 4 or 8) to the matching cancel_* value;
+// any other input maps to 0.
+static kmp_int32 __kmp_gomp_to_omp_cancellation_kind(int gomp_kind) {
+  if (gomp_kind == 1) {
+    return cancel_parallel;
+  }
+  if (gomp_kind == 2) {
+    return cancel_loop;
+  }
+  if (gomp_kind == 4) {
+    return cancel_sections;
+  }
+  if (gomp_kind == 8) {
+    return cancel_taskgroup;
+  }
+  return 0;
+}
+
+// GOMP_cancellation_point entry point.
+// Returns true when cancellation should take place, false otherwise.
+bool KMP_EXPAND_NAME(KMP_API_NAME_GOMP_CANCELLATION_POINT)(int which) {
+  MKLOC(loc, "GOMP_cancellation_point");
+  int gtid = __kmp_get_gtid();
+  KA_TRACE(20, ("GOMP_cancellation_point: T#%d which:%d\n", gtid, which));
+  // Translate the GOMP kind before handing it to the runtime.
+  return __kmpc_cancellationpoint(&loc, gtid,
+                                  __kmp_gomp_to_omp_cancellation_kind(which));
+}
+
+// GOMP_cancel entry point. With do_cancel set, cancellation of the construct
+// kind `which` is requested; otherwise this acts as a cancellation point.
+// Returns true when cancellation should take place, false otherwise.
+bool KMP_EXPAND_NAME(KMP_API_NAME_GOMP_CANCEL)(int which, bool do_cancel) {
+  MKLOC(loc, "GOMP_cancel");
+  int gtid = __kmp_get_gtid();
+  KA_TRACE(20, ("GOMP_cancel: T#%d which:%d do_cancel:%d\n", gtid, which,
+                (int)do_cancel));
+  const kmp_int32 kind = __kmp_gomp_to_omp_cancellation_kind(which);
+
+  if (do_cancel) {
+    return __kmpc_cancel(&loc, gtid, kind);
+  }
+  return __kmpc_cancellationpoint(&loc, gtid, kind);
+}
+
+// GOMP_barrier_cancel entry point: forwards to __kmp_barrier_gomp_cancel.
+// Returns true when cancellation should take place, false otherwise.
+bool KMP_EXPAND_NAME(KMP_API_NAME_GOMP_BARRIER_CANCEL)(void) {
+  const int gtid = __kmp_get_gtid();
+  KA_TRACE(20, ("GOMP_barrier_cancel: T#%d\n", gtid));
+  const bool cancelled = __kmp_barrier_gomp_cancel(gtid);
+  return cancelled;
+}
+
+// GOMP_sections_end_cancel entry point: forwards to
+// __kmp_barrier_gomp_cancel.
+// Returns true when cancellation should take place, false otherwise.
+bool KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SECTIONS_END_CANCEL)(void) {
+  const int gtid = __kmp_get_gtid();
+  KA_TRACE(20, ("GOMP_sections_end_cancel: T#%d\n", gtid));
+  const bool cancelled = __kmp_barrier_gomp_cancel(gtid);
+  return cancelled;
+}
+
+// GOMP_loop_end_cancel entry point: forwards to __kmp_barrier_gomp_cancel.
+// Returns true when cancellation should take place, false otherwise.
+bool KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_END_CANCEL)(void) {
+  const int gtid = __kmp_get_gtid();
+  KA_TRACE(20, ("GOMP_loop_end_cancel: T#%d\n", gtid));
+  const bool cancelled = __kmp_barrier_gomp_cancel(gtid);
+  return cancelled;
+}
+
+// All target functions are empty as of 2014-05-29
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TARGET)(int device, void (*fn)(void *),
+                                               const void *openmp_target,
+                                               size_t mapnum, void **hostaddrs,
+                                               size_t *sizes,
+                                               unsigned char *kinds) {
+  // Intentionally empty: every argument is ignored.
+}
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TARGET_DATA)(
+    int device, const void *openmp_target, size_t mapnum, void **hostaddrs,
+    size_t *sizes, unsigned char *kinds) {
+  // Intentionally empty: every argument is ignored.
+}
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TARGET_END_DATA)(void) {
+  // Intentionally empty.
+}
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TARGET_UPDATE)(
+    int device, const void *openmp_target, size_t mapnum, void **hostaddrs,
+    size_t *sizes, unsigned char *kinds) {
+  // Intentionally empty: every argument is ignored.
+}
+
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TEAMS)(unsigned int num_teams,
+                                              unsigned int thread_limit) {
+  // Intentionally empty: both arguments are ignored.
+}
+
+// Task duplication callback: copies the shareds of `src` into `dest` (both
+// are preallocated task structures) using the copy function stored in the
+// source task's taskdata, if there is one. `last_private` is not used.
+static void __kmp_gomp_task_dup(kmp_task_t *dest, kmp_task_t *src,
+                                kmp_int32 last_private) {
+  kmp_taskdata_t *src_taskdata = KMP_TASK_TO_TASKDATA(src);
+  if (!src_taskdata->td_copy_func) {
+    return; // nothing registered, nothing to copy
+  }
+  (src_taskdata->td_copy_func)(dest->shareds, src->shareds);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+// Common implementation of the GOMP_taskloop entry points; T is the loop
+// counter type (instantiated with long and unsigned long long in this file).
+//
+//   func       outlined task body
+//   data       caller-owned argument block; it must be at least 2 * sizeof(T)
+//              bytes, because the first two T-sized slots of the task's copy
+//              are overwritten with the loop bounds
+//   copy_func  optional (dest, src) copy routine, stored in the taskdata and
+//              applied through __kmp_gomp_task_dup when non-NULL
+//   arg_size   size in bytes of the argument block
+//   arg_align  alignment for the task's copy of the block (must be > 0)
+//   gomp_flags bit 0 = untied, bit 1 = final, bit 8 = step is positive ("up"),
+//              bit 9 = num_tasks holds a grainsize, bit 10 = if clause value,
+//              bit 11 = nogroup
+//   num_tasks  grainsize or num_tasks value (see bit 9); 0 = neither given
+//   priority   only traced here
+//              NOTE(review): not forwarded to the runtime in this function -
+//              confirm whether that is intended
+//   start/end/step  loop bounds and increment as passed by the compiler
+template <typename T>
+void __GOMP_taskloop(void (*func)(void *), void *data,
+                     void (*copy_func)(void *, void *), long arg_size,
+                     long arg_align, unsigned gomp_flags,
+                     unsigned long num_tasks, int priority, T start, T end,
+                     T step) {
+  typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
+  MKLOC(loc, "GOMP_taskloop");
+  int sched;
+  T *loop_bounds;
+  int gtid = __kmp_entry_gtid();
+  kmp_int32 flags = 0;
+  // These hold the masked bit (zero / non-zero), not a normalized 0/1.
+  int if_val = gomp_flags & (1u << 10);
+  int nogroup = gomp_flags & (1u << 11);
+  int up = gomp_flags & (1u << 8);
+  p_task_dup_t task_dup = NULL;
+  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
+#ifdef KMP_DEBUG
+  {
+    // The format string is built at run time because the conversion
+    // specifier for start/end/step depends on T.
+    char *buff;
+    buff = __kmp_str_format(
+        "GOMP_taskloop: T#%%d: func:%%p data:%%p copy_func:%%p "
+        "arg_size:%%ld arg_align:%%ld gomp_flags:0x%%x num_tasks:%%lu "
+        "priority:%%d start:%%%s end:%%%s step:%%%s\n",
+        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
+    KA_TRACE(20, (buff, gtid, func, data, copy_func, arg_size, arg_align,
+                  gomp_flags, num_tasks, priority, start, end, step));
+    __kmp_str_free(&buff);
+  }
+#endif
+  KMP_ASSERT((size_t)arg_size >= 2 * sizeof(T));
+  KMP_ASSERT(arg_align > 0);
+  // The low-order bit is the "untied" flag
+  if (!(gomp_flags & 1)) {
+    input_flags->tiedness = 1;
+  }
+  // The second low-order bit is the "final" flag
+  if (gomp_flags & 2) {
+    input_flags->final = 1;
+  }
+  // Negative step flag
+  if (!up) {
+    // If step is flagged as negative, but isn't properly sign extended
+    // Then manually sign extend it.  Could be a short, int, char embedded
+    // in a long.  So cannot assume any cast.
+    if (step > 0) {
+      // Set every bit above the highest 1 bit of step.
+      for (int i = sizeof(T) * CHAR_BIT - 1; i >= 0L; --i) {
+        // break at the first 1 bit
+        if (step & ((T)1 << i))
+          break;
+        step |= ((T)1 << i);
+      }
+    }
+  }
+  input_flags->native = 1;
+  // Figure out if none/grainsize/num_tasks clause specified
+  if (num_tasks > 0) {
+    if (gomp_flags & (1u << 9))
+      sched = 1; // grainsize specified
+    else
+      sched = 2; // num_tasks specified
+    // neither grainsize nor num_tasks specified
+  } else {
+    sched = 0;
+  }
+
+  // __kmp_task_alloc() sets up all other flags
+  // Over-allocate by (arg_align - 1) bytes so shareds can be aligned below.
+  kmp_task_t *task =
+      __kmp_task_alloc(&loc, gtid, input_flags, sizeof(kmp_task_t),
+                       arg_size + arg_align - 1, (kmp_routine_entry_t)func);
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+  taskdata->td_copy_func = copy_func;
+  taskdata->td_size_loop_bounds = sizeof(T);
+
+  // re-align shareds if needed and setup firstprivate copy constructors
+  // through the task_dup mechanism
+  task->shareds = (void *)((((size_t)task->shareds) + arg_align - 1) /
+                           arg_align * arg_align);
+  if (copy_func) {
+    task_dup = __kmp_gomp_task_dup;
+  }
+  KMP_MEMCPY(task->shareds, data, arg_size);
+
+  // The first two T-sized slots of the argument block carry the loop bounds.
+  // `end` is moved one step toward `start` (presumably converting an
+  // exclusive bound into an inclusive one - TODO confirm).
+  loop_bounds = (T *)task->shareds;
+  loop_bounds[0] = start;
+  loop_bounds[1] = end + (up ? -1 : 1);
+  __kmpc_taskloop(&loc, gtid, task, if_val, (kmp_uint64 *)&(loop_bounds[0]),
+                  (kmp_uint64 *)&(loop_bounds[1]), (kmp_int64)step, nogroup,
+                  sched, (kmp_uint64)num_tasks, (void *)task_dup);
+}
+
+// 4 byte version of GOMP_doacross_post
+// This version needs to create a temporary array which converts 4 byte
+// integers into 8 byte integers
+template <typename T, bool need_conversion = (sizeof(long) == 4)>
+void __kmp_GOMP_doacross_post(T *count);
+
+template <> void __kmp_GOMP_doacross_post<long, true>(long *count) {
+  // Widen each entry of `count` into a temporary kmp_int64 vector, post it,
+  // and release the temporary. The number of entries is read from slot 0 of
+  // the thread's doacross info.
+  MKLOC(loc, "GOMP_doacross_post");
+  int gtid = __kmp_entry_gtid();
+  kmp_info_t *this_thr = __kmp_threads[gtid];
+  kmp_int64 ndims = this_thr->th.th_dispatch->th_doacross_info[0];
+  kmp_int64 *widened = (kmp_int64 *)__kmp_thread_malloc(
+      this_thr, sizeof(kmp_int64) * ndims);
+  for (kmp_int64 dim = 0; dim < ndims; ++dim) {
+    widened[dim] = (kmp_int64)count[dim];
+  }
+  __kmpc_doacross_post(&loc, gtid, widened);
+  __kmp_thread_free(this_thr, widened);
+}
+
+// 8 byte versions of GOMP_doacross_post
+// This version can just pass in the count array directly instead of creating
+// a temporary array
+template <> void __kmp_GOMP_doacross_post<long, false>(long *count) {
+  // No conversion needed: the count array is reinterpreted as kmp_int64 and
+  // handed to the runtime directly.
+  MKLOC(loc, "GOMP_doacross_post");
+  int gtid = __kmp_entry_gtid();
+  kmp_int64 *vec = RCAST(kmp_int64 *, count);
+  __kmpc_doacross_post(&loc, gtid, vec);
+}
+
+// Collect `first` plus the remaining variadic values of type T into a
+// temporary kmp_int64 vector and wait on it. The number of values is read
+// from slot 0 of the thread's doacross info.
+template <typename T> void __kmp_GOMP_doacross_wait(T first, va_list args) {
+  MKLOC(loc, "GOMP_doacross_wait");
+  int gtid = __kmp_entry_gtid();
+  kmp_info_t *this_thr = __kmp_threads[gtid];
+  kmp_int64 ndims = this_thr->th.th_dispatch->th_doacross_info[0];
+  kmp_int64 *widened = (kmp_int64 *)__kmp_thread_malloc(
+      this_thr, sizeof(kmp_int64) * ndims);
+  widened[0] = (kmp_int64)first;
+  for (kmp_int64 dim = 1; dim < ndims; ++dim) {
+    T next = va_arg(args, T);
+    widened[dim] = (kmp_int64)next;
+  }
+  __kmpc_doacross_wait(&loc, gtid, widened);
+  __kmp_thread_free(this_thr, widened);
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// GOMP_taskloop entry point for `long` loop counters; forwards everything to
+// the shared template implementation.
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKLOOP)(
+    void (*func)(void *), void *data, void (*copy_func)(void *, void *),
+    long arg_size, long arg_align, unsigned gomp_flags, unsigned long num_tasks,
+    int priority, long start, long end, long step) {
+  __GOMP_taskloop<long>(func, data, copy_func, arg_size, arg_align,
+                        gomp_flags, num_tasks, priority, start, end, step);
+}
+
+// GOMP_taskloop_ull entry point for `unsigned long long` loop counters;
+// forwards everything to the shared template implementation.
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKLOOP_ULL)(
+    void (*func)(void *), void *data, void (*copy_func)(void *, void *),
+    long arg_size, long arg_align, unsigned gomp_flags, unsigned long num_tasks,
+    int priority, unsigned long long start, unsigned long long end,
+    unsigned long long step) {
+  __GOMP_taskloop<unsigned long long>(func, data, copy_func, arg_size,
+                                      arg_align, gomp_flags, num_tasks,
+                                      priority, start, end, step);
+}
+
+// GOMP_doacross_post entry point; the template picks the converting or the
+// pass-through specialization based on sizeof(long).
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_DOACROSS_POST)(long *count) {
+  __kmp_GOMP_doacross_post(count);
+}
+
+// GOMP_doacross_wait entry point: `first` and the variadic values that
+// follow are all of type long.
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_DOACROSS_WAIT)(long first, ...) {
+  va_list ap;
+  va_start(ap, first);
+  __kmp_GOMP_doacross_wait<long>(first, ap);
+  va_end(ap);
+}
+
+// GOMP_doacross_ull_post entry point: the count array is reinterpreted as
+// kmp_int64 and handed to the runtime directly.
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_DOACROSS_ULL_POST)(
+    unsigned long long *count) {
+  MKLOC(loc, "GOMP_doacross_ull_post");
+  int gtid = __kmp_entry_gtid();
+  kmp_int64 *vec = RCAST(kmp_int64 *, count);
+  __kmpc_doacross_post(&loc, gtid, vec);
+}
+
+// GOMP_doacross_ull_wait entry point: `first` and the variadic values that
+// follow are all of type unsigned long long.
+void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_DOACROSS_ULL_WAIT)(
+    unsigned long long first, ...) {
+  va_list ap;
+  va_start(ap, first);
+  __kmp_GOMP_doacross_wait<unsigned long long>(first, ap);
+  va_end(ap);
+}
+
+/* The following sections of code create aliases for the GOMP_* functions, then
+   create versioned symbols using the assembler directive .symver. This is only
+   pertinent for ELF .so library. The KMP_VERSION_SYMBOL macro is defined in
+   kmp_os.h  */
+
+#ifdef KMP_USE_VERSION_SYMBOLS
+// GOMP_1.0 versioned symbols
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_ATOMIC_END, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_ATOMIC_START, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_BARRIER, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_CRITICAL_END, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_CRITICAL_NAME_END, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_CRITICAL_NAME_START, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_CRITICAL_START, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_DYNAMIC_NEXT, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_DYNAMIC_START, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_END, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_END_NOWAIT, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_GUIDED_NEXT, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_GUIDED_START, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_NEXT, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_START, 10,
+                   "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_NEXT, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_START, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_NEXT, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_START, 10,
+                   "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_NEXT, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_START, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_RUNTIME_NEXT, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_RUNTIME_START, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_STATIC_NEXT, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_STATIC_START, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_ORDERED_END, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_ORDERED_START, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_END, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC_START, 10,
+                   "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED_START, 10,
+                   "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME_START, 10,
+                   "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC_START, 10,
+                   "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_SECTIONS_START, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_START, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SECTIONS_END, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SECTIONS_END_NOWAIT, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SECTIONS_NEXT, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SECTIONS_START, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SINGLE_COPY_END, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SINGLE_COPY_START, 10, "GOMP_1.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SINGLE_START, 10, "GOMP_1.0");
+
+// GOMP_2.0 versioned symbols
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASK, 20, "GOMP_2.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKWAIT, 20, "GOMP_2.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_NEXT, 20, "GOMP_2.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_START, 20, "GOMP_2.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_NEXT, 20, "GOMP_2.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_START, 20, "GOMP_2.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_NEXT, 20,
+                   "GOMP_2.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_START, 20,
+                   "GOMP_2.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_NEXT, 20,
+                   "GOMP_2.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_START, 20,
+                   "GOMP_2.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_NEXT, 20,
+                   "GOMP_2.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_START, 20,
+                   "GOMP_2.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_NEXT, 20,
+                   "GOMP_2.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_START, 20,
+                   "GOMP_2.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_NEXT, 20, "GOMP_2.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_START, 20, "GOMP_2.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_STATIC_NEXT, 20, "GOMP_2.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_STATIC_START, 20, "GOMP_2.0");
+
+// GOMP_3.0 versioned symbols
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKYIELD, 30, "GOMP_3.0");
+
+// GOMP_4.0 versioned symbols
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL, 40, "GOMP_4.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_SECTIONS, 40, "GOMP_4.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC, 40, "GOMP_4.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED, 40, "GOMP_4.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME, 40, "GOMP_4.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC, 40, "GOMP_4.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKGROUP_START, 40, "GOMP_4.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKGROUP_END, 40, "GOMP_4.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_BARRIER_CANCEL, 40, "GOMP_4.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_CANCEL, 40, "GOMP_4.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_CANCELLATION_POINT, 40, "GOMP_4.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_END_CANCEL, 40, "GOMP_4.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SECTIONS_END_CANCEL, 40, "GOMP_4.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TARGET, 40, "GOMP_4.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TARGET_DATA, 40, "GOMP_4.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TARGET_END_DATA, 40, "GOMP_4.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TARGET_UPDATE, 40, "GOMP_4.0");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TEAMS, 40, "GOMP_4.0");
+
+// GOMP_4.5 versioned symbols
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKLOOP, 45, "GOMP_4.5");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKLOOP_ULL, 45, "GOMP_4.5");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_DOACROSS_POST, 45, "GOMP_4.5");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_DOACROSS_WAIT, 45, "GOMP_4.5");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_DOACROSS_STATIC_START, 45,
+                   "GOMP_4.5");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_DOACROSS_DYNAMIC_START, 45,
+                   "GOMP_4.5");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_DOACROSS_GUIDED_START, 45,
+                   "GOMP_4.5");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_DOACROSS_RUNTIME_START, 45,
+                   "GOMP_4.5");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_DOACROSS_ULL_POST, 45, "GOMP_4.5");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_DOACROSS_ULL_WAIT, 45, "GOMP_4.5");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_STATIC_START, 45,
+                   "GOMP_4.5");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_DYNAMIC_START, 45,
+                   "GOMP_4.5");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_GUIDED_START, 45,
+                   "GOMP_4.5");
+KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_RUNTIME_START, 45,
+                   "GOMP_4.5");
+
+#endif // KMP_USE_VERSION_SYMBOLS
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
diff --git a/final/runtime/src/kmp_i18n.cpp b/final/runtime/src/kmp_i18n.cpp
new file mode 100644
index 0000000..53c4427
--- /dev/null
+++ b/final/runtime/src/kmp_i18n.cpp
@@ -0,0 +1,871 @@
+/*
+ * kmp_i18n.cpp
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp_i18n.h"
+
+#include "kmp.h"
+#include "kmp_debug.h"
+#include "kmp_io.h" // __kmp_printf.
+#include "kmp_lock.h"
+#include "kmp_os.h"
+
+#include <errno.h>
+#include <locale.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "kmp_environment.h"
+#include "kmp_i18n_default.inc"
+#include "kmp_str.h"
+
+#undef KMP_I18N_OK
+
+#define get_section(id) ((id) >> 16) // High 16 bits of an id: section index.
+#define get_number(id) ((id)&0xFFFF) // Low 16 bits: message index in section.
+
+kmp_msg_t __kmp_msg_null = {kmp_mt_dummy, 0, NULL, 0}; // Ends __kmp_msg() lists.
+static char const *no_message_available = "(No message available)"; // Fallback.
+
+static void __kmp_msg(kmp_msg_severity_t severity, kmp_msg_t message,
+                      va_list ap); // va_list flavour, defined later in this file.
+
+enum kmp_i18n_cat_status {
+  KMP_I18N_CLOSED, // Catalog has not been opened yet, or has been closed.
+  KMP_I18N_OPENED, // Opened successfully, ready to use.
+  KMP_I18N_ABSENT // Opening failed or was skipped; use default messages only.
+}; // enum kmp_i18n_cat_status
+typedef enum kmp_i18n_cat_status kmp_i18n_cat_status_t;
+static volatile kmp_i18n_cat_status_t status = KMP_I18N_CLOSED; // Catalog state.
+
+/* Message catalog is opened at first usage, so we have to synchronize opening
+   to avoid race and multiple openings.
+
+   Closing does not require synchronization, because catalog is closed very late
+   at library shutting down, when no other threads are alive.  */
+
+static void __kmp_i18n_do_catopen();
+static kmp_bootstrap_lock_t lock = KMP_BOOTSTRAP_LOCK_INITIALIZER(lock);
+// The `lock' variable could be placed inside the __kmp_i18n_catopen function
+// because it is used only by that function. But we are afraid a (buggy)
+// compiler may treat it wrongly, so we keep it outside, just in case.
+
+// Open the message catalog unless it has already been opened or marked absent.
+// `status' is tested once without the lock (cheap common case) and once more
+// with the bootstrap lock held, so that at most one caller ends up running
+// __kmp_i18n_do_catopen().
+void __kmp_i18n_catopen() {
+  if (status != KMP_I18N_CLOSED) {
+    return; // Somebody has already opened (or tried to open) the catalog.
+  }
+  __kmp_acquire_bootstrap_lock(&lock);
+  if (status == KMP_I18N_CLOSED) {
+    __kmp_i18n_do_catopen();
+  }
+  __kmp_release_bootstrap_lock(&lock);
+} // func __kmp_i18n_catopen
+
+/* Linux* OS and OS X* part */
+#if KMP_OS_UNIX
+#define KMP_I18N_OK
+
+#include <nl_types.h>
+
+#define KMP_I18N_NULLCAT ((nl_catd)(-1)) // `cat' value meaning "no catalog".
+static nl_catd cat = KMP_I18N_NULLCAT; // Catalog descriptor. !!! Shall it be volatile?
+static char const *name =
+    (KMP_VERSION_MAJOR == 4 ? "libguide.cat" : "libomp.cat"); // Passed to catopen().
+
+/* Useful links:
+http://www.opengroup.org/onlinepubs/000095399/basedefs/xbd_chap08.html#tag_08_02
+http://www.opengroup.org/onlinepubs/000095399/functions/catopen.html
+http://www.opengroup.org/onlinepubs/000095399/functions/setlocale.html
+*/
+
+/* Open the message catalog (Linux* OS and OS X* implementation).
+
+   Expects `status' to be KMP_I18N_CLOSED and `cat' to be KMP_I18N_NULLCAT on
+   entry. On return `status' is KMP_I18N_OPENED if the catalog `name' was opened
+   and its version string equals the version of the built-in default messages,
+   and KMP_I18N_ABSENT otherwise (English locale, catopen() failure, or version
+   mismatch). Warnings are issued only if __kmp_generate_warnings is above
+   kmp_warnings_low. */
+void __kmp_i18n_do_catopen() {
+  int english = 0;
+  char *lang = __kmp_env_get("LANG");
+  // TODO: What about LC_ALL or LC_MESSAGES?
+
+  KMP_DEBUG_ASSERT(status == KMP_I18N_CLOSED);
+  KMP_DEBUG_ASSERT(cat == KMP_I18N_NULLCAT);
+
+  english = lang == NULL || // In all these cases English language is used.
+            strcmp(lang, "") == 0 || strcmp(lang, " ") == 0 ||
+            // Workaround for Fortran RTL bug DPD200137873 "Fortran runtime
+            // resets LANG env var to space if it is not set".
+            strcmp(lang, "C") == 0 || strcmp(lang, "POSIX") == 0;
+
+  if (!english) { // English language is not yet detected, let us continue.
+    // Format of LANG is: [language[_territory][.codeset][@modifier]]
+    // Strip all parts except language.
+    char *tail = NULL;
+    __kmp_str_split(lang, '@', &lang, &tail);
+    __kmp_str_split(lang, '.', &lang, &tail);
+    __kmp_str_split(lang, '_', &lang, &tail);
+    english = (strcmp(lang, "en") == 0);
+  }
+
+  KMP_INTERNAL_FREE(lang);
+
+  // Do not try to open English catalog because internal messages are
+  // exact copy of messages in English catalog.
+  if (english) {
+    status = KMP_I18N_ABSENT; // mark catalog as absent so it will not
+    // be re-opened.
+    return;
+  }
+
+  cat = catopen(name, 0);
+  // TODO: Why do we pass 0 in flags?
+  status = (cat == KMP_I18N_NULLCAT ? KMP_I18N_ABSENT : KMP_I18N_OPENED);
+
+  if (status == KMP_I18N_ABSENT) {
+    if (__kmp_generate_warnings > kmp_warnings_low) {
+      // AC: only issue warning in case explicitly asked to
+      int error = errno; // Save errno immediately.
+      char *nlspath = __kmp_env_get("NLSPATH");
+      // A distinct name is used here on purpose: the function-level `lang'
+      // has already been freed above.
+      char *lang_value = __kmp_env_get("LANG");
+
+      // Infinite recursion will not occur -- status is KMP_I18N_ABSENT now, so
+      // __kmp_i18n_catgets() will not try to open catalog, but will return
+      // default message.
+      kmp_msg_t err_code = KMP_ERR(error);
+      __kmp_msg(kmp_ms_warning, KMP_MSG(CantOpenMessageCatalog, name), err_code,
+                KMP_HNT(CheckEnvVar, "NLSPATH", nlspath),
+                KMP_HNT(CheckEnvVar, "LANG", lang_value), __kmp_msg_null);
+      if (__kmp_generate_warnings == kmp_warnings_off) {
+        __kmp_str_free(&err_code.str);
+      }
+
+      KMP_INFORM(WillUseDefaultMessages);
+      KMP_INTERNAL_FREE(nlspath);
+      KMP_INTERNAL_FREE(lang_value);
+    }
+  } else { // status == KMP_I18N_OPENED
+    int section = get_section(kmp_i18n_prp_Version);
+    int number = get_number(kmp_i18n_prp_Version);
+    char const *expected = __kmp_i18n_default_table.sect[section].str[number];
+    // Expected version of the catalog.
+    char const *found = catgets(cat, section, number, NULL);
+    kmp_str_buf_t version; // Actual version of the catalog.
+    __kmp_str_buf_init(&version);
+    // String returned by catgets is invalid after closing catalog, so copy it.
+    // catgets() hands back its last argument (NULL here) when the catalog has
+    // no such message; treat that as an empty version rather than passing a
+    // null pointer to "%s".
+    __kmp_str_buf_print(&version, "%s", found != NULL ? found : "");
+
+    if (strcmp(version.str, expected) != 0) {
+      __kmp_i18n_catclose(); // Close bad catalog.
+      status = KMP_I18N_ABSENT; // And mark it as absent.
+      if (__kmp_generate_warnings > kmp_warnings_low) {
+        // AC: only issue warning in case explicitly asked to
+        // And now print a warning using default messages.
+        // The local must not be called `name': that would hide the file-scope
+        // catalog name and the warning would report "NLSPATH" as the catalog.
+        char const *env_var = "NLSPATH";
+        char *nlspath = __kmp_env_get(env_var);
+        __kmp_msg(kmp_ms_warning,
+                  KMP_MSG(WrongMessageCatalog, name, version.str, expected),
+                  KMP_HNT(CheckEnvVar, env_var, nlspath), __kmp_msg_null);
+        KMP_INFORM(WillUseDefaultMessages);
+        KMP_INTERNAL_FREE(nlspath);
+      } // __kmp_generate_warnings
+    }
+    __kmp_str_buf_free(&version);
+  }
+} // func __kmp_i18n_do_catopen
+
+// Close the message catalog if it is currently open, and put the state back
+// to KMP_I18N_CLOSED in every case (also when the catalog was marked absent).
+void __kmp_i18n_catclose() {
+  bool const is_open = (status == KMP_I18N_OPENED);
+  if (is_open) {
+    KMP_DEBUG_ASSERT(cat != KMP_I18N_NULLCAT);
+    catclose(cat);
+    cat = KMP_I18N_NULLCAT; // Forget the descriptor that is no longer valid.
+  }
+  status = KMP_I18N_CLOSED;
+} // func __kmp_i18n_catclose
+
+// Return the text of message `id'. The catalog is opened on first use; if it
+// is not available, the built-in default message is returned. Ids that fall
+// outside the default table yield "(No message available)". The returned
+// string must not be freed by the caller.
+char const *__kmp_i18n_catgets(kmp_i18n_id_t id) {
+  int const section = get_section(id);
+  int const number = get_number(id);
+
+  // Sections and messages are numbered starting from 1.
+  if (section < 1 || section > __kmp_i18n_default_table.size) {
+    return no_message_available;
+  }
+  if (number < 1 || number > __kmp_i18n_default_table.sect[section].size) {
+    return no_message_available;
+  }
+
+  char const *fallback = __kmp_i18n_default_table.sect[section].str[number];
+  char const *text = NULL;
+
+  if (status == KMP_I18N_CLOSED) {
+    __kmp_i18n_catopen();
+  }
+  if (status == KMP_I18N_OPENED) {
+    // catgets() itself returns `fallback' if the catalog lacks the message.
+    text = catgets(cat, section, number, fallback);
+  }
+  if (text == NULL) {
+    text = fallback;
+  }
+  return text != NULL ? text : no_message_available;
+
+} // func __kmp_i18n_catgets
+
+#endif // KMP_OS_UNIX
+
+/* Windows* OS part. */
+
+#if KMP_OS_WINDOWS
+#define KMP_I18N_OK
+
+#include "kmp_environment.h"
+#include <windows.h>
+
+#define KMP_I18N_NULLCAT NULL // `cat' value meaning "no catalog loaded".
+static HMODULE cat = KMP_I18N_NULLCAT; // Catalog DLL handle. !!! Shall it be volatile?
+static char const *name =
+    (KMP_VERSION_MAJOR == 4 ? "libguide40ui.dll" : "libompui.dll"); // DLL file name.
+
+static kmp_i18n_table_t table = {0, NULL}; // Cache of messages read so far.
+// Messages formatted by FormatMessage() should be freed, but catgets()
+// interface assumes user will not free messages. So we cache all the retrieved
+// messages in the table, which are freed at catclose().
+static UINT const default_code_page = CP_OEMCP; // Used unless KMP_CODEPAGE says otherwise.
+static UINT code_page = default_code_page; // Code page for catalog messages.
+
+static char const *___catgets(kmp_i18n_id_t id); // Uncached read; result is malloc'ed.
+static UINT get_code_page(); // Reads KMP_CODEPAGE.
+static void kmp_i18n_table_free(kmp_i18n_table_t *table); // Empties the cache.
+
+// Select the code page used to convert catalog messages to multibyte strings.
+// The KMP_CODEPAGE environment variable may name it ("ANSI", "OEM", "UTF-8" or
+// "UTF8", "UTF-7" or "UTF7"; letter case is ignored). If the variable is not
+// set or holds anything else, the default code page is returned.
+static UINT get_code_page() {
+
+  static struct {
+    char const *text;
+    UINT page;
+  } const known[] = {{"ANSI", CP_ACP},   {"OEM", CP_OEMCP},
+                     {"UTF-8", CP_UTF8}, {"UTF8", CP_UTF8},
+                     {"UTF-7", CP_UTF7}, {"UTF7", CP_UTF7}};
+
+  UINT result = default_code_page;
+  char *setting = __kmp_env_get("KMP_CODEPAGE");
+  if (setting != NULL) {
+    int const count = sizeof(known) / sizeof(known[0]);
+    for (int i = 0; i < count; ++i) {
+      if (_stricmp(setting, known[i].text) == 0) {
+        result = known[i].page;
+        break;
+      }
+    }
+    // !!! TODO: Issue a warning if the value is not recognized?
+  }
+  KMP_INTERNAL_FREE((void *)setting);
+  return result;
+
+} // func get_code_page
+
+// Free every cached message, every per-section message array and the section
+// array itself, then reset `table' to the empty state {0, NULL}.
+//
+// __kmp_i18n_catgets() addresses sections and messages by 1-based indices and
+// stores the number of entries in `size', so live entries occupy [1, size].
+// Both loops therefore have to include index `size'; stopping at `size - 1'
+// leaks the last section and the last message of every section. The arrays
+// are allocated with `size + 2' zeroed elements, so index `size' is always
+// within bounds, and index 0 is simply an unused (NULL) slot.
+static void kmp_i18n_table_free(kmp_i18n_table_t *table) {
+  if (table->sect != NULL) { // NULL means nothing has been cached yet.
+    for (int s = 0; s <= table->size; ++s) {
+      if (table->sect[s].str != NULL) {
+        for (int m = 0; m <= table->sect[s].size; ++m) {
+          // Free message.
+          KMP_INTERNAL_FREE((void *)table->sect[s].str[m]);
+          table->sect[s].str[m] = NULL;
+        }
+        // Free section itself.
+        KMP_INTERNAL_FREE((void *)table->sect[s].str);
+        table->sect[s].str = NULL;
+      }
+      table->sect[s].size = 0;
+    }
+    KMP_INTERNAL_FREE((void *)table->sect);
+    table->sect = NULL;
+  }
+  table->size = 0;
+} // kmp_i18n_table_free
+
+/* Open the message catalog (Windows* OS implementation).
+
+   Expects `status' to be KMP_I18N_CLOSED and `cat' to be KMP_I18N_NULLCAT on
+   entry. The catalog is a resource DLL looked up as
+   <directory of this module><locale id>/<name> and loaded as a data file.
+   On return `status' is KMP_I18N_OPENED if the DLL was loaded and its version
+   string equals the version of the built-in default messages, and
+   KMP_I18N_ABSENT otherwise (English locale, any failure, or version
+   mismatch). Warnings are issued only if __kmp_generate_warnings is above
+   kmp_warnings_low. */
+void __kmp_i18n_do_catopen() {
+
+  LCID locale_id = GetThreadLocale();
+  WORD lang_id = LANGIDFROMLCID(locale_id);
+  WORD primary_lang_id = PRIMARYLANGID(lang_id);
+  kmp_str_buf_t path;
+
+  KMP_DEBUG_ASSERT(status == KMP_I18N_CLOSED);
+  KMP_DEBUG_ASSERT(cat == KMP_I18N_NULLCAT);
+
+  __kmp_str_buf_init(&path);
+
+  // Do not try to open English catalog because internal messages are exact copy
+  // of messages in English catalog.
+  if (primary_lang_id == LANG_ENGLISH) {
+    status = KMP_I18N_ABSENT; // mark catalog as absent so it will not
+    // be re-opened.
+    goto end;
+  }
+
+  // Construct resource DLL name.
+  /* Simple LoadLibrary( name ) is not suitable due to security issue (see
+     http://www.microsoft.com/technet/security/advisory/2269637.mspx). We have
+     to specify full path to the message catalog.  */
+  {
+    // Get handle of our DLL first.
+    HMODULE handle;
+    BOOL brc = GetModuleHandleEx(
+        GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
+            GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
+        reinterpret_cast<LPCSTR>(&__kmp_i18n_do_catopen), &handle);
+    if (!brc) { // Error occurred.
+      status = KMP_I18N_ABSENT; // mark catalog as absent so it will not be
+      // re-opened.
+      goto end;
+      // TODO: Enable multiple messages (KMP_MSG) to be passed to __kmp_msg; and
+      // print a proper warning.
+    }
+
+    // Now get path to the our DLL. A return value equal to the buffer size
+    // means the path did not fit, so grow the buffer and try again.
+    for (;;) {
+      DWORD drc = GetModuleFileName(handle, path.str, path.size);
+      if (drc == 0) { // Error occurred.
+        status = KMP_I18N_ABSENT;
+        goto end;
+      }
+      if (drc < path.size) {
+        path.used = drc;
+        break;
+      }
+      __kmp_str_buf_reserve(&path, path.size * 2);
+    }
+
+    // Now construct the name of message catalog.
+    kmp_str_fname fname;
+    __kmp_str_fname_init(&fname, path.str);
+    __kmp_str_buf_clear(&path);
+    __kmp_str_buf_print(&path, "%s%lu/%s", fname.dir,
+                        (unsigned long)(locale_id), name);
+    __kmp_str_fname_free(&fname);
+  }
+
+  // For security reasons, use LoadLibraryEx() and load message catalog as a
+  // data file.
+  cat = LoadLibraryEx(path.str, NULL, LOAD_LIBRARY_AS_DATAFILE);
+  status = (cat == KMP_I18N_NULLCAT ? KMP_I18N_ABSENT : KMP_I18N_OPENED);
+
+  if (status == KMP_I18N_ABSENT) {
+    if (__kmp_generate_warnings > kmp_warnings_low) {
+      // AC: only issue warning in case explicitly asked to
+      DWORD error = GetLastError();
+      // Infinite recursion will not occur -- status is KMP_I18N_ABSENT now, so
+      // __kmp_i18n_catgets() will not try to open catalog but will return
+      // default message.
+      /* If message catalog for another architecture found (e.g. OpenMP RTL for
+         IA-32 architecture opens libompui.dll for Intel(R) 64) Windows* OS
+         returns error 193 (ERROR_BAD_EXE_FORMAT). However, FormatMessage fails
+         to return a message for this error, so user will see:
+
+         OMP: Warning #2: Cannot open message catalog "1041\libompui.dll":
+         OMP: System error #193: (No system error message available)
+         OMP: Info #3: Default messages will be used.
+
+         Issue hint in this case so cause of trouble is more understandable. */
+      kmp_msg_t err_code = KMP_SYSERRCODE(error);
+      __kmp_msg(kmp_ms_warning, KMP_MSG(CantOpenMessageCatalog, path.str),
+                err_code, (error == ERROR_BAD_EXE_FORMAT
+                               ? KMP_HNT(BadExeFormat, path.str, KMP_ARCH_STR)
+                               : __kmp_msg_null),
+                __kmp_msg_null);
+      if (__kmp_generate_warnings == kmp_warnings_off) {
+        __kmp_str_free(&err_code.str);
+      }
+      KMP_INFORM(WillUseDefaultMessages);
+    }
+  } else { // status == KMP_I18N_OPENED
+
+    int section = get_section(kmp_i18n_prp_Version);
+    int number = get_number(kmp_i18n_prp_Version);
+    char const *expected = __kmp_i18n_default_table.sect[section].str[number];
+    // ___catgets() returns a freshly allocated string (it does not go through
+    // the message cache), or NULL if the catalog has no version message.
+    char const *found = ___catgets(kmp_i18n_prp_Version);
+    kmp_str_buf_t version; // Actual version of the catalog.
+    __kmp_str_buf_init(&version);
+    // Copy the version into the buffer (an absent one counts as empty, which
+    // never matches) and release the original right away so it is not leaked.
+    __kmp_str_buf_print(&version, "%s", found != NULL ? found : "");
+    KMP_INTERNAL_FREE((void *)found);
+    if (strcmp(version.str, expected) != 0) {
+      // Close bad catalog.
+      __kmp_i18n_catclose();
+      status = KMP_I18N_ABSENT; // And mark it as absent.
+      if (__kmp_generate_warnings > kmp_warnings_low) {
+        // And now print a warning using default messages.
+        __kmp_msg(kmp_ms_warning,
+                  KMP_MSG(WrongMessageCatalog, path.str, version.str, expected),
+                  __kmp_msg_null);
+        KMP_INFORM(WillUseDefaultMessages);
+      } // __kmp_generate_warnings
+    }
+    __kmp_str_buf_free(&version);
+  }
+  code_page = get_code_page();
+
+end:
+  __kmp_str_buf_free(&path);
+  return;
+} // func __kmp_i18n_do_catopen
+
+// Release the catalog DLL together with all cached messages if the catalog is
+// open. The code page and the state are reset to their initial values in
+// every case.
+void __kmp_i18n_catclose() {
+  bool const is_open = (status == KMP_I18N_OPENED);
+  if (is_open) {
+    KMP_DEBUG_ASSERT(cat != KMP_I18N_NULLCAT);
+    kmp_i18n_table_free(&table); // Cached messages came from this catalog.
+    FreeLibrary(cat);
+    cat = KMP_I18N_NULLCAT;
+  }
+  code_page = default_code_page;
+  status = KMP_I18N_CLOSED;
+} // func __kmp_i18n_catclose
+
+/* We use FormatMessage() to get strings from catalog, get system error
+   messages, etc. FormatMessage() tends to return Windows* OS-style
+   end-of-lines, "\r\n". When string is printed, printf() also replaces all the
+   occurrences of "\n" with "\r\n" (again!), so sequences like "\r\r\r\n"
+   appear in output. It is not too good.
+
+   Additional mess comes from message catalog: Our catalog source en_US.mc file
+   (generated by message-converter.pl) contains only "\n" characters, but
+   en_US_msg_1033.bin file (produced by mc.exe) may contain "\r\n" or just "\n".
+   This mess goes from en_US_msg_1033.bin file to message catalog,
+   libompui.dll. For example, message
+
+   Error
+
+   (there is "\n" at the end) is compiled by mc.exe to "Error\r\n", while
+
+   OMP: Error %1!d!: %2!s!\n
+
+   (there is "\n" at the end as well) is compiled to "OMP: Error %1!d!:
+   %2!s!\r\n\n".
+
+   Thus, stripping all "\r" normalizes string and returns it to canonical form,
+   so printf() will produce correct end-of-line sequences.
+
+   ___strip_crs() serves for this purpose: it removes all the occurrences of
+   "\r" in-place and returns new length of string.  */
+static int ___strip_crs(char *str) {
+  char *dst = str; // Where the next kept character goes.
+  for (char const *src = str; *src != 0; ++src) {
+    if (*src != '\r') {
+      *dst = *src;
+      ++dst;
+    }
+  }
+  *dst = 0; // Terminate the (possibly shortened) string.
+  return (int)(dst - str); // New length, terminating null not counted.
+} // func ___strip_crs
+
+// Read message `id' straight from the catalog DLL (no caching).
+//
+// The message is fetched as a wide string with FormatMessageW(), converted to
+// a multibyte string using `code_page', stripped of all "\r" characters and of
+// one trailing "\n". Returns a string allocated with KMP_INTERNAL_MALLOC(),
+// which the caller owns, or NULL if the message cannot be retrieved, converted
+// or allocated.
+static char const *___catgets(kmp_i18n_id_t id) {
+
+  char *result = NULL;
+  PVOID addr = NULL;
+  wchar_t *wmsg = NULL;
+  DWORD wlen = 0;
+  char *msg = NULL;
+  int len = 0;
+  int rc;
+
+  KMP_DEBUG_ASSERT(cat != KMP_I18N_NULLCAT);
+  wlen = // wlen does *not* include terminating null.
+      FormatMessageW(FORMAT_MESSAGE_ALLOCATE_BUFFER |
+                         FORMAT_MESSAGE_FROM_HMODULE |
+                         FORMAT_MESSAGE_IGNORE_INSERTS,
+                     cat, id,
+                     0, // LangId
+                     (LPWSTR)&addr,
+                     0, // Size in elements, not in bytes.
+                     NULL);
+  if (wlen <= 0) {
+    goto end;
+  }
+  wmsg = (wchar_t *)addr; // Warning: wmsg may be not nul-terminated!
+
+  // Calculate length of multibyte message.
+  // Since wlen does not include terminating null, len does not include it also.
+  len = WideCharToMultiByte(code_page,
+                            0, // Flags.
+                            wmsg, wlen, // Wide buffer and size.
+                            NULL, 0, // Buffer and size.
+                            NULL, NULL // Default char and used default char.
+                            );
+  if (len <= 0) {
+    goto end;
+  }
+
+  // Allocate memory (one extra byte for the terminating null).
+  msg = (char *)KMP_INTERNAL_MALLOC(len + 1);
+  if (msg == NULL) {
+    // Out of memory: report "no message" so that the caller falls back to the
+    // default text instead of writing through a null pointer below.
+    goto end;
+  }
+
+  // Convert wide message to multibyte one.
+  rc = WideCharToMultiByte(code_page,
+                           0, // Flags.
+                           wmsg, wlen, // Wide buffer and size.
+                           msg, len, // Buffer and size.
+                           NULL, NULL // Default char and used default char.
+                           );
+  if (rc <= 0 || rc > len) {
+    goto end;
+  }
+  KMP_DEBUG_ASSERT(rc == len);
+  len = rc;
+  msg[len] = 0; // Put terminating null to the end.
+
+  // Stripping all "\r" before stripping last end-of-line simplifies the task.
+  len = ___strip_crs(msg);
+
+  // Every message in catalog is terminated with "\n". Strip it.
+  if (len >= 1 && msg[len - 1] == '\n') {
+    --len;
+    msg[len] = 0;
+  }
+
+  // Everything looks ok. Hand the buffer over to the caller.
+  result = msg;
+  msg = NULL;
+
+end:
+
+  // `msg' is still non-NULL only on a failure path.
+  if (msg != NULL) {
+    KMP_INTERNAL_FREE(msg);
+  }
+  // The wide buffer was allocated by FormatMessageW().
+  if (wmsg != NULL) {
+    LocalFree(wmsg);
+  }
+
+  return result;
+
+} // ___catgets
+
+// Return the text of message `id'. The catalog is opened on first use.
+// Messages read from the catalog are cached in `table' (they are freed by
+// __kmp_i18n_catclose()), so the returned string must not be freed by the
+// caller. If the catalog is not loaded, the message is missing from it, or the
+// cache cannot be allocated, the built-in default message is returned. Ids
+// that fall outside the default table yield "(No message available)".
+char const *__kmp_i18n_catgets(kmp_i18n_id_t id) {
+
+  int section = get_section(id);
+  int number = get_number(id);
+  char const *message = NULL;
+
+  // Sections and messages are numbered starting from 1.
+  if (1 <= section && section <= __kmp_i18n_default_table.size) {
+    if (1 <= number && number <= __kmp_i18n_default_table.sect[section].size) {
+      if (status == KMP_I18N_CLOSED) {
+        __kmp_i18n_catopen();
+      }
+      if (cat != KMP_I18N_NULLCAT) {
+        // Lazily create the array of sections. `size' is set only if the
+        // allocation succeeded, so a non-zero size implies a valid array.
+        if (table.size == 0) {
+          kmp_i18n_section_t *sections =
+              (kmp_i18n_section_t *)KMP_INTERNAL_CALLOC(
+                  (__kmp_i18n_default_table.size + 2),
+                  sizeof(kmp_i18n_section_t));
+          if (sections != NULL) {
+            table.sect = sections;
+            table.size = __kmp_i18n_default_table.size;
+          }
+        }
+        // Lazily create the array of messages of this section, same rule.
+        if (table.size != 0 && table.sect[section].size == 0) {
+          char const **strings = (const char **)KMP_INTERNAL_CALLOC(
+              __kmp_i18n_default_table.sect[section].size + 2,
+              sizeof(char const *));
+          if (strings != NULL) {
+            table.sect[section].str = strings;
+            table.sect[section].size =
+                __kmp_i18n_default_table.sect[section].size;
+          }
+        }
+        // Use the cache only if both arrays exist.
+        if (table.size != 0 && table.sect[section].size != 0) {
+          if (table.sect[section].str[number] == NULL) {
+            table.sect[section].str[number] = ___catgets(id);
+          }
+          message = table.sect[section].str[number];
+        }
+      }
+      if (message == NULL) {
+        // Catalog is not opened, message is not found, or the cache could not
+        // be allocated: return default message.
+        message = __kmp_i18n_default_table.sect[section].str[number];
+      }
+    }
+  }
+  if (message == NULL) {
+    message = no_message_available;
+  }
+  return message;
+
+} // func __kmp_i18n_catgets
+
+#endif // KMP_OS_WINDOWS
+
+// -----------------------------------------------------------------------------
+
+#ifndef KMP_I18N_OK
+#error I18n support is not implemented for this OS.
+#endif // KMP_I18N_OK
+
+// -----------------------------------------------------------------------------
+
+// Append the text of every message to `buffer', one "*** Set #N ***" group per
+// id range, and then print the whole contents of `buffer' with __kmp_printf().
+void __kmp_i18n_dump_catalog(kmp_str_buf_t *buffer) {
+
+  // Each pair holds the two enumerators bracketing one group of ids; the
+  // bracketing values themselves are not dumped.
+  static kmp_i18n_id_t const bounds[][2] = {
+      {kmp_i18n_prp_first, kmp_i18n_prp_last},
+      {kmp_i18n_str_first, kmp_i18n_str_last},
+      {kmp_i18n_fmt_first, kmp_i18n_fmt_last},
+      {kmp_i18n_msg_first, kmp_i18n_msg_last},
+      {kmp_i18n_hnt_first, kmp_i18n_hnt_last}}; // bounds
+
+  int const num_of_sets = sizeof(bounds) / sizeof(bounds[0]);
+
+  for (int set = 0; set < num_of_sets; ++set) {
+    int const stop = bounds[set][1];
+    __kmp_str_buf_print(buffer, "*** Set #%d ***\n", set + 1);
+    for (int value = bounds[set][0] + 1; value < stop; ++value) {
+      kmp_i18n_id_t const id = (kmp_i18n_id_t)value;
+      __kmp_str_buf_print(buffer, "%d: <<%s>>\n", id, __kmp_i18n_catgets(id));
+    }
+  }
+
+  __kmp_printf("%s", buffer->str);
+
+} // __kmp_i18n_dump_catalog
+
+// -----------------------------------------------------------------------------
+// Format message `id_arg' with the given arguments and return it as kmp_msg_t.
+//
+// The format string is taken from the message catalog (or from the default
+// messages). In the result `type' is the upper 16 bits of the id, `num' is the
+// lower 16 bits, and `str'/`len' describe the formatted text, which is
+// detached from the local buffer and therefore owned by the caller. If the
+// text cannot be formatted on Windows* OS, the result carries an empty string.
+kmp_msg_t __kmp_msg_format(unsigned id_arg, ...) {
+
+  kmp_msg_t msg;
+  va_list args;
+  kmp_str_buf_t buffer;
+  __kmp_str_buf_init(&buffer);
+
+  va_start(args, id_arg);
+
+  // We use unsigned for the ID argument and explicitly cast it here to the
+  // right enumerator because variadic functions are not compatible with
+  // default promotions.
+  kmp_i18n_id_t id = (kmp_i18n_id_t)id_arg;
+
+#if KMP_OS_UNIX
+  // On Linux* OS and OS X*, printf() family functions process parameter
+  // numbers, for example:  "%2$s %1$s".
+  __kmp_str_buf_vprint(&buffer, __kmp_i18n_catgets(id), args);
+#elif KMP_OS_WINDOWS
+  // On Windows, printf() family functions do not recognize GNU style
+  // parameter numbers, so we have to use FormatMessage() instead. It recognizes
+  // parameter numbers, e. g.:  "%2!s! %1!s!".
+  {
+    LPTSTR str = NULL;
+    DWORD rc =
+        FormatMessage(FORMAT_MESSAGE_FROM_STRING | FORMAT_MESSAGE_ALLOCATE_BUFFER,
+                      __kmp_i18n_catgets(id), 0, 0, (LPTSTR)(&str), 0, &args);
+    // On failure FormatMessage() returns 0 and leaves `str' NULL; touching it
+    // would dereference a null pointer, so the text is simply left empty.
+    if (rc != 0 && str != NULL) {
+      int len = ___strip_crs(str);
+      __kmp_str_buf_cat(&buffer, str, len);
+    }
+    if (str != NULL) {
+      LocalFree(str);
+    }
+  }
+#else
+#error
+#endif
+  va_end(args);
+  __kmp_str_buf_detach(&buffer); // The caller becomes the owner of the text.
+
+  msg.type = (kmp_msg_type_t)(id >> 16);
+  msg.num = id & 0xFFFF;
+  msg.str = buffer.str;
+  msg.len = buffer.used;
+
+  return msg;
+
+} // __kmp_msg_format
+
+// -----------------------------------------------------------------------------
+static char *sys_error(int err) {
+
+  char *message = NULL;
+
+#if KMP_OS_WINDOWS
+
+  LPVOID buffer = NULL;
+  int len;
+  DWORD rc;
+  rc = FormatMessage(
+      FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM, NULL, err,
+      MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), // Default language.
+      (LPTSTR)&buffer, 0, NULL);
+  if (rc > 0) {
+    // Message formatted. Copy it (so we can free it later with normal free().
+    message = __kmp_str_format("%s", (char *)buffer);
+    len = ___strip_crs(message); // Delete carriage returns if any.
+    // Strip trailing newlines.
+    while (len > 0 && message[len - 1] == '\n') {
+      --len;
+    }
+    message[len] = 0;
+  } else {
+    // FormatMessage() failed to format system error message. GetLastError()
+    // would give us error code, which we would convert to message... this it
+    // dangerous recursion, which cannot clarify original error, so we will not
+    // even start it.
+  }
+  if (buffer != NULL) {
+    LocalFree(buffer);
+  }
+
+#else // Non-Windows* OS: Linux* OS or OS X*
+
+/* There are 2 incompatible versions of strerror_r:
+
+   char * strerror_r( int, char *, size_t );  // GNU version
+   int    strerror_r( int, char *, size_t );  // XSI version
+*/
+
+#if (defined(__GLIBC__) && defined(_GNU_SOURCE)) ||                            \
+    (defined(__BIONIC__) && defined(_GNU_SOURCE) &&                            \
+     __ANDROID_API__ >= __ANDROID_API_M__)
+  // GNU version of strerror_r.
+
+  char buffer[2048];
+  char *const err_msg = strerror_r(err, buffer, sizeof(buffer));
+  // Do not eliminate this assignment to temporary variable, otherwise compiler
+  // would not issue warning if strerror_r() returns `int' instead of expected
+  // `char *'.
+  message = __kmp_str_format("%s", err_msg);
+
+#else // OS X*, FreeBSD* etc.
+  // XSI version of strerror_r.
+  int size = 2048;
+  char *buffer = (char *)KMP_INTERNAL_MALLOC(size);
+  int rc;
+  if (buffer == NULL) {
+    KMP_FATAL(MemoryAllocFailed);
+  }
+  rc = strerror_r(err, buffer, size);
+  if (rc == -1) {
+    rc = errno; // XSI version sets errno.
+  }
+  while (rc == ERANGE) { // ERANGE means the buffer is too small.
+    KMP_INTERNAL_FREE(buffer);
+    size *= 2;
+    buffer = (char *)KMP_INTERNAL_MALLOC(size);
+    if (buffer == NULL) {
+      KMP_FATAL(MemoryAllocFailed);
+    }
+    rc = strerror_r(err, buffer, size);
+    if (rc == -1) {
+      rc = errno; // XSI version sets errno.
+    }
+  }
+  if (rc == 0) {
+    message = buffer;
+  } else { // Buffer is unused. Free it.
+    KMP_INTERNAL_FREE(buffer);
+  }
+
+#endif
+
+#endif /* KMP_OS_WINDOWS */
+
+  if (message == NULL) {
+    // TODO: I18n this message.
+    message = __kmp_str_format("%s", "(No system error message available)");
+  }
+  return message;
+} // sys_error
+
+// -----------------------------------------------------------------------------
+kmp_msg_t __kmp_msg_error_code(int code) {
+
+  kmp_msg_t msg;
+  msg.type = kmp_mt_syserr;
+  msg.num = code;
+  msg.str = sys_error(code);
+  msg.len = KMP_STRLEN(msg.str);
+  return msg;
+
+} // __kmp_msg_error_code
+
+// -----------------------------------------------------------------------------
+kmp_msg_t __kmp_msg_error_mesg(char const *mesg) {
+
+  kmp_msg_t msg;
+  msg.type = kmp_mt_syserr;
+  msg.num = 0;
+  msg.str = __kmp_str_format("%s", mesg);
+  msg.len = KMP_STRLEN(msg.str);
+  return msg;
+
+} // __kmp_msg_error_mesg
+
+// -----------------------------------------------------------------------------
+// Format the primary `message' and the additional messages found in `args'
+// (kmp_msg_t values, terminated by __kmp_msg_null), then print the result
+// with a single __kmp_printf() call.
+//
+// The primary message is formatted according to `severity'; additional
+// messages must be hints or system errors. The `str' of every message
+// consumed here is released with __kmp_str_free(). Nothing is formatted,
+// printed or released if the severity is not fatal and warnings are switched
+// off.
+void __kmp_msg(kmp_msg_severity_t severity, kmp_msg_t message, va_list args) {
+  kmp_i18n_id_t format; // format identifier
+  kmp_msg_t fmsg; // formatted message
+  kmp_str_buf_t buffer;
+
+  if (severity != kmp_ms_fatal && __kmp_generate_warnings == kmp_warnings_off)
+    return; // no reason to form a string in order to not print it
+
+  __kmp_str_buf_init(&buffer);
+
+  // Format the primary message.
+  switch (severity) {
+  case kmp_ms_inform: {
+    format = kmp_i18n_fmt_Info;
+  } break;
+  case kmp_ms_warning: {
+    format = kmp_i18n_fmt_Warning;
+  } break;
+  case kmp_ms_fatal: {
+    format = kmp_i18n_fmt_Fatal;
+  } break;
+  default: {
+    KMP_DEBUG_ASSERT(0);
+    // The assertion may be compiled out; do not go on with an uninitialized
+    // format, report the text as a warning so that it is not lost.
+    format = kmp_i18n_fmt_Warning;
+  } break;
+  }
+  fmsg = __kmp_msg_format(format, message.num, message.str);
+  __kmp_str_free(&message.str);
+  __kmp_str_buf_cat(&buffer, fmsg.str, fmsg.len);
+  __kmp_str_free(&fmsg.str);
+
+  // Format other messages.
+  for (;;) {
+    bool known_type = true;
+    message = va_arg(args, kmp_msg_t);
+    if (message.type == kmp_mt_dummy && message.str == NULL) {
+      break; // Terminator (__kmp_msg_null) reached.
+    }
+    switch (message.type) {
+    case kmp_mt_hint: {
+      format = kmp_i18n_fmt_Hint;
+      // we cannot skip %1$ and only use %2$ to print the message without the
+      // number
+      fmsg = __kmp_msg_format(format, message.str);
+    } break;
+    case kmp_mt_syserr: {
+      format = kmp_i18n_fmt_SysErr;
+      fmsg = __kmp_msg_format(format, message.num, message.str);
+    } break;
+    default: {
+      KMP_DEBUG_ASSERT(0);
+      // Nothing has been formatted for this message, so `fmsg' still refers
+      // to the previous, already released text; it must not be used again.
+      known_type = false;
+    } break;
+    }
+    __kmp_str_free(&message.str);
+    if (!known_type) {
+      continue;
+    }
+    __kmp_str_buf_cat(&buffer, fmsg.str, fmsg.len);
+    __kmp_str_free(&fmsg.str);
+  }
+
+  // Print formatted messages.
+  // Note: the output used to be guarded by the bootstrap lock in order to
+  // prevent multiple fatal errors on the same problem, but (GEH) that lock was
+  // causing tests to hang on OS X*, so no lock is taken here.
+  __kmp_printf("%s", buffer.str);
+  __kmp_str_buf_free(&buffer);
+
+} // __kmp_msg
+
+// Variadic front end: collects the messages following `message' (the list
+// ends with __kmp_msg_null) and passes them on to the va_list flavour.
+void __kmp_msg(kmp_msg_severity_t severity, kmp_msg_t message, ...) {
+  va_list rest;
+  va_start(rest, message);
+  __kmp_msg(severity, message, rest);
+  va_end(rest);
+} // __kmp_msg
+
+// Print `message' and the messages following it (the list ends with
+// __kmp_msg_null) with fatal severity, then abort the process.
+void __kmp_fatal(kmp_msg_t message, ...) {
+  va_list rest;
+  va_start(rest, message);
+  __kmp_msg(kmp_ms_fatal, message, rest);
+  va_end(rest);
+#if KMP_OS_WINDOWS
+  // Wait a little so that the message has a chance to appear before reaping.
+  __kmp_thread_sleep(500);
+#endif
+  __kmp_abort_process();
+} // __kmp_fatal
+
+// end of file //
diff --git a/final/runtime/src/kmp_i18n.h b/final/runtime/src/kmp_i18n.h
new file mode 100644
index 0000000..9d79a21
--- /dev/null
+++ b/final/runtime/src/kmp_i18n.h
@@ -0,0 +1,178 @@
+/*
+ * kmp_i18n.h
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_I18N_H
+#define KMP_I18N_H
+
+#include "kmp_str.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/* kmp_i18n_id.inc defines kmp_i18n_id_t type. It is an enumeration with
+   identifiers of all the messages in the catalog. There is one special
+   identifier: kmp_i18n_null, which denotes absence of message. */
+#include "kmp_i18n_id.inc" // Generated file. Do not edit it manually.
+
+/* Low-level functions handling message catalog. __kmp_i18n_catopen() opens the
+   message catalog, __kmp_i18n_catclose() closes it. Explicit opening is not
+   required: if message catalog is not yet open, __kmp_i18n_catgets() will open
+   it implicitly. However, catalog should be explicitly closed, otherwise
+   resources (memory, handles) may leak.
+
+   __kmp_i18n_catgets() returns read-only string. It should not be freed.
+
+   KMP_I18N_STR macro simplifies access to strings in message catalog a bit.
+   Following two lines are equivalent:
+
+   __kmp_i18n_catgets( kmp_i18n_str_Warning )
+   KMP_I18N_STR( Warning )
+*/
+
+void __kmp_i18n_catopen();
+void __kmp_i18n_catclose();
+char const *__kmp_i18n_catgets(kmp_i18n_id_t id);
+
+#define KMP_I18N_STR(id) __kmp_i18n_catgets(kmp_i18n_str_##id)
+
+/* High-level interface for printing strings targeted to the user.
+
+   All the strings are divided into 3 types:
+   * messages,
+   * hints,
+   * system errors.
+
+   There are 3 kinds of message severities:
+   * informational messages,
+   * warnings (non-fatal errors),
+   * fatal errors.
+
+   For example:
+     OMP: Warning #2: Cannot open message catalog "libguide.cat":   (1)
+     OMP: System error #2: No such file or directory                (2)
+     OMP: Hint: Please check NLSPATH environment variable.          (3)
+     OMP: Info #3: Default messages will be used.                   (4)
+
+   where
+   (1) is a message of warning severity,
+   (2) is a system error that caused the previous warning,
+   (3) is a hint for the user how to fix the problem,
+   (4) is a message of informational severity.
+
+   Usage in complex cases (message is accompanied with hints and system errors):
+
+   int error = errno; // We need to save errno immediately, because it may
+                      // be changed.
+   __kmp_msg(
+       kmp_ms_warning,                        // Severity
+       KMP_MSG( CantOpenMessageCatalog, name ), // Primary message
+       KMP_ERR( error ),                      // System error
+       KMP_HNT( CheckNLSPATH ),               // Hint
+       __kmp_msg_null                         // Variadic argument list finisher
+   );
+
+   Usage in simple cases (just a message, no system errors or hints):
+   KMP_INFORM( WillUseDefaultMessages );
+   KMP_WARNING( CantOpenMessageCatalog, name );
+   KMP_FATAL( StackOverlap );
+   KMP_SYSFAIL( "pthread_create", status );
+   KMP_CHECK_SYSFAIL( "pthread_create", status );
+   KMP_CHECK_SYSFAIL_ERRNO( "gettimeofday", status );
+*/
+
+// Kind of a single message record (see struct kmp_msg).
+enum kmp_msg_type {
+  // Special type for internal purposes.
+  kmp_mt_dummy = 0,
+  // Primary OpenMP message, could be information, warning, or fatal.
+  kmp_mt_mesg = 4,
+  // Hint to the user.
+  kmp_mt_hint = 5,
+  // System error message.
+  kmp_mt_syserr = -1
+}; // enum kmp_msg_type
+typedef enum kmp_msg_type kmp_msg_type_t;
+
+// One message record, passed by value to __kmp_msg() / __kmp_fatal().
+struct kmp_msg {
+  kmp_msg_type_t type; // Kind of record (see enum kmp_msg_type).
+  int num; // Number handed to the format together with str.
+  char *str; // Message text; __kmp_msg() releases it with __kmp_str_free().
+  int len; // Presumably the length of str -- TODO confirm.
+}; // struct kmp_msg
+typedef struct kmp_msg kmp_msg_t;
+
+// Special message to denote the end of variadic list of arguments.
+extern kmp_msg_t __kmp_msg_null;
+
+// Helper functions. Creates messages either from message catalog or from
+// system. Note: these functions allocate memory. You should pass created
+// messages to __kmp_msg() function, it will print messages and destroy them.
+//   __kmp_msg_format()     - id_arg identifies the format/catalog entry; the
+//                            variadic arguments presumably fill in its
+//                            placeholders (TODO confirm against kmp_i18n.cpp).
+//   __kmp_msg_error_code() - takes a numeric system error code.
+//   __kmp_msg_error_mesg() - takes an already available error text.
+kmp_msg_t __kmp_msg_format(unsigned id_arg, ...);
+kmp_msg_t __kmp_msg_error_code(int code);
+kmp_msg_t __kmp_msg_error_mesg(char const *mesg);
+
+// Helper macros to make calls shorter.
+// KMP_MSG( Name, args... ) pastes the prefix kmp_i18n_msg_ onto the first
+// token of its arguments, i.e. it expands to
+// __kmp_msg_format( kmp_i18n_msg_Name, args... ). KMP_HNT does the same with
+// the kmp_i18n_hnt_ prefix. Because of the token pasting, the first argument
+// must be a bare identifier suffix, not an expression.
+#define KMP_MSG(...) __kmp_msg_format(kmp_i18n_msg_##__VA_ARGS__)
+#define KMP_HNT(...) __kmp_msg_format(kmp_i18n_hnt_##__VA_ARGS__)
+// Wrap a system error code / error text into a message.
+#define KMP_SYSERRCODE(code) __kmp_msg_error_code(code)
+#define KMP_SYSERRMESG(mesg) __kmp_msg_error_mesg(mesg)
+// Short alias for KMP_SYSERRCODE.
+#define KMP_ERR KMP_SYSERRCODE
+
+// Message severity.
+enum kmp_msg_severity {
+  // Just information for the user.
+  kmp_ms_inform,
+  // Non-fatal error, execution continues.
+  kmp_ms_warning,
+  // Fatal error, program aborts.
+  kmp_ms_fatal
+}; // enum kmp_msg_severity
+typedef enum kmp_msg_severity kmp_msg_severity_t;
+
+// Primary function for printing messages for the user. The first message is
+// mandatory. Any number of system errors and hints may be specified. Argument
+// list must be finished with __kmp_msg_null.
+void __kmp_msg(kmp_msg_severity_t severity, kmp_msg_t message, ...);
+// Same argument convention as __kmp_msg(), with the severity fixed to
+// kmp_ms_fatal; aborts the process after printing and does not return.
+KMP_NORETURN void __kmp_fatal(kmp_msg_t message, ...);
+
+// Helper macros to make calls shorter in simple cases.
+// Each one builds a single primary message with KMP_MSG() from its arguments
+// and terminates the argument list with __kmp_msg_null.
+#define KMP_INFORM(...)                                                        \
+  __kmp_msg(kmp_ms_inform, KMP_MSG(__VA_ARGS__), __kmp_msg_null)
+#define KMP_WARNING(...)                                                       \
+  __kmp_msg(kmp_ms_warning, KMP_MSG(__VA_ARGS__), __kmp_msg_null)
+#define KMP_FATAL(...) __kmp_fatal(KMP_MSG(__VA_ARGS__), __kmp_msg_null)
+// Fatal error "FunctionError" for `func`, accompanied by the system error
+// message built from the error code `error`.
+#define KMP_SYSFAIL(func, error)                                               \
+  __kmp_fatal(KMP_MSG(FunctionError, func), KMP_SYSERRCODE(error),             \
+              __kmp_msg_null)
+
+// Check error, if not zero, generate fatal error message.
+// NOTE(review): `error` is expanded twice (once in the test, once inside
+// KMP_SYSFAIL), so pass an expression without side effects. The expansion is
+// a bare { } block rather than do { } while (0), so a trailing ';' after the
+// macro yields an extra empty statement.
+#define KMP_CHECK_SYSFAIL(func, error)                                         \
+  {                                                                            \
+    if (error) {                                                               \
+      KMP_SYSFAIL(func, error);                                                \
+    }                                                                          \
+  }
+
+// Check status, if not zero, generate fatal error message using errno.
+// `status` is parenthesized so that an argument containing operators of lower
+// precedence than != (e.g. `a | b`) is compared as a whole. errno is copied
+// into a local right after the test, before anything else can change it.
+// The expansion is kept as a bare { } block so existing call sites (with or
+// without a trailing ';') continue to compile.
+#define KMP_CHECK_SYSFAIL_ERRNO(func, status)                                  \
+  {                                                                            \
+    if ((status) != 0) {                                                       \
+      int error = errno;                                                       \
+      KMP_SYSFAIL(func, error);                                                \
+    }                                                                          \
+  }
+
+#ifdef KMP_DEBUG
+// Debug builds only. Presumably writes the contents of the message catalog
+// into `buffer` -- TODO confirm against the definition in kmp_i18n.cpp.
+void __kmp_i18n_dump_catalog(kmp_str_buf_t *buffer);
+#endif // KMP_DEBUG
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // KMP_I18N_H
+
+// end of file //
diff --git a/final/runtime/src/kmp_import.cpp b/final/runtime/src/kmp_import.cpp
new file mode 100644
index 0000000..39d841d
--- /dev/null
+++ b/final/runtime/src/kmp_import.cpp
@@ -0,0 +1,33 @@
+/*
+ * kmp_import.cpp
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+/* Object generated from this source file is linked to Windows* OS DLL import
+   library (libompmd.lib) only! It is not a part of regular static or dynamic
+   OpenMP RTL. Any code that just needs to go in the libompmd.lib (but not in
+   libompmt.lib and libompmd.dll) should be placed in this file. */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* These symbols are required for mutual exclusion with Microsoft OpenMP RTL
+   (and compatibility with MS Compiler). They are plain int definitions with
+   C linkage; only their names matter to the linker. */
+
+int _You_must_link_with_exactly_one_OpenMP_library = 1;
+int _You_must_link_with_Intel_OpenMP_library = 1;
+int _You_must_link_with_Microsoft_OpenMP_library = 1;
+
+#ifdef __cplusplus
+}
+#endif
+
+// end of file //
diff --git a/final/runtime/src/kmp_io.cpp b/final/runtime/src/kmp_io.cpp
new file mode 100644
index 0000000..4e6ea6a
--- /dev/null
+++ b/final/runtime/src/kmp_io.cpp
@@ -0,0 +1,229 @@
+/*
+ * kmp_io.cpp -- RTL IO
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifndef __ABSOFT_WIN
+#include <sys/types.h>
+#endif
+
+#include "kmp.h" // KMP_GTID_DNE, __kmp_debug_buf, etc
+#include "kmp_io.h"
+#include "kmp_lock.h"
+#include "kmp_os.h"
+#include "kmp_str.h"
+
+#if KMP_OS_WINDOWS
+#if KMP_MSVC_COMPAT
+#pragma warning(push)
+#pragma warning(disable : 271 310)
+#endif
+#include <windows.h>
+#if KMP_MSVC_COMPAT
+#pragma warning(pop)
+#endif
+#endif
+
+/* ------------------------------------------------------------------------ */
+
+/* Definitions of the two bootstrap locks declared in kmp_io.h.
+   __kmp_stdio_lock is taken by __kmp_printf()/__kmp_fprintf() around
+   __kmp_vprintf(); __kmp_console_lock guards the one-time console setup in
+   __kmp_redirect_output() (Windows only). */
+kmp_bootstrap_lock_t __kmp_stdio_lock = KMP_BOOTSTRAP_LOCK_INITIALIZER(
+    __kmp_stdio_lock); /* Control stdio functions */
+kmp_bootstrap_lock_t __kmp_console_lock = KMP_BOOTSTRAP_LOCK_INITIALIZER(
+    __kmp_console_lock); /* Control console initialization */
+
+#if KMP_OS_WINDOWS
+
+// Windows-only console state, set up by __kmp_redirect_output() and torn down
+// by __kmp_close_console().
+static HANDLE __kmp_stdout = NULL; // Cached stdout handle, NULL if unavailable.
+static HANDLE __kmp_stderr = NULL; // Cached stderr handle, NULL if unavailable.
+static int __kmp_console_exists = FALSE; // TRUE once the setup has run.
+static kmp_str_buf_t __kmp_console_buf; // Scratch buffer for formatted output.
+
+// Returns non-zero if the process appears to have a console, judged by
+// whether the console title can be queried.
+static int is_console(void) {
+  char title[128];
+  // GetConsoleTitle does not reset last error in case of success or short
+  // buffer, so clear it explicitly before the call.
+  SetLastError(0);
+  DWORD title_len = GetConsoleTitle(title, sizeof(title));
+  if (title_len > 0) {
+    // Title obtained: a console exists.
+    return 1;
+  }
+  // Getting the title failed. Last error == 0 means the buffer was too short
+  // (we suppose console exists). In Window applications we usually get
+  // error 6 (invalid handle).
+  return GetLastError() == 0;
+}
+
+/* Forgets the cached console handles and releases the console scratch buffer.
+   Does nothing unless __kmp_redirect_output() has set the console up. */
+void __kmp_close_console(void) {
+  /* TODO only close if a window was opened */
+  if (!__kmp_console_exists) {
+    return;
+  }
+  __kmp_stdout = NULL;
+  __kmp_stderr = NULL;
+  __kmp_str_buf_free(&__kmp_console_buf);
+  __kmp_console_exists = FALSE;
+}
+
+/* For windows, call this before stdout, stderr, or stdin are used.
+   Calls AllocConsole() and caches the standard output/error handles in
+   __kmp_stdout/__kmp_stderr (NULL when a handle cannot be obtained). The work
+   is done under __kmp_console_lock and is skipped when __kmp_console_exists
+   is already set. */
+static void __kmp_redirect_output(void) {
+  __kmp_acquire_bootstrap_lock(&__kmp_console_lock);
+
+  if (!__kmp_console_exists) {
+    HANDLE ho;
+    HANDLE he;
+
+    __kmp_str_buf_init(&__kmp_console_buf);
+
+    AllocConsole();
+    // We do not check the result of AllocConsole because
+    //  1. the call is harmless
+    //  2. it is not clear how to communicate failure
+    //  3. we will detect failure later when we get handle(s)
+
+    ho = GetStdHandle(STD_OUTPUT_HANDLE);
+    if (ho == INVALID_HANDLE_VALUE || ho == NULL) {
+
+      // TODO: output error somehow (maybe message box)
+      __kmp_stdout = NULL;
+
+    } else {
+
+      __kmp_stdout = ho; // temporary code, need new global for ho
+    }
+    he = GetStdHandle(STD_ERROR_HANDLE);
+    if (he == INVALID_HANDLE_VALUE || he == NULL) {
+
+      // TODO: output error somehow (maybe message box)
+      __kmp_stderr = NULL;
+
+    } else {
+
+      __kmp_stderr = he; // temporary code, need new global
+    }
+    // Marked as set up even if a handle is missing; __kmp_vprintf() checks
+    // the handles for NULL before writing.
+    __kmp_console_exists = TRUE;
+  }
+  __kmp_release_bootstrap_lock(&__kmp_console_lock);
+}
+
+#else
+// Outside Windows the runtime writes straight to the C stdio streams.
+#define __kmp_stderr (stderr)
+#define __kmp_stdout (stdout)
+#endif /* KMP_OS_WINDOWS */
+
+/* Formats a message and delivers it either into the in-memory debug buffer
+   (when __kmp_debug_buf is set and __kmp_debug_buffer is allocated) or to the
+   selected output stream.
+     out_stream - kmp_out selects __kmp_stdout, anything else __kmp_stderr.
+     format, ap - printf-style format string and its argument list.
+   No lock is taken here; __kmp_printf() and __kmp_fprintf() hold
+   __kmp_stdio_lock around the call. */
+void __kmp_vprintf(enum kmp_io out_stream, char const *format, va_list ap) {
+#if KMP_OS_WINDOWS
+  // Set up the console handles on first use.
+  if (!__kmp_console_exists) {
+    __kmp_redirect_output();
+  }
+  // Nothing to write to if the requested handle could not be obtained.
+  if (!__kmp_stderr && out_stream == kmp_err) {
+    return;
+  }
+  if (!__kmp_stdout && out_stream == kmp_out) {
+    return;
+  }
+#endif /* KMP_OS_WINDOWS */
+  auto stream = ((out_stream == kmp_out) ? __kmp_stdout : __kmp_stderr);
+
+  if (__kmp_debug_buf && __kmp_debug_buffer != NULL) {
+
+    // Line slots of the debug buffer are reused in turn (index modulo
+    // __kmp_debug_buf_lines); each slot is __kmp_debug_buf_chars wide.
+    int dc = __kmp_debug_count++ % __kmp_debug_buf_lines;
+    char *db = &__kmp_debug_buffer[dc * __kmp_debug_buf_chars];
+    int chars = 0;
+
+#ifdef KMP_DEBUG_PIDS
+    chars = KMP_SNPRINTF(db, __kmp_debug_buf_chars, "pid=%d: ",
+                         (kmp_int32)getpid());
+    // If the prefix itself did not fit (or formatting failed), drop it so the
+    // offset below stays inside the slot.
+    if (chars < 0 || chars >= __kmp_debug_buf_chars) {
+      chars = 0;
+    }
+#endif
+    // Append the message after the prefix (if any) instead of formatting over
+    // it, and only offer the space that is left in the slot.
+    chars += KMP_VSNPRINTF(db + chars, __kmp_debug_buf_chars - chars, format,
+                           ap);
+
+    if (chars + 1 > __kmp_debug_buf_chars) {
+      // Warn only when the required size grows beyond what was last reported.
+      if (chars + 1 > __kmp_debug_buf_warn_chars) {
+#if KMP_OS_WINDOWS
+        DWORD count;
+        __kmp_str_buf_print(&__kmp_console_buf, "OMP warning: Debugging buffer "
+                                                "overflow; increase "
+                                                "KMP_DEBUG_BUF_CHARS to %d\n",
+                            chars + 1);
+        WriteFile(stream, __kmp_console_buf.str, __kmp_console_buf.used, &count,
+                  NULL);
+        __kmp_str_buf_clear(&__kmp_console_buf);
+#else
+        fprintf(stream, "OMP warning: Debugging buffer overflow; "
+                        "increase KMP_DEBUG_BUF_CHARS to %d\n",
+                chars + 1);
+        fflush(stream);
+#endif
+        __kmp_debug_buf_warn_chars = chars + 1;
+      }
+      /* terminate string if overflow occurred */
+      db[__kmp_debug_buf_chars - 2] = '\n';
+      db[__kmp_debug_buf_chars - 1] = '\0';
+    }
+  } else {
+#if KMP_OS_WINDOWS
+    // Format into the shared console buffer, write it out, then reset it.
+    DWORD count;
+#ifdef KMP_DEBUG_PIDS
+    __kmp_str_buf_print(&__kmp_console_buf, "pid=%d: ", (kmp_int32)getpid());
+#endif
+    __kmp_str_buf_vprint(&__kmp_console_buf, format, ap);
+    WriteFile(stream, __kmp_console_buf.str, __kmp_console_buf.used, &count,
+              NULL);
+    __kmp_str_buf_clear(&__kmp_console_buf);
+#else
+#ifdef KMP_DEBUG_PIDS
+    fprintf(stream, "pid=%d: ", (kmp_int32)getpid());
+#endif
+    vfprintf(stream, format, ap);
+    fflush(stream);
+#endif
+  }
+}
+
+// printf-style output to the kmp_err stream, serialized by __kmp_stdio_lock.
+void __kmp_printf(char const *format, ...) {
+  va_list args;
+  va_start(args, format);
+
+  // Hold the stdio lock for the duration of the formatted write.
+  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
+  __kmp_vprintf(kmp_err, format, args);
+  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
+
+  va_end(args);
+}
+
+// Same as __kmp_printf() but without taking __kmp_stdio_lock.
+void __kmp_printf_no_lock(char const *format, ...) {
+  va_list args;
+  va_start(args, format);
+  __kmp_vprintf(kmp_err, format, args);
+  va_end(args);
+}
+
+// printf-style output to the stream selected by `stream` (kmp_out or
+// kmp_err), serialized by __kmp_stdio_lock.
+void __kmp_fprintf(enum kmp_io stream, char const *format, ...) {
+  va_list args;
+  va_start(args, format);
+
+  // Hold the stdio lock for the duration of the formatted write.
+  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
+  __kmp_vprintf(stream, format, args);
+  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
+
+  va_end(args);
+}
diff --git a/final/runtime/src/kmp_io.h b/final/runtime/src/kmp_io.h
new file mode 100644
index 0000000..49afda5
--- /dev/null
+++ b/final/runtime/src/kmp_io.h
@@ -0,0 +1,38 @@
+/*
+ * kmp_io.h -- RTL IO header file.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_IO_H
+#define KMP_IO_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* ------------------------------------------------------------------------ */
+
+// Selects the destination stream of __kmp_vprintf() / __kmp_fprintf().
+enum kmp_io {
+  kmp_out = 0,
+  kmp_err
+};
+
+// NOTE(review): this header includes nothing itself; kmp_bootstrap_lock_t and
+// va_list must already be declared by the including file.
+extern kmp_bootstrap_lock_t __kmp_stdio_lock; /* Control stdio functions */
+extern kmp_bootstrap_lock_t
+    __kmp_console_lock; /* Control console initialization */
+
+// __kmp_vprintf        - core formatter; takes no lock itself.
+// __kmp_printf         - writes to kmp_err under __kmp_stdio_lock.
+// __kmp_printf_no_lock - writes to kmp_err without taking the lock.
+// __kmp_fprintf        - writes to `stream` under __kmp_stdio_lock.
+// __kmp_close_console  - drops the cached console state (a no-op body is not
+//                        visible for non-Windows here; see kmp_io.cpp).
+extern void __kmp_vprintf(enum kmp_io stream, char const *format, va_list ap);
+extern void __kmp_printf(char const *format, ...);
+extern void __kmp_printf_no_lock(char const *format, ...);
+extern void __kmp_fprintf(enum kmp_io stream, char const *format, ...);
+extern void __kmp_close_console(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* KMP_IO_H */
diff --git a/final/runtime/src/kmp_itt.cpp b/final/runtime/src/kmp_itt.cpp
new file mode 100644
index 0000000..fa286ec
--- /dev/null
+++ b/final/runtime/src/kmp_itt.cpp
@@ -0,0 +1,160 @@
+#include "kmp_config.h"
+
+#if USE_ITT_BUILD
+/*
+ * kmp_itt.cpp -- ITT Notify interface.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp_itt.h"
+
+#if KMP_DEBUG
+#include "kmp_itt.inl"
+#endif
+
+#if USE_ITT_NOTIFY
+
+#include "ittnotify_config.h"
+// Snapshot of __kmp_itt__ittapi_global: taken in __kmp_itt_initialize() and
+// copied back by __kmp_itt_reset().
+__itt_global __kmp_ittapi_clean_global;
+extern __itt_global __kmp_itt__ittapi_global;
+// Domain bookkeeping; every array below has KMP_MAX_FRAME_DOMAINS entries.
+// NOTE(review): the counters presumably track how many entries of the
+// barrier/region arrays are in use -- confirm against kmp_itt.inl.
+kmp_int32 __kmp_barrier_domain_count;
+kmp_int32 __kmp_region_domain_count;
+__itt_domain *__kmp_itt_barrier_domains[KMP_MAX_FRAME_DOMAINS];
+__itt_domain *__kmp_itt_region_domains[KMP_MAX_FRAME_DOMAINS];
+__itt_domain *__kmp_itt_imbalance_domains[KMP_MAX_FRAME_DOMAINS];
+kmp_int32 __kmp_itt_region_team_size[KMP_MAX_FRAME_DOMAINS];
+// Metadata domain and string handles, all initially NULL.
+__itt_domain *metadata_domain = NULL;
+__itt_string_handle *string_handle_imbl = NULL;
+__itt_string_handle *string_handle_loop = NULL;
+__itt_string_handle *string_handle_sngl = NULL;
+
+#include "kmp_i18n.h"
+#include "kmp_str.h"
+#include "kmp_version.h"
+
+// Compile-time check that kmp_itt_mark_t and __itt_mark_type have the same
+// size.
+KMP_BUILD_ASSERT(sizeof(kmp_itt_mark_t) == sizeof(__itt_mark_type));
+
+/* Previously used warnings:
+
+   KMP_WARNING( IttAllNotifDisabled );
+   KMP_WARNING( IttObjNotifDisabled );
+   KMP_WARNING( IttMarkNotifDisabled );
+   KMP_WARNING( IttUnloadLibFailed, libittnotify );
+*/
+
+// Spin-iteration threshold compared against sync_iters by the
+// KMP_FSYNC_SPIN_* macros in kmp_itt.h; defaults to 0.
+kmp_int32 __kmp_itt_prepare_delay = 0;
+kmp_bootstrap_lock_t __kmp_itt_debug_lock =
+    KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_itt_debug_lock);
+
+#endif // USE_ITT_NOTIFY
+
+// Restores the ITT API global state from the snapshot saved by
+// __kmp_itt_initialize(). Does nothing when USE_ITT_NOTIFY is off.
+void __kmp_itt_reset() {
+#if USE_ITT_NOTIFY
+  __kmp_itt__ittapi_global = __kmp_ittapi_clean_global;
+#endif
+}
+
+// Saves a snapshot of the ITT API global state and reports the OpenMP RTL
+// version to ITTNotify as a mark. Does nothing when USE_ITT_NOTIFY is off.
+void __kmp_itt_initialize() {
+
+// ITTNotify library is loaded and initialized at first call to any ittnotify
+// function, so we do not need to explicitly load it any more. Just report OMP
+// RTL version to ITTNotify.
+
+#if USE_ITT_NOTIFY
+  // Keep a copy of the untouched global state for __kmp_itt_reset().
+  __kmp_ittapi_clean_global = __kmp_itt__ittapi_global;
+
+  // Compose the version banner: "OMP RTL Version a.b.c[:<itt api version>]".
+  kmp_str_buf_t banner;
+  __kmp_str_buf_init(&banner);
+  __kmp_str_buf_print(&banner, "OMP RTL Version %d.%d.%d", __kmp_version_major,
+                      __kmp_version_minor, __kmp_version_build);
+  if (__itt_api_version_ptr != NULL) {
+    __kmp_str_buf_print(&banner, ":%s", __itt_api_version());
+  }
+
+  // Publish the banner as an ITT mark, then drop the buffer.
+  __itt_mark_type mark;
+  mark = __itt_mark_create(banner.str);
+  __itt_mark(mark, NULL);
+  __kmp_str_buf_free(&banner);
+#endif
+
+} // __kmp_itt_initialize
+
+// Shuts the ITT library down via __kmp_itt_fini_ittlib(). Does nothing when
+// USE_ITT_NOTIFY is off.
+void __kmp_itt_destroy() {
+#if USE_ITT_NOTIFY
+  __kmp_itt_fini_ittlib();
+#endif
+} // __kmp_itt_destroy
+
+extern "C" void __itt_error_handler(__itt_error_code err, va_list args) {
+
+  switch (err) {
+  case __itt_error_no_module: {
+    char const *library = va_arg(args, char const *);
+#if KMP_OS_WINDOWS
+    int sys_err = va_arg(args, int);
+    kmp_msg_t err_code = KMP_SYSERRCODE(sys_err);
+    __kmp_msg(kmp_ms_warning, KMP_MSG(IttLoadLibFailed, library), err_code,
+              __kmp_msg_null);
+    if (__kmp_generate_warnings == kmp_warnings_off) {
+      __kmp_str_free(&err_code.str);
+    }
+#else
+    char const *sys_err = va_arg(args, char const *);
+    kmp_msg_t err_code = KMP_SYSERRMESG(sys_err);
+    __kmp_msg(kmp_ms_warning, KMP_MSG(IttLoadLibFailed, library), err_code,
+              __kmp_msg_null);
+    if (__kmp_generate_warnings == kmp_warnings_off) {
+      __kmp_str_free(&err_code.str);
+    }
+#endif
+  } break;
+  case __itt_error_no_symbol: {
+    char const *library = va_arg(args, char const *);
+    char const *symbol = va_arg(args, char const *);
+    KMP_WARNING(IttLookupFailed, symbol, library);
+  } break;
+  case __itt_error_unknown_group: {
+    char const *var = va_arg(args, char const *);
+    char const *group = va_arg(args, char const *);
+    KMP_WARNING(IttUnknownGroup, var, group);
+  } break;
+  case __itt_error_env_too_long: {
+    char const *var = va_arg(args, char const *);
+    size_t act_len = va_arg(args, size_t);
+    size_t max_len = va_arg(args, size_t);
+    KMP_WARNING(IttEnvVarTooLong, var, (unsigned long)act_len,
+                (unsigned long)max_len);
+  } break;
+  case __itt_error_cant_read_env: {
+    char const *var = va_arg(args, char const *);
+    int sys_err = va_arg(args, int);
+    kmp_msg_t err_code = KMP_ERR(sys_err);
+    __kmp_msg(kmp_ms_warning, KMP_MSG(CantGetEnvVar, var), err_code,
+              __kmp_msg_null);
+    if (__kmp_generate_warnings == kmp_warnings_off) {
+      __kmp_str_free(&err_code.str);
+    }
+  } break;
+  case __itt_error_system: {
+    char const *func = va_arg(args, char const *);
+    int sys_err = va_arg(args, int);
+    kmp_msg_t err_code = KMP_SYSERRCODE(sys_err);
+    __kmp_msg(kmp_ms_warning, KMP_MSG(IttFunctionError, func), err_code,
+              __kmp_msg_null);
+    if (__kmp_generate_warnings == kmp_warnings_off) {
+      __kmp_str_free(&err_code.str);
+    }
+  } break;
+  default: { KMP_WARNING(IttUnknownError, err); }
+  }
+} // __itt_error_handler
+
+#endif /* USE_ITT_BUILD */
diff --git a/final/runtime/src/kmp_itt.h b/final/runtime/src/kmp_itt.h
new file mode 100644
index 0000000..b14a193
--- /dev/null
+++ b/final/runtime/src/kmp_itt.h
@@ -0,0 +1,332 @@
+#if USE_ITT_BUILD
+/*
+ * kmp_itt.h -- ITT Notify interface.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_ITT_H
+#define KMP_ITT_H
+
+#include "kmp_lock.h"
+
+#define INTEL_ITTNOTIFY_API_PRIVATE
+#include "ittnotify.h"
+#include "legacy/ittnotify.h"
+
+// Linkage/inlining qualifier used on the __kmp_itt_* declarations in this
+// header: empty in debug builds, `static inline` otherwise.
+#if KMP_DEBUG
+#define __kmp_inline // Turn off inlining in debug mode.
+#else
+#define __kmp_inline static inline
+#endif
+
+#if USE_ITT_NOTIFY
+// Threshold used by the KMP_FSYNC_SPIN_* macros (compared with sync_iters).
+extern kmp_int32 __kmp_itt_prepare_delay;
+// ITT library shutdown routine; declared with C linkage for C++ includers.
+#ifdef __cplusplus
+extern "C" void __kmp_itt_fini_ittlib(void);
+#else
+extern void __kmp_itt_fini_ittlib(void);
+#endif
+#endif
+
+// Simplify the handling of an argument that is only required when USE_ITT_BUILD
+// is enabled. Expands to `, x`, i.e. it appends x as one more argument.
+#define USE_ITT_BUILD_ARG(x) , x
+
+// ITT interface lifecycle (defined in kmp_itt.cpp): initialize saves the ITT
+// global state and reports the RTL version, destroy finalizes the ITT
+// library, reset restores the saved ITT global state.
+void __kmp_itt_initialize();
+void __kmp_itt_destroy();
+void __kmp_itt_reset();
+
+// -----------------------------------------------------------------------------
+// New stuff for reporting high-level constructs.
+
+// Note the naming convention:
+//     __kmp_itt_xxxing() function should be called before action, while
+//     __kmp_itt_xxxed()  function should be called after action.
+// The functions below are only declared here, with the __kmp_inline
+// qualifier; kmp_itt.cpp pulls in their bodies from kmp_itt.inl in debug
+// builds.
+
+// --- Parallel region reporting ---
+__kmp_inline void
+__kmp_itt_region_forking(int gtid, int team_size,
+                         int barriers); // Master only, before forking threads.
+__kmp_inline void
+__kmp_itt_region_joined(int gtid); // Master only, after joining threads.
+// (*) Note: A thread may execute tasks after this point, though.
+
+// --- Frame reporting ---
+// region=0: no regions, region=1: parallel, region=2: serialized parallel
+// (region defaults to 0 when omitted).
+__kmp_inline void __kmp_itt_frame_submit(int gtid, __itt_timestamp begin,
+                                         __itt_timestamp end, int imbalance,
+                                         ident_t *loc, int team_size,
+                                         int region = 0);
+
+// --- Metadata reporting ---
+// begin/end - begin/end timestamps of a barrier frame, imbalance - aggregated
+// wait time value, reduction - whether this is a reduction barrier
+__kmp_inline void __kmp_itt_metadata_imbalance(int gtid, kmp_uint64 begin,
+                                               kmp_uint64 end,
+                                               kmp_uint64 imbalance,
+                                               kmp_uint64 reduction);
+// sched_type: 0 - static, 1 - dynamic, 2 - guided, 3 - custom (all others);
+// iterations - loop trip count, chunk - chunk size
+__kmp_inline void __kmp_itt_metadata_loop(ident_t *loc, kmp_uint64 sched_type,
+                                          kmp_uint64 iterations,
+                                          kmp_uint64 chunk);
+// Metadata for a single construct at source location loc.
+__kmp_inline void __kmp_itt_metadata_single(ident_t *loc);
+
+// --- Barrier reporting ---
+// __kmp_itt_barrier_object() returns the opaque object that the
+// starting/middle/finished calls take as `object`; set_name and delta default
+// to 0.
+__kmp_inline void *__kmp_itt_barrier_object(int gtid, int bt, int set_name = 0,
+                                            int delta = 0);
+__kmp_inline void __kmp_itt_barrier_starting(int gtid, void *object);
+__kmp_inline void __kmp_itt_barrier_middle(int gtid, void *object);
+__kmp_inline void __kmp_itt_barrier_finished(int gtid, void *object);
+
+// --- Taskwait reporting ---
+// Same object/starting/finished pattern as for barriers.
+__kmp_inline void *__kmp_itt_taskwait_object(int gtid);
+__kmp_inline void __kmp_itt_taskwait_starting(int gtid, void *object);
+__kmp_inline void __kmp_itt_taskwait_finished(int gtid, void *object);
+
+// --- Task reporting ---
+__kmp_inline void __kmp_itt_task_starting(void *object);
+__kmp_inline void __kmp_itt_task_finished(void *object);
+
+// --- Lock reporting ---
+// With KMP_USE_DYNAMIC_LOCK the *_creating() functions take an additional
+// source-location argument.
+#if KMP_USE_DYNAMIC_LOCK
+__kmp_inline void __kmp_itt_lock_creating(kmp_user_lock_p lock,
+                                          const ident_t *);
+#else
+__kmp_inline void __kmp_itt_lock_creating(kmp_user_lock_p lock);
+#endif
+__kmp_inline void __kmp_itt_lock_acquiring(kmp_user_lock_p lock);
+__kmp_inline void __kmp_itt_lock_acquired(kmp_user_lock_p lock);
+__kmp_inline void __kmp_itt_lock_releasing(kmp_user_lock_p lock);
+__kmp_inline void __kmp_itt_lock_cancelled(kmp_user_lock_p lock);
+__kmp_inline void __kmp_itt_lock_destroyed(kmp_user_lock_p lock);
+
+// --- Critical reporting ---
+// Mirrors the lock reporting set, except that there is no *_cancelled()
+// counterpart.
+#if KMP_USE_DYNAMIC_LOCK
+__kmp_inline void __kmp_itt_critical_creating(kmp_user_lock_p lock,
+                                              const ident_t *);
+#else
+__kmp_inline void __kmp_itt_critical_creating(kmp_user_lock_p lock);
+#endif
+__kmp_inline void __kmp_itt_critical_acquiring(kmp_user_lock_p lock);
+__kmp_inline void __kmp_itt_critical_acquired(kmp_user_lock_p lock);
+__kmp_inline void __kmp_itt_critical_releasing(kmp_user_lock_p lock);
+__kmp_inline void __kmp_itt_critical_destroyed(kmp_user_lock_p lock);
+
+// --- Single reporting ---
+__kmp_inline void __kmp_itt_single_start(int gtid);
+__kmp_inline void __kmp_itt_single_end(int gtid);
+
+// --- Ordered reporting ---
+__kmp_inline void __kmp_itt_ordered_init(int gtid);
+__kmp_inline void __kmp_itt_ordered_prep(int gtid);
+__kmp_inline void __kmp_itt_ordered_start(int gtid);
+__kmp_inline void __kmp_itt_ordered_end(int gtid);
+
+// --- Threads reporting ---
+__kmp_inline void __kmp_itt_thread_ignore();
+__kmp_inline void __kmp_itt_thread_name(int gtid);
+
+// --- System objects ---
+__kmp_inline void __kmp_itt_system_object_created(void *object,
+                                                  char const *name);
+
+// --- Stack stitching ---
+// The __itt_caller returned by *_create() is the handle passed to the
+// destroy/enter/leave calls.
+__kmp_inline __itt_caller __kmp_itt_stack_caller_create(void);
+__kmp_inline void __kmp_itt_stack_caller_destroy(__itt_caller);
+__kmp_inline void __kmp_itt_stack_callee_enter(__itt_caller);
+__kmp_inline void __kmp_itt_stack_callee_leave(__itt_caller);
+
+// -----------------------------------------------------------------------------
+// Old stuff for reporting low-level internal synchronization.
+
+#if USE_ITT_NOTIFY
+
+/* Support for SSC marks, which are used by SDE
+   http://software.intel.com/en-us/articles/intel-software-development-emulator
+   to mark points in instruction traces that represent spin-loops and are
+   therefore uninteresting when collecting traces for architecture simulation.
+ */
+// INCLUDE_SSC_MARKS may be predefined by the build; otherwise it defaults to
+// on for Linux x86_64 only.
+#ifndef INCLUDE_SSC_MARKS
+#define INCLUDE_SSC_MARKS (KMP_OS_LINUX && KMP_ARCH_X86_64)
+#endif
+
+/* Linux 64 only for now */
+#if (INCLUDE_SSC_MARKS && KMP_OS_LINUX && KMP_ARCH_X86_64)
+// Portable (at least for gcc and icc) code to insert the necessary instructions
+// to set %ebx and execute the unlikely no-op.
+// `tag` is used as an immediate operand ("i" constraint), so it must be a
+// compile-time constant; %ebx is listed as clobbered.
+#if defined(__INTEL_COMPILER)
+#define INSERT_SSC_MARK(tag) __SSC_MARK(tag)
+#else
+#define INSERT_SSC_MARK(tag)                                                   \
+  __asm__ __volatile__("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(tag)    \
+                       : "%ebx")
+#endif
+#else
+// Marks disabled: expands to a no-op expression.
+#define INSERT_SSC_MARK(tag) ((void)0)
+#endif
+
+/* Markers for the start and end of regions that represent polling and are
+   therefore uninteresting to architectural simulations. 0x4376 and 0x4377 are
+   arbitrary numbers that should be unique in the space of SSC tags, but there
+   is no central issuing authority; rather, randomness is expected to work. */
+#define SSC_MARK_SPIN_START() INSERT_SSC_MARK(0x4376)
+#define SSC_MARK_SPIN_END() INSERT_SSC_MARK(0x4377)
+
+// Markers for architecture simulation.
+// FORKING      : Before the master thread forks.
+// JOINING      : At the start of the join.
+// INVOKING     : Before the threads invoke microtasks.
+// DISPATCH_INIT: At the start of dynamically scheduled loop.
+// DISPATCH_NEXT: After claiming next iteration of dynamically scheduled loop.
+#define SSC_MARK_FORKING() INSERT_SSC_MARK(0xd693)
+#define SSC_MARK_JOINING() INSERT_SSC_MARK(0xd694)
+#define SSC_MARK_INVOKING() INSERT_SSC_MARK(0xd695)
+#define SSC_MARK_DISPATCH_INIT() INSERT_SSC_MARK(0xd696)
+#define SSC_MARK_DISPATCH_NEXT() INSERT_SSC_MARK(0xd697)
+
+// The object is an address that associates a specific set of the prepare,
+// acquire, release, and cancel operations.
+
+/* Sync prepare indicates a thread is going to start waiting for another thread
+   to send a release event.  This operation should be done just before the
+   thread begins checking for the existence of the release event */
+
+/* Sync cancel indicates a thread is cancelling a wait on another thread and
+   continuing execution without waiting for the other thread to release it */
+
+/* Sync acquired indicates a thread has received a release event from another
+   thread and has stopped waiting.  This operation must occur only after the
+   release event is received. */
+
+/* Sync release indicates a thread is going to send a release event to another
+   thread so it will stop waiting and continue execution. This operation must
+   just happen before the release event. */
+
+// Each macro casts its (parenthesized) argument to void * and forwards it to
+// the matching __itt_fsync_* call.
+#define KMP_FSYNC_PREPARE(obj) __itt_fsync_prepare((void *)(obj))
+#define KMP_FSYNC_CANCEL(obj) __itt_fsync_cancel((void *)(obj))
+#define KMP_FSYNC_ACQUIRED(obj) __itt_fsync_acquired((void *)(obj))
+#define KMP_FSYNC_RELEASING(obj) __itt_fsync_releasing((void *)(obj))
+
+/* In case of waiting in a spin loop, ITT wants KMP_FSYNC_PREPARE() to be called
+   with a delay (and not called at all if waiting time is small). So, in spin
+   loops, do not use KMP_FSYNC_PREPARE(), but use KMP_FSYNC_SPIN_INIT() (before
+   spin loop), KMP_FSYNC_SPIN_PREPARE() (within the spin loop), and
+   KMP_FSYNC_SPIN_ACQUIRED(). See KMP_WAIT() for example. */
+
+// KMP_FSYNC_SPIN_INIT expands to a declaration plus statements, not a single
+// expression: it declares the local counter `sync_iters` in the enclosing
+// scope (read by the other two SPIN macros) and, if the prepare hook pointer
+// is set and obj is NULL, assigns spin to obj -- so obj must be an lvalue.
+#undef KMP_FSYNC_SPIN_INIT
+#define KMP_FSYNC_SPIN_INIT(obj, spin)                                         \
+  int sync_iters = 0;                                                          \
+  if (__itt_fsync_prepare_ptr) {                                               \
+    if (obj == NULL) {                                                         \
+      obj = spin;                                                              \
+    } /* if */                                                                 \
+  } /* if */                                                                   \
+  SSC_MARK_SPIN_START()
+
+#undef KMP_FSYNC_SPIN_PREPARE
+/* To be called on every iteration of a spin loop opened with
+   KMP_FSYNC_SPIN_INIT(). Counts iterations in `sync_iters` and reports
+   "prepare" to ITT exactly once, on the iteration where the counter reaches
+   __kmp_itt_prepare_delay; after that the counter stops advancing. The
+   argument is handed to KMP_FSYNC_PREPARE(), which performs the
+   parenthesized (void *) cast, so expression arguments are cast as a whole. */
+#define KMP_FSYNC_SPIN_PREPARE(obj)                                            \
+  do {                                                                         \
+    if (__itt_fsync_prepare_ptr && sync_iters < __kmp_itt_prepare_delay) {     \
+      ++sync_iters;                                                            \
+      if (sync_iters >= __kmp_itt_prepare_delay) {                             \
+        KMP_FSYNC_PREPARE(obj);                                                \
+      } /* if */                                                               \
+    } /* if */                                                                 \
+  } while (0)
+#undef KMP_FSYNC_SPIN_ACQUIRED
+/* Closes a spin-wait section opened with KMP_FSYNC_SPIN_INIT(): emits the
+   spin-end SSC mark and reports "acquired" to ITT when `sync_iters` has
+   reached __kmp_itt_prepare_delay. The argument is handed to
+   KMP_FSYNC_ACQUIRED(), which performs the parenthesized (void *) cast, so
+   expression arguments are cast as a whole. */
+#define KMP_FSYNC_SPIN_ACQUIRED(obj)                                           \
+  do {                                                                         \
+    SSC_MARK_SPIN_END();                                                       \
+    if (sync_iters >= __kmp_itt_prepare_delay) {                               \
+      KMP_FSYNC_ACQUIRED(obj);                                                 \
+    } /* if */                                                                 \
+  } while (0)
+
+/* ITT will not report objects created within KMP_ITT_IGNORE(), e. g.:
+       KMP_ITT_IGNORE(
+           ptr = malloc( size );
+       );
+   How it works: when __itt_state_get_ptr is non-null, the current ITT state
+   is saved with __itt_state_get() and the object mode is switched with
+   __itt_obj_mode_set(__itt_obj_prop_ignore, __itt_obj_state_set); then the
+   statement runs; finally, if __itt_state_get_ptr is (still) non-null, the
+   saved state is put back with __itt_state_set(). When the pointer is null
+   the statement simply runs. The statement is expanded inside its own braces,
+   so variables it declares are not visible after the macro.
+*/
+#define KMP_ITT_IGNORE(statement)                                              \
+  do {                                                                         \
+    __itt_state_t __itt_state_;                                                \
+    if (__itt_state_get_ptr) {                                                 \
+      __itt_state_ = __itt_state_get();                                        \
+      __itt_obj_mode_set(__itt_obj_prop_ignore, __itt_obj_state_set);          \
+    } /* if */                                                                 \
+    { statement }                                                              \
+    if (__itt_state_get_ptr) {                                                 \
+      __itt_state_set(__itt_state_);                                           \
+    } /* if */                                                                 \
+  } while (0)
+
+const int KMP_MAX_FRAME_DOMAINS =
+    512; // Capacity of each frame-domain table declared below (domains map
+// to different OpenMP regions in the user source code).
+extern kmp_int32 __kmp_barrier_domain_count; // barrier slots handed out so far
+extern kmp_int32 __kmp_region_domain_count; // region slots handed out so far
+extern __itt_domain *__kmp_itt_barrier_domains[KMP_MAX_FRAME_DOMAINS];
+extern __itt_domain *__kmp_itt_region_domains[KMP_MAX_FRAME_DOMAINS];
+extern __itt_domain *__kmp_itt_imbalance_domains[KMP_MAX_FRAME_DOMAINS];
+extern kmp_int32 __kmp_itt_region_team_size[KMP_MAX_FRAME_DOMAINS];
+extern __itt_domain *metadata_domain; // "OMP Metadata" domain, created lazily
+extern __itt_string_handle *string_handle_imbl; // "omp_metadata_imbalance"
+extern __itt_string_handle *string_handle_loop; // "omp_metadata_loop"
+extern __itt_string_handle *string_handle_sngl; // "omp_metadata_single"
+
+#else
+
+/* Null definitions of the synchronization tracing macros, used when
+   USE_ITT_NOTIFY is off: every KMP_FSYNC_* macro expands to ((void)0) and
+   KMP_ITT_IGNORE() just executes its statement. */
+#define KMP_FSYNC_PREPARE(obj) ((void)0)
+#define KMP_FSYNC_CANCEL(obj) ((void)0)
+#define KMP_FSYNC_ACQUIRED(obj) ((void)0)
+#define KMP_FSYNC_RELEASING(obj) ((void)0)
+
+#define KMP_FSYNC_SPIN_INIT(obj, spin) ((void)0)
+#define KMP_FSYNC_SPIN_PREPARE(obj) ((void)0)
+#define KMP_FSYNC_SPIN_ACQUIRED(obj) ((void)0)
+
+#define KMP_ITT_IGNORE(stmt)                                                   \
+  do {                                                                         \
+    stmt                                                                       \
+  } while (0)
+
+#endif // USE_ITT_NOTIFY
+
+#if !KMP_DEBUG
+// In release mode include definitions of inline functions.
+#include "kmp_itt.inl"
+#endif
+
+#endif // KMP_ITT_H
+
+#else /* USE_ITT_BUILD */
+
+// Null definitions of the synchronization tracing macros.
+// If USE_ITT_BUILD is not enabled, USE_ITT_NOTIFY cannot be either.
+// By defining these we avoid unpleasant ifdef tests in many places.
+#define KMP_FSYNC_PREPARE(obj) ((void)0)
+#define KMP_FSYNC_CANCEL(obj) ((void)0)
+#define KMP_FSYNC_ACQUIRED(obj) ((void)0)
+#define KMP_FSYNC_RELEASING(obj) ((void)0)
+
+#define KMP_FSYNC_SPIN_INIT(obj, spin) ((void)0)
+#define KMP_FSYNC_SPIN_PREPARE(obj) ((void)0)
+#define KMP_FSYNC_SPIN_ACQUIRED(obj) ((void)0)
+
+#define KMP_ITT_IGNORE(stmt)                                                   \
+  do {                                                                         \
+    stmt                                                                       \
+  } while (0)
+
+#define USE_ITT_BUILD_ARG(x)
+
+#endif /* USE_ITT_BUILD */
diff --git a/final/runtime/src/kmp_itt.inl b/final/runtime/src/kmp_itt.inl
new file mode 100644
index 0000000..6e37ce0
--- /dev/null
+++ b/final/runtime/src/kmp_itt.inl
@@ -0,0 +1,1042 @@
+#if USE_ITT_BUILD
+/*
+ * kmp_itt.inl -- Inline functions of ITT Notify.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Inline function definitions. This file should be included into kmp_itt.h file
+// for production build (to let the compiler inline functions) or into kmp_itt.c
+// file for debug build (to reduce the number of files to recompile and save
+// build time).
+
+#include "kmp.h"
+#include "kmp_str.h"
+
+#if KMP_ITT_DEBUG
+extern kmp_bootstrap_lock_t __kmp_itt_debug_lock;
+#define KMP_ITT_DEBUG_LOCK()                                                   \
+  { __kmp_acquire_bootstrap_lock(&__kmp_itt_debug_lock); }
+#define KMP_ITT_DEBUG_PRINT(...)                                               \
+  {                                                                            \
+    fprintf(stderr, "#%02d: ", __kmp_get_gtid());                              \
+    fprintf(stderr, __VA_ARGS__);                                              \
+    fflush(stderr);                                                            \
+    __kmp_release_bootstrap_lock(&__kmp_itt_debug_lock);                       \
+  }
+#else
+#define KMP_ITT_DEBUG_LOCK()
+#define KMP_ITT_DEBUG_PRINT(...)
+#endif // KMP_ITT_DEBUG
+
+// Ensure that the functions are static if they're supposed to be inlined.
+// Otherwise they cannot be used in more than one file, since there will be
+// multiple definitions. Debug builds (KMP_DEBUG) leave LINKAGE empty.
+#if KMP_DEBUG
+#define LINKAGE
+#else
+#define LINKAGE static inline
+#endif
+
+/* ZCA interface used by Intel(R) Inspector. Intel(R) Parallel Amplifier uses
+   this API to support user-defined synchronization primitives, but does not
+   use ZCA; it would be safe to turn this off until wider support becomes
+   available.
+   When USE_ITT_ZCA is set and the Intel compiler is version 1200 or newer,
+   __itt_sync_acquired and __itt_sync_releasing are redefined on top of
+   compiler notify intrinsics.
+   NOTE(review): "acquired" maps to __notify_zc_intrinsic while "releasing"
+   maps to __notify_intrinsic -- confirm the asymmetry is intentional. */
+#if USE_ITT_ZCA
+#ifdef __INTEL_COMPILER
+#if __INTEL_COMPILER >= 1200
+#undef __itt_sync_acquired
+#undef __itt_sync_releasing
+#define __itt_sync_acquired(addr)                                              \
+  __notify_zc_intrinsic((char *)"sync_acquired", addr)
+#define __itt_sync_releasing(addr)                                             \
+  __notify_intrinsic((char *)"sync_releasing", addr)
+#endif
+#endif
+#endif
+
+static kmp_bootstrap_lock_t metadata_lock =
+    KMP_BOOTSTRAP_LOCK_INITIALIZER(metadata_lock); // guards lazy metadata init
+
+/* Parallel region reporting.
+ * __kmp_itt_region_forking should be called by master thread of a team.
+   Exact moment of call does not matter, but it should be completed before any
+   thread of this team calls __kmp_itt_region_starting.
+ * __kmp_itt_region_starting should be called by each thread of a team just
+   before entering parallel region body.
+ * __kmp_itt_region_finished should be called by each thread of a team right
+   after returning from parallel region body.
+ * __kmp_itt_region_joined should be called by master thread of a team, after
+   all threads called __kmp_itt_region_finished.
+
+ Note: Thread waiting at join barrier (after __kmp_itt_region_finished) can
+ execute some more user code -- such a thread can execute tasks.
+
+ Note: The overhead of logging region_starting and region_finished in each
+ thread is too large, so these calls are not used. */
+
+// Reports the start of an outermost parallel region to ITT as a frame begin
+// on the domain that belongs to the forking thread's source location.
+//
+//   gtid      - global thread id of the forking thread; its team and its
+//               th_ident (source location) are looked up from it.
+//   team_size - number of threads in the region; becomes part of the domain
+//               name, so a change of team size re-creates the domain.
+//   barriers  - when non-zero, a barrier domain is also reserved for the
+//               location the first time the location is seen.
+//
+// loc->reserved_2 caches two 1-based indexes (0 means "none yet"): the region
+// domain index in the low two bytes and the barrier domain index in the high
+// two bytes. This is OK because KMP_MAX_FRAME_DOMAINS = 512.
+// Nested (non-outermost) teams and threads without a location are ignored.
+LINKAGE void __kmp_itt_region_forking(int gtid, int team_size, int barriers) {
+#if USE_ITT_NOTIFY
+  kmp_team_t *team = __kmp_team_from_gtid(gtid);
+  if (team->t.t_active_level > 1) {
+    // The frame notifications are only supported for the outermost teams.
+    return;
+  }
+  ident_t *loc = __kmp_thread_from_gtid(gtid)->th.th_ident;
+  if (loc) {
+    // Only the low half decides whether a region domain exists: the high half
+    // may already hold a barrier index stored by __kmp_itt_frame_submit(), and
+    // treating that as "region domain exists" would yield index -1.
+    if ((loc->reserved_2 & 0x0000FFFF) == 0) {
+      if (__kmp_region_domain_count < KMP_MAX_FRAME_DOMAINS) {
+        int frm =
+            KMP_TEST_THEN_INC32(&__kmp_region_domain_count); // get "old" value
+        if (frm >= KMP_MAX_FRAME_DOMAINS) {
+          KMP_TEST_THEN_DEC32(&__kmp_region_domain_count); // revert the count
+          return; // region index in loc->reserved_2 is still 0
+        }
+        // if (!KMP_COMPARE_AND_STORE_ACQ32( &loc->reserved_2, 0, frm + 1 )) {
+        //    frm = loc->reserved_2 - 1;   // get value saved by other thread
+        //    for same loc
+        //} // AC: this block is to replace next unsynchronized line
+        loc->reserved_2 |= (frm + 1); // save "new" value
+
+        // Transform compiler-generated region location into the format
+        // that the tools more or less standardized on:
+        //   "<func>$omp$parallel:team_size@[file:]<line>[:<col>]"
+        kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, 1);
+        char *region_name = __kmp_str_format(
+            "%s$omp$parallel:%d@%s:%d:%d", str_loc.func, team_size,
+            str_loc.file, str_loc.line, str_loc.col);
+
+        __itt_suppress_push(__itt_suppress_memory_errors);
+        __kmp_itt_region_domains[frm] = __itt_domain_create(region_name);
+        __itt_suppress_pop();
+
+        __kmp_str_free(&region_name);
+        // Remember the team size the domain was named after, so that the next
+        // fork from this location with the same size reuses the domain.
+        __kmp_itt_region_team_size[frm] = team_size;
+
+        // Reserve a barrier domain too, unless one is already recorded in the
+        // high half or the barrier table is full. Running out of barrier
+        // slots must not abort the region report: the frame begin below still
+        // has to be issued and str_loc still has to be released.
+        if (barriers && (loc->reserved_2 & 0xFFFF0000) == 0 &&
+            __kmp_barrier_domain_count < KMP_MAX_FRAME_DOMAINS) {
+          int bar_frm = KMP_TEST_THEN_INC32(
+              &__kmp_barrier_domain_count); // get "old" value
+          if (bar_frm >= KMP_MAX_FRAME_DOMAINS) {
+            KMP_TEST_THEN_DEC32(
+                &__kmp_barrier_domain_count); // revert the count
+          } else {
+            // NOTE(review): the trailing number is str_loc.col, not
+            // str_loc.line -- confirm this is the intended barrier name.
+            char *barrier_name =
+                __kmp_str_format("%s$omp$barrier@%s:%d", str_loc.func,
+                                 str_loc.file, str_loc.col);
+            __itt_suppress_push(__itt_suppress_memory_errors);
+            __kmp_itt_barrier_domains[bar_frm] =
+                __itt_domain_create(barrier_name);
+            __itt_suppress_pop();
+            __kmp_str_free(&barrier_name);
+            // Save the barrier frame index to the high two bytes.
+            loc->reserved_2 |= (bar_frm + 1) << 16;
+          }
+        }
+        __kmp_str_loc_free(&str_loc);
+        __itt_frame_begin_v3(__kmp_itt_region_domains[frm], NULL);
+      }
+    } else { // Region domain exists for this location
+      unsigned int frm = (loc->reserved_2 & 0x0000FFFF) - 1;
+      if (frm < KMP_MAX_FRAME_DOMAINS) {
+        // Check if team size was changed. Then create new region domain for
+        // this location.
+        if (__kmp_itt_region_team_size[frm] != team_size) {
+          kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, 1);
+          char *region_name = __kmp_str_format(
+              "%s$omp$parallel:%d@%s:%d:%d", str_loc.func, team_size,
+              str_loc.file, str_loc.line, str_loc.col);
+
+          __itt_suppress_push(__itt_suppress_memory_errors);
+          __kmp_itt_region_domains[frm] = __itt_domain_create(region_name);
+          __itt_suppress_pop();
+
+          __kmp_str_free(&region_name);
+          __kmp_str_loc_free(&str_loc);
+          __kmp_itt_region_team_size[frm] = team_size;
+        }
+        // Either freshly re-created or unchanged: begin the frame on it.
+        __itt_frame_begin_v3(__kmp_itt_region_domains[frm], NULL);
+      }
+    }
+    KMP_ITT_DEBUG_LOCK();
+    KMP_ITT_DEBUG_PRINT("[frm beg] gtid=%d, idx=%x, loc:%p\n", gtid,
+                        loc->reserved_2, loc);
+  }
+#endif
+} // __kmp_itt_region_forking
+
+// -----------------------------------------------------------------------------
+// Submits one completed frame [begin, end] to ITT.
+//
+//   gtid      - global thread id of the caller (used to find the team and in
+//               debug output).
+//   begin/end - frame timestamps.
+//   imbalance - barrier reporting only: non-zero selects the
+//               "barrier-imbalance" domain instead of the "barrier" domain.
+//   loc       - source location the frame belongs to; nothing is reported
+//               when it is NULL.
+//   team_size - number of threads; part of the region / imbalance domain name.
+//   region    - non-zero: report a parallel region frame (2 means the region
+//               is counted as one extra nesting level); zero: report a
+//               barrier frame.
+//
+// loc->reserved_2 caches two 1-based domain indexes (0 means "none yet"):
+// region index in the low two bytes, barrier index in the high two bytes.
+LINKAGE void __kmp_itt_frame_submit(int gtid, __itt_timestamp begin,
+                                    __itt_timestamp end, int imbalance,
+                                    ident_t *loc, int team_size, int region) {
+#if USE_ITT_NOTIFY
+  if (region) {
+    if (loc == NULL) {
+      // No location to attach the frame to (the barrier path below makes the
+      // same check).
+      return;
+    }
+    kmp_team_t *team = __kmp_team_from_gtid(gtid);
+    int serialized = (region == 2 ? 1 : 0);
+    if (team->t.t_active_level + serialized > 1) {
+      // The frame notifications are only supported for the outermost teams.
+      return;
+    }
+    // Cached region index, or -1 if the domain has not been created before.
+    int frm = (loc->reserved_2 & 0x0000FFFF) - 1;
+    int make_domain = 0;
+    if (frm < 0) {
+      if (__kmp_region_domain_count < KMP_MAX_FRAME_DOMAINS) {
+        frm =
+            KMP_TEST_THEN_INC32(&__kmp_region_domain_count); // get "old" value
+        if (frm >= KMP_MAX_FRAME_DOMAINS) {
+          KMP_TEST_THEN_DEC32(&__kmp_region_domain_count); // revert the count
+          return; // region index in loc->reserved_2 is still 0
+        }
+        loc->reserved_2 |= (frm + 1); // save "new" value
+        make_domain = 1;
+      }
+    } else if (frm < KMP_MAX_FRAME_DOMAINS) {
+      // Region domain exists for this location. Check if team size was
+      // changed; then create new region domain for this location.
+      make_domain = (__kmp_itt_region_team_size[frm] != team_size);
+    }
+    if (frm >= 0 && frm < KMP_MAX_FRAME_DOMAINS) {
+      if (make_domain) {
+        // Transform compiler-generated region location into the format
+        // that the tools more or less standardized on:
+        //   "<func>$omp$parallel:team_size@[file:]<line>[:<col>]"
+        kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, 1);
+        char *region_name = __kmp_str_format(
+            "%s$omp$parallel:%d@%s:%d:%d", str_loc.func, team_size,
+            str_loc.file, str_loc.line, str_loc.col);
+
+        __itt_suppress_push(__itt_suppress_memory_errors);
+        __kmp_itt_region_domains[frm] = __itt_domain_create(region_name);
+        __itt_suppress_pop();
+
+        __kmp_str_free(&region_name);
+        __kmp_str_loc_free(&str_loc);
+        __kmp_itt_region_team_size[frm] = team_size;
+      }
+      __itt_frame_submit_v3(__kmp_itt_region_domains[frm], NULL, begin, end);
+    }
+    KMP_ITT_DEBUG_LOCK();
+    KMP_ITT_DEBUG_PRINT(
+        "[reg sub] gtid=%d, idx=%x, region:%d, loc:%p, beg:%llu, end:%llu\n",
+        gtid, loc->reserved_2, region, loc, begin, end);
+    return;
+  } else { // called for barrier reporting
+    if (loc) {
+      if ((loc->reserved_2 & 0xFFFF0000) == 0) {
+        if (__kmp_barrier_domain_count < KMP_MAX_FRAME_DOMAINS) {
+          int frm = KMP_TEST_THEN_INC32(
+              &__kmp_barrier_domain_count); // get "old" value
+          if (frm >= KMP_MAX_FRAME_DOMAINS) {
+            KMP_TEST_THEN_DEC32(
+                &__kmp_barrier_domain_count); // revert the count
+            return; // barrier index in loc->reserved_2 is still 0
+          }
+          // Save the barrier frame index to the high two bytes.
+          loc->reserved_2 |= (frm + 1) << 16; // save "new" value
+
+          // Transform compiler-generated region location into the format
+          // that the tools more or less standardized on:
+          //   "<func>$omp$frame@[file:]<line>[:<col>]"
+          // NOTE(review): both names below end in str_loc.col, not
+          // str_loc.line -- confirm this is intended.
+          kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, 1);
+          if (imbalance) {
+            char *buff_imb = __kmp_str_format(
+                "%s$omp$barrier-imbalance:%d@%s:%d", str_loc.func, team_size,
+                str_loc.file, str_loc.col);
+            __itt_suppress_push(__itt_suppress_memory_errors);
+            __kmp_itt_imbalance_domains[frm] = __itt_domain_create(buff_imb);
+            __itt_suppress_pop();
+            __itt_frame_submit_v3(__kmp_itt_imbalance_domains[frm], NULL, begin,
+                                  end);
+            __kmp_str_free(&buff_imb);
+          } else {
+            char *buff = __kmp_str_format("%s$omp$barrier@%s:%d", str_loc.func,
+                                          str_loc.file, str_loc.col);
+            __itt_suppress_push(__itt_suppress_memory_errors);
+            __kmp_itt_barrier_domains[frm] = __itt_domain_create(buff);
+            __itt_suppress_pop();
+            __itt_frame_submit_v3(__kmp_itt_barrier_domains[frm], NULL, begin,
+                                  end);
+            __kmp_str_free(&buff);
+          }
+          __kmp_str_loc_free(&str_loc);
+        }
+      } else { // if it is not 0 then it should be <= KMP_MAX_FRAME_DOMAINS
+        // Verify that expectation before indexing the tables.
+        unsigned int frm = (unsigned int)((loc->reserved_2 >> 16) - 1);
+        if (frm < KMP_MAX_FRAME_DOMAINS) {
+          if (imbalance) {
+            __itt_frame_submit_v3(__kmp_itt_imbalance_domains[frm], NULL, begin,
+                                  end);
+          } else {
+            __itt_frame_submit_v3(__kmp_itt_barrier_domains[frm], NULL, begin,
+                                  end);
+          }
+        }
+      }
+      KMP_ITT_DEBUG_LOCK();
+      KMP_ITT_DEBUG_PRINT(
+          "[frm sub] gtid=%d, idx=%x, loc:%p, beg:%llu, end:%llu\n", gtid,
+          loc->reserved_2, loc, begin, end);
+    }
+  }
+#endif
+} // __kmp_itt_frame_submit
+
+// -----------------------------------------------------------------------------
+// Sends one "omp_metadata_imbalance" record to ITT: four u64 values in the
+// order begin, end, imbalance, reduction. The metadata domain and the three
+// string handles are created on first use under metadata_lock. gtid is not
+// used.
+LINKAGE void __kmp_itt_metadata_imbalance(int gtid, kmp_uint64 begin,
+                                          kmp_uint64 end, kmp_uint64 imbalance,
+                                          kmp_uint64 reduction) {
+#if USE_ITT_NOTIFY
+  if (!metadata_domain) {
+    __kmp_acquire_bootstrap_lock(&metadata_lock);
+    // Re-check under the lock: another thread may have finished the setup.
+    if (!metadata_domain) {
+      __itt_suppress_push(__itt_suppress_memory_errors);
+      metadata_domain = __itt_domain_create("OMP Metadata");
+      string_handle_imbl = __itt_string_handle_create("omp_metadata_imbalance");
+      string_handle_loop = __itt_string_handle_create("omp_metadata_loop");
+      string_handle_sngl = __itt_string_handle_create("omp_metadata_single");
+      __itt_suppress_pop();
+    }
+    __kmp_release_bootstrap_lock(&metadata_lock);
+  }
+
+  kmp_uint64 record[4] = {begin, end, imbalance, reduction};
+
+  __itt_metadata_add(metadata_domain, __itt_null, string_handle_imbl,
+                     __itt_metadata_u64, 4, record);
+#endif
+} // __kmp_itt_metadata_imbalance
+
+// -----------------------------------------------------------------------------
+// Sends one "omp_metadata_loop" record to ITT: five u64 values in the order
+// source line, source column, sched_type, iterations, chunk. Line and column
+// are parsed from loc->psource, which is expected to look like
+// ";file;func;line;col;;". A psource with fewer than four semicolons trips
+// the debug assertions; in builds without them the missing fields are
+// reported as 0 instead of dereferencing a null search result.
+// The metadata domain and string handles are created on first use under
+// metadata_lock. loc must not be NULL.
+LINKAGE void __kmp_itt_metadata_loop(ident_t *loc, kmp_uint64 sched_type,
+                                     kmp_uint64 iterations, kmp_uint64 chunk) {
+#if USE_ITT_NOTIFY
+  if (metadata_domain == NULL) {
+    __kmp_acquire_bootstrap_lock(&metadata_lock);
+    if (metadata_domain == NULL) {
+      __itt_suppress_push(__itt_suppress_memory_errors);
+      metadata_domain = __itt_domain_create("OMP Metadata");
+      string_handle_imbl = __itt_string_handle_create("omp_metadata_imbalance");
+      string_handle_loop = __itt_string_handle_create("omp_metadata_loop");
+      string_handle_sngl = __itt_string_handle_create("omp_metadata_single");
+      __itt_suppress_pop();
+    }
+    __kmp_release_bootstrap_lock(&metadata_lock);
+  }
+
+  // Parse line and column from psource string: ";file;func;line;col;;"
+  KMP_DEBUG_ASSERT(loc->psource);
+  const char *s_line = NULL; // 3-rd semicolon; the line number follows it
+  const char *s_col = NULL; // 4-th semicolon; the column number follows it
+  const char *cursor = loc->psource;
+  for (int semicolon = 1; cursor != NULL && semicolon <= 4; ++semicolon) {
+    cursor = strchr(cursor, ';');
+    if (cursor == NULL) {
+      break; // malformed psource: fewer separators than expected
+    }
+    if (semicolon == 3) {
+      s_line = cursor;
+    } else if (semicolon == 4) {
+      s_col = cursor;
+    }
+    ++cursor; // continue the search after this semicolon
+  }
+  KMP_DEBUG_ASSERT(s_line);
+  KMP_DEBUG_ASSERT(s_col);
+
+  kmp_uint64 loop_data[5];
+  loop_data[0] = (s_line != NULL) ? atoi(s_line + 1) : 0; // read line
+  loop_data[1] = (s_col != NULL) ? atoi(s_col + 1) : 0; // read column
+  loop_data[2] = sched_type;
+  loop_data[3] = iterations;
+  loop_data[4] = chunk;
+
+  __itt_metadata_add(metadata_domain, __itt_null, string_handle_loop,
+                     __itt_metadata_u64, 5, loop_data);
+#endif
+} // __kmp_itt_metadata_loop
+
+// -----------------------------------------------------------------------------
+// Sends one "omp_metadata_single" record to ITT: two u64 values, the source
+// line and column obtained by parsing loc->psource with __kmp_str_loc_init().
+// The metadata domain and string handles are created on first use under
+// metadata_lock.
+LINKAGE void __kmp_itt_metadata_single(ident_t *loc) {
+#if USE_ITT_NOTIFY
+  if (!metadata_domain) {
+    __kmp_acquire_bootstrap_lock(&metadata_lock);
+    // Re-check under the lock: another thread may have finished the setup.
+    if (!metadata_domain) {
+      __itt_suppress_push(__itt_suppress_memory_errors);
+      metadata_domain = __itt_domain_create("OMP Metadata");
+      string_handle_imbl = __itt_string_handle_create("omp_metadata_imbalance");
+      string_handle_loop = __itt_string_handle_create("omp_metadata_loop");
+      string_handle_sngl = __itt_string_handle_create("omp_metadata_single");
+      __itt_suppress_pop();
+    }
+    __kmp_release_bootstrap_lock(&metadata_lock);
+  }
+
+  kmp_uint64 position[2];
+  kmp_str_loc_t parsed = __kmp_str_loc_init(loc->psource, 1);
+  position[0] = parsed.line;
+  position[1] = parsed.col;
+  // The numbers are copied out, so the parsed location can go right away.
+  __kmp_str_loc_free(&parsed);
+
+  __itt_metadata_add(metadata_domain, __itt_null, string_handle_sngl,
+                     __itt_metadata_u64, 2, position);
+#endif
+} // __kmp_itt_metadata_single
+
+/* -----------------------------------------------------------------------------
+   No-op hook for "thread is about to enter the parallel region body".
+   Nothing is reported to ITT: the "Parallel region reporting" comment earlier
+   in this file states that logging region_starting in each thread costs too
+   much. gtid is unused. */
+LINKAGE void __kmp_itt_region_starting(int gtid) {
+#if USE_ITT_NOTIFY
+#endif
+} // __kmp_itt_region_starting
+
+/* -----------------------------------------------------------------------------
+   No-op hook for "thread has returned from the parallel region body".
+   Nothing is reported to ITT: the "Parallel region reporting" comment earlier
+   in this file states that logging region_finished in each thread costs too
+   much. gtid is unused. */
+LINKAGE void __kmp_itt_region_finished(int gtid) {
+#if USE_ITT_NOTIFY
+#endif
+} // __kmp_itt_region_finished
+
+// ----------------------------------------------------------------------------
+// Ends the ITT frame for the region domain cached in the calling thread's
+// source location (low two bytes of loc->reserved_2, stored 1-based).
+// Does nothing for nested teams, for a missing location, for a location
+// whose reserved_2 is still zero, or for an index outside the domain table.
+LINKAGE void __kmp_itt_region_joined(int gtid) {
+#if USE_ITT_NOTIFY
+  kmp_team_t *team = __kmp_team_from_gtid(gtid);
+  if (team->t.t_active_level > 1) {
+    // The frame notifications are only supported for the outermost teams.
+    return;
+  }
+  ident_t *loc = __kmp_thread_from_gtid(gtid)->th.th_ident;
+  if (loc == NULL || loc->reserved_2 == 0) {
+    return;
+  }
+  unsigned int frm = (loc->reserved_2 & 0x0000FFFF) - 1;
+  if (frm >= KMP_MAX_FRAME_DOMAINS) {
+    return;
+  }
+  KMP_ITT_DEBUG_LOCK();
+  __itt_frame_end_v3(__kmp_itt_region_domains[frm], NULL);
+  KMP_ITT_DEBUG_PRINT("[frm end] gtid=%d, idx=%x, loc:%p\n", gtid,
+                      loc->reserved_2, loc);
+#endif
+} // __kmp_itt_region_joined
+
+/* Barriers reporting.
+
+   A barrier consists of two phases:
+   1. Gather -- master waits for arriving of all the worker threads; each
+      worker thread registers arrival and goes further.
+   2. Release -- each worker thread waits until master lets it go; master lets
+      worker threads go.
+
+   Function should be called by each thread:
+   * __kmp_itt_barrier_starting() -- before arriving to the gather phase.
+   * __kmp_itt_barrier_middle()   -- between gather and release phases.
+   * __kmp_itt_barrier_finished() -- after release phase.
+
+   Note: Call __kmp_itt_barrier_object() before call to
+   __kmp_itt_barrier_starting() and save result in local variable.
+   __kmp_itt_barrier_object(), being called too late (e. g. after gather phase)
+   would return itt sync object for the next barrier!
+
+   ITT needs an address (void *) to be specified as a sync object. OpenMP RTL
+   does not have barrier object or barrier data structure. Barrier is just a
+   counter in team and thread structures. We could use an address of team
+   structure as a barrier sync object, but ITT wants different objects for
+   different barriers (even within the same team). So let us use team address
+   as barrier sync object for the first barrier, then increase it by one for the
+   next barrier, and so on (but wrap it not to use addresses outside of team
+   structure).
+
+   __kmp_itt_barrier_object(gtid, bt, set_name, delta):
+   * gtid     -- global thread id; the thread and its team are looked up from it.
+   * bt       -- barrier type; selects the team counter and is folded into the
+                 returned address.
+   * set_name -- when non-zero, the object is also registered with
+                 __itt_sync_create() under a type name derived from bt and the
+                 location flags, with the location's psource as its name.
+   * delta    -- added to the barrier counter before the address is formed.
+   Returns NULL when USE_ITT_NOTIFY is off or the thread has no team. */
+
+void *__kmp_itt_barrier_object(int gtid, int bt, int set_name,
+                               int delta // 0 (current barrier) is default
+                               // value; specify -1 to get previous
+                               // barrier.
+                               ) {
+  void *object = NULL;
+#if USE_ITT_NOTIFY
+  kmp_info_t *thr = __kmp_thread_from_gtid(gtid);
+  kmp_team_t *team = thr->th.th_team;
+
+  // NOTE: If the function is called from __kmp_fork_barrier, team pointer can
+  // be NULL. This "if" helps to avoid crash. However, this is not complete
+  // solution, and reporting fork/join barriers to ITT should be revisited.
+
+  if (team != NULL) {
+    // Master thread increases b_arrived by KMP_BARRIER_STATE_BUMP each time.
+    // Divide b_arrived by KMP_BARRIER_STATE_BUMP to get plain barrier counter.
+    kmp_uint64 counter =
+        team->t.t_bar[bt].b_arrived / KMP_BARRIER_STATE_BUMP + delta;
+    // Now form the barrier id. Encode barrier type (bt) in barrier id too, so
+    // barriers of different types do not have the same ids.
+    // This condition is a must (we would have zero divide otherwise).
+    KMP_BUILD_ASSERT(sizeof(kmp_team_t) >= bs_last_barrier);
+    // Stronger condition: make sure we have room for at least two different
+    // ids (for each barrier type).
+    KMP_BUILD_ASSERT(sizeof(kmp_team_t) >= 2 * bs_last_barrier);
+    object = reinterpret_cast<void *>(
+        kmp_uintptr_t(team) +
+        counter % (sizeof(kmp_team_t) / bs_last_barrier) * bs_last_barrier +
+        bt);
+    KMP_ITT_DEBUG_LOCK();
+    KMP_ITT_DEBUG_PRINT("[bar obj] type=%d, counter=%lld, object=%p\n", bt,
+                        counter, object);
+
+    if (set_name) {
+      ident_t const *loc = NULL;
+      char const *src = NULL;
+      char const *type = "OMP Barrier";
+      switch (bt) {
+      case bs_plain_barrier: {
+        // For plain barrier compiler calls __kmpc_barrier() function, which
+        // saves location in thr->th.th_ident.
+        loc = thr->th.th_ident;
+        // Get the barrier type from flags provided by compiler.
+        kmp_int32 expl = 0;
+        kmp_uint32 impl = 0;
+        if (loc != NULL) {
+          src = loc->psource;
+          expl = (loc->flags & KMP_IDENT_BARRIER_EXPL) != 0;
+          impl = (loc->flags & KMP_IDENT_BARRIER_IMPL) != 0;
+        }
+        if (impl) {
+          switch (loc->flags & KMP_IDENT_BARRIER_IMPL_MASK) {
+          case KMP_IDENT_BARRIER_IMPL_FOR: {
+            type = "OMP For Barrier";
+          } break;
+          case KMP_IDENT_BARRIER_IMPL_SECTIONS: {
+            type = "OMP Sections Barrier";
+          } break;
+          case KMP_IDENT_BARRIER_IMPL_SINGLE: {
+            type = "OMP Single Barrier";
+          } break;
+          case KMP_IDENT_BARRIER_IMPL_WORKSHARE: {
+            type = "OMP Workshare Barrier";
+          } break;
+          default: {
+            type = "OMP Implicit Barrier";
+            KMP_DEBUG_ASSERT(0);
+          }
+          }
+        } else if (expl) {
+          type = "OMP Explicit Barrier";
+        }
+      } break;
+      case bs_forkjoin_barrier: {
+        // In case of fork/join barrier we cannot read thr->th.th_ident, because
+        // it contains location of last passed construct (while join barrier is
+        // not such one). Use th_ident of master thread instead --
+        // __kmp_join_call() called by the master thread saves location.
+        //
+        // AC: cannot read from master because __kmp_join_call may be not called
+        //    yet, so we read the location from team. This is the same location.
+        //    And team is valid at the enter to join barrier where this happens.
+        loc = team->t.t_ident;
+        if (loc != NULL) {
+          src = loc->psource;
+        }
+        type = "OMP Join Barrier";
+      } break;
+      }
+      KMP_ITT_DEBUG_LOCK();
+      __itt_sync_create(object, type, src, __itt_attr_barrier);
+      KMP_ITT_DEBUG_PRINT(
+          "[bar sta] scre( %p, \"%s\", \"%s\", __itt_attr_barrier )\n", object,
+          type, src);
+    }
+  }
+#endif
+  return object;
+} // __kmp_itt_barrier_object
+
+// -----------------------------------------------------------------------------
+// Called by every thread before it arrives at the gather phase.
+// A non-master thread first reports "releasing" on the barrier object; then
+// every thread, master included, reports "prepare" on it.
+void __kmp_itt_barrier_starting(int gtid, void *object) {
+#if USE_ITT_NOTIFY
+  const int is_worker = !KMP_MASTER_GTID(gtid);
+  if (is_worker) {
+    KMP_ITT_DEBUG_LOCK();
+    __itt_sync_releasing(object);
+    KMP_ITT_DEBUG_PRINT("[bar sta] srel( %p )\n", object);
+  }
+
+  KMP_ITT_DEBUG_LOCK();
+  __itt_sync_prepare(object);
+  KMP_ITT_DEBUG_PRINT("[bar sta] spre( %p )\n", object);
+#endif
+} // __kmp_itt_barrier_starting
+
+// -----------------------------------------------------------------------------
+// Called between the gather and release phases. Only the master thread
+// reports anything: "acquired" followed by "releasing" on the barrier
+// object. Other threads report nothing here.
+void __kmp_itt_barrier_middle(int gtid, void *object) {
+#if USE_ITT_NOTIFY
+  if (!KMP_MASTER_GTID(gtid)) {
+    return;
+  }
+
+  KMP_ITT_DEBUG_LOCK();
+  __itt_sync_acquired(object);
+  KMP_ITT_DEBUG_PRINT("[bar mid] sacq( %p )\n", object);
+
+  KMP_ITT_DEBUG_LOCK();
+  __itt_sync_releasing(object);
+  KMP_ITT_DEBUG_PRINT("[bar mid] srel( %p )\n", object);
+#endif
+} // __kmp_itt_barrier_middle
+
+// -----------------------------------------------------------------------------
+// Called after the release phase. Every thread except the master reports
+// "acquired" on the barrier object; the master reports nothing here.
+void __kmp_itt_barrier_finished(int gtid, void *object) {
+#if USE_ITT_NOTIFY
+  if (!KMP_MASTER_GTID(gtid)) {
+    KMP_ITT_DEBUG_LOCK();
+    __itt_sync_acquired(object);
+    KMP_ITT_DEBUG_PRINT("[bar end] sacq( %p )\n", object);
+  }
+#endif
+} // __kmp_itt_barrier_finished
+
+/* Taskwait reporting.
+   ITT needs an address (void *) to be specified as a sync object. The OpenMP
+   RTL does not have a taskwait structure, so we need to construct something. */
+
+// Builds the address used as the ITT sync object for a taskwait executed by
+// thread `gtid`. The address is the current task's kmp_taskdata_t pointer
+// offset by (td_taskwait_counter % sizeof(kmp_taskdata_t)). Returns NULL when
+// USE_ITT_NOTIFY is off or the sync-create entry point is not set.
+void *__kmp_itt_taskwait_object(int gtid) {
+  void *object = NULL;
+#if USE_ITT_NOTIFY
+  if (__itt_sync_create_ptr) {
+    kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
+    kmp_taskdata_t *current = thread->th.th_current_task;
+    // Offset derived from the taskwait counter, bounded by the structure size.
+    kmp_uintptr_t address = kmp_uintptr_t(current) +
+                            current->td_taskwait_counter %
+                                sizeof(kmp_taskdata_t);
+    object = reinterpret_cast<void *>(address);
+  }
+#endif
+  return object;
+} // __kmp_itt_taskwait_object
+
+// Registers `object` with ITT as an "OMP Taskwait" sync object and reports
+// that thread `gtid` is about to wait on it. The source string comes from the
+// current task's td_taskwait_ident, or is NULL when no ident is recorded.
+void __kmp_itt_taskwait_starting(int gtid, void *object) {
+#if USE_ITT_NOTIFY
+  kmp_taskdata_t *current = __kmp_thread_from_gtid(gtid)->th.th_current_task;
+  ident_t const *where = current->td_taskwait_ident;
+  char const *src = NULL;
+  if (where != NULL)
+    src = where->psource;
+  KMP_ITT_DEBUG_LOCK();
+  __itt_sync_create(object, "OMP Taskwait", src, 0);
+  KMP_ITT_DEBUG_PRINT("[twa sta] scre( %p, \"OMP Taskwait\", \"%s\", 0 )\n",
+                      object, src);
+  KMP_ITT_DEBUG_LOCK();
+  __itt_sync_prepare(object);
+  KMP_ITT_DEBUG_PRINT("[twa sta] spre( %p )\n", object);
+#endif
+} // __kmp_itt_taskwait_starting
+
+// Reports the end of a taskwait to ITT: "acquired" on `object`, followed by
+// destruction of the sync object. `gtid` is not used by the body.
+void __kmp_itt_taskwait_finished(int gtid, void *object) {
+#if USE_ITT_NOTIFY
+  KMP_ITT_DEBUG_LOCK();
+  __itt_sync_acquired(object);
+  KMP_ITT_DEBUG_PRINT("[twa end] sacq( %p )\n", object);
+  // The object is destroyed here; __kmp_itt_taskwait_starting creates it anew.
+  KMP_ITT_DEBUG_LOCK();
+  __itt_sync_destroy(object);
+  KMP_ITT_DEBUG_PRINT("[twa end] sdes( %p )\n", object);
+#endif
+} // __kmp_itt_taskwait_finished
+
+/* Task reporting.
+   Only tasks executed by a thread spinning at a barrier (or taskwait) are
+   reported. The sync object passed to the function must be the barrier or
+   taskwait the thread is waiting at. */
+
+// Reports to ITT that waiting on `object` is cancelled because the thread is
+// about to run a task. Nothing is reported for a NULL object.
+void __kmp_itt_task_starting(
+    void *object // ITT sync object: barrier or taskwait.
+    ) {
+#if USE_ITT_NOTIFY
+  if (object == NULL)
+    return;
+  KMP_ITT_DEBUG_LOCK();
+  __itt_sync_cancel(object);
+  KMP_ITT_DEBUG_PRINT("[tsk sta] scan( %p )\n", object);
+#endif
+} // __kmp_itt_task_starting
+
+// -----------------------------------------------------------------------------
+// Reports to ITT that the thread resumes waiting on `object` ("prepare")
+// after finishing a task. Unlike __kmp_itt_task_starting, `object` is not
+// checked against NULL here.
+void __kmp_itt_task_finished(
+    void *object // ITT sync object: barrier or taskwait.
+    ) {
+#if USE_ITT_NOTIFY
+  KMP_ITT_DEBUG_LOCK();
+  __itt_sync_prepare(object);
+  KMP_ITT_DEBUG_PRINT("[tsk end] spre( %p )\n", object);
+#endif
+} // __kmp_itt_task_finished
+
+/* Lock reporting.
+ * __kmp_itt_lock_creating( lock ) should be called *before* the first lock
+   operation (set/unset). It is not a real event shown to the user but just
+   setting a name for synchronization object. `lock' is an address of sync
+   object, the same address should be used in all subsequent calls.
+ * __kmp_itt_lock_acquiring() should be called before setting the lock.
+ * __kmp_itt_lock_acquired() should be called after setting the lock.
+ * __kmp_itt_lock_releasing() should be called before unsetting the lock.
+ * __kmp_itt_lock_cancelled() should be called after thread cancelled waiting
+   for the lock.
+ * __kmp_itt_lock_destroyed( lock ) should be called after the last lock
+   operation. After __kmp_itt_lock_destroyed() all the references to the same
+   address will be considered as another sync object, not related with the
+   original one.  */
+
+#if KMP_USE_DYNAMIC_LOCK
+// Registers `lock` with ITT as a sync object of the given `type`. The source
+// location is taken directly from `loc` (NULL is allowed).
+__kmp_inline void ___kmp_itt_lock_init(kmp_user_lock_p lock, char const *type,
+                                       const ident_t *loc) {
+#if USE_ITT_NOTIFY
+  if (!__itt_sync_create_ptr)
+    return;
+  char const *src = NULL;
+  if (loc != NULL)
+    src = loc->psource;
+  KMP_ITT_DEBUG_LOCK();
+  __itt_sync_create(lock, type, src, 0);
+  KMP_ITT_DEBUG_PRINT("[lck ini] scre( %p, \"%s\", \"%s\", 0 )\n", lock, type,
+                      src);
+#endif
+}
+#else // KMP_USE_DYNAMIC_LOCK
+// Internal guts -- common code for locks and critical sections, do not call
+// directly. Registers `lock` with ITT as a sync object of the given `type`;
+// the source location is obtained through __kmp_get_user_lock_location_ when
+// that hook is set.
+__kmp_inline void ___kmp_itt_lock_init(kmp_user_lock_p lock, char const *type) {
+#if USE_ITT_NOTIFY
+  if (!__itt_sync_create_ptr)
+    return;
+  ident_t const *loc = NULL;
+  if (__kmp_get_user_lock_location_ != NULL)
+    loc = __kmp_get_user_lock_location_((lock));
+  char const *src = NULL;
+  if (loc != NULL)
+    src = loc->psource;
+  KMP_ITT_DEBUG_LOCK();
+  __itt_sync_create(lock, type, src, 0);
+  KMP_ITT_DEBUG_PRINT("[lck ini] scre( %p, \"%s\", \"%s\", 0 )\n", lock, type,
+                      src);
+#endif
+} // ___kmp_itt_lock_init
+#endif // KMP_USE_DYNAMIC_LOCK
+
+// Internal guts -- common code for locks and critical sections, do not call
+// directly.
+// Tells ITT that the sync object at address `lock` is destroyed.
+// `type` is accepted for symmetry with ___kmp_itt_lock_init but is not used
+// by the body.
+__kmp_inline void ___kmp_itt_lock_fini(kmp_user_lock_p lock, char const *type) {
+#if USE_ITT_NOTIFY
+  KMP_ITT_DEBUG_LOCK();
+  __itt_sync_destroy(lock);
+  KMP_ITT_DEBUG_PRINT("[lck dst] sdes( %p )\n", lock);
+#endif
+} // ___kmp_itt_lock_fini
+
+// -----------------------------------------------------------------------------
+#if KMP_USE_DYNAMIC_LOCK
+// Registers `lock` with ITT under the name "OMP Lock", using `loc` as the
+// source location.
+void __kmp_itt_lock_creating(kmp_user_lock_p lock, const ident_t *loc) {
+  ___kmp_itt_lock_init(lock, "OMP Lock", loc);
+}
+#else
+// Registers `lock` with ITT under the name "OMP Lock".
+void __kmp_itt_lock_creating(kmp_user_lock_p lock) {
+  ___kmp_itt_lock_init(lock, "OMP Lock");
+} // __kmp_itt_lock_creating
+#endif
+
+// Reports to ITT that the calling thread is about to wait for `lock`.
+// With dynamic locks, a zero KMP_EXTRACT_D_TAG(lock) selects the indirect
+// path: the event is then reported on the lock object found through
+// KMP_LOOKUP_I_LOCK instead of on `lock` itself.
+void __kmp_itt_lock_acquiring(kmp_user_lock_p lock) {
+#if KMP_USE_DYNAMIC_LOCK && USE_ITT_NOTIFY
+  // Postpone lock object access: skip the lookup when the event is unused.
+  if (!__itt_sync_prepare_ptr)
+    return;
+  kmp_user_lock_p sync_obj = lock;
+  if (KMP_EXTRACT_D_TAG(lock) == 0) {
+    kmp_indirect_lock_t *indirect = KMP_LOOKUP_I_LOCK(lock);
+    sync_obj = indirect->lock;
+  }
+  __itt_sync_prepare(sync_obj);
+#else
+  __itt_sync_prepare(lock);
+#endif
+} // __kmp_itt_lock_acquiring
+
+// Reports to ITT that the calling thread now holds `lock`.
+// With dynamic locks, a zero KMP_EXTRACT_D_TAG(lock) selects the indirect
+// path: the event is then reported on the lock object found through
+// KMP_LOOKUP_I_LOCK instead of on `lock` itself.
+void __kmp_itt_lock_acquired(kmp_user_lock_p lock) {
+#if KMP_USE_DYNAMIC_LOCK && USE_ITT_NOTIFY
+  // Postpone lock object access: skip the lookup when the event is unused.
+  if (!__itt_sync_acquired_ptr)
+    return;
+  kmp_user_lock_p sync_obj = lock;
+  if (KMP_EXTRACT_D_TAG(lock) == 0) {
+    kmp_indirect_lock_t *indirect = KMP_LOOKUP_I_LOCK(lock);
+    sync_obj = indirect->lock;
+  }
+  __itt_sync_acquired(sync_obj);
+#else
+  __itt_sync_acquired(lock);
+#endif
+} // __kmp_itt_lock_acquired
+
+// Reports to ITT that the calling thread is about to release `lock`.
+// With dynamic locks, a zero KMP_EXTRACT_D_TAG(lock) selects the indirect
+// path: the event is then reported on the lock object found through
+// KMP_LOOKUP_I_LOCK instead of on `lock` itself.
+void __kmp_itt_lock_releasing(kmp_user_lock_p lock) {
+#if KMP_USE_DYNAMIC_LOCK && USE_ITT_NOTIFY
+  if (!__itt_sync_releasing_ptr)
+    return;
+  kmp_user_lock_p sync_obj = lock;
+  if (KMP_EXTRACT_D_TAG(lock) == 0) {
+    kmp_indirect_lock_t *indirect = KMP_LOOKUP_I_LOCK(lock);
+    sync_obj = indirect->lock;
+  }
+  __itt_sync_releasing(sync_obj);
+#else
+  __itt_sync_releasing(lock);
+#endif
+} // __kmp_itt_lock_releasing
+
+// Reports to ITT that the calling thread gave up waiting for `lock`.
+// With dynamic locks, a zero KMP_EXTRACT_D_TAG(lock) selects the indirect
+// path: the event is then reported on the lock object found through
+// KMP_LOOKUP_I_LOCK instead of on `lock` itself.
+void __kmp_itt_lock_cancelled(kmp_user_lock_p lock) {
+#if KMP_USE_DYNAMIC_LOCK && USE_ITT_NOTIFY
+  if (!__itt_sync_cancel_ptr)
+    return;
+  kmp_user_lock_p sync_obj = lock;
+  if (KMP_EXTRACT_D_TAG(lock) == 0) {
+    kmp_indirect_lock_t *indirect = KMP_LOOKUP_I_LOCK(lock);
+    sync_obj = indirect->lock;
+  }
+  __itt_sync_cancel(sync_obj);
+#else
+  __itt_sync_cancel(lock);
+#endif
+} // __kmp_itt_lock_cancelled
+
+// Tells ITT that the sync object for `lock` is destroyed (forwards to
+// ___kmp_itt_lock_fini).
+void __kmp_itt_lock_destroyed(kmp_user_lock_p lock) {
+  ___kmp_itt_lock_fini(lock, "OMP Lock");
+} // __kmp_itt_lock_destroyed
+
+/* Critical reporting.
+   Critical sections are treated exactly as locks (but have different object
+   type). */
+#if KMP_USE_DYNAMIC_LOCK
+// Registers `lock` with ITT under the name "OMP Critical", using `loc` as the
+// source location.
+void __kmp_itt_critical_creating(kmp_user_lock_p lock, const ident_t *loc) {
+  ___kmp_itt_lock_init(lock, "OMP Critical", loc);
+}
+#else
+// Registers `lock` with ITT under the name "OMP Critical".
+void __kmp_itt_critical_creating(kmp_user_lock_p lock) {
+  ___kmp_itt_lock_init(lock, "OMP Critical");
+} // __kmp_itt_critical_creating
+#endif
+
+// Reports "prepare" on `lock`: the thread is about to wait for the critical
+// section's lock. The event is reported directly on `lock`.
+void __kmp_itt_critical_acquiring(kmp_user_lock_p lock) {
+  __itt_sync_prepare(lock);
+} // __kmp_itt_critical_acquiring
+
+// Reports "acquired" on `lock`: the thread has entered the critical section.
+void __kmp_itt_critical_acquired(kmp_user_lock_p lock) {
+  __itt_sync_acquired(lock);
+} // __kmp_itt_critical_acquired
+
+// Reports "releasing" on `lock`: the thread is about to leave the critical
+// section.
+void __kmp_itt_critical_releasing(kmp_user_lock_p lock) {
+  __itt_sync_releasing(lock);
+} // __kmp_itt_critical_releasing
+
+// Tells ITT that the sync object for the critical section's `lock` is
+// destroyed (forwards to ___kmp_itt_lock_fini).
+void __kmp_itt_critical_destroyed(kmp_user_lock_p lock) {
+  ___kmp_itt_lock_fini(lock, "OMP Critical");
+} // __kmp_itt_critical_destroyed
+
+/* Single reporting. */
+
+// Creates and sets the ITT mark for a `single` construct entered by thread
+// `gtid`. The mark is named "OMP Single-<psource>", using the source location
+// recorded in the thread's th_ident, and its handle is stored in
+// th_itt_mark_single. Does nothing unless the mark-create entry point is
+// available or KMP_ITT_DEBUG is set.
+void __kmp_itt_single_start(int gtid) {
+#if USE_ITT_NOTIFY
+  if (__itt_mark_create_ptr || KMP_ITT_DEBUG) {
+    kmp_info_t *thr = __kmp_thread_from_gtid((gtid));
+    ident_t *loc = thr->th.th_ident;
+    // Passing a null pointer to a "%s" conversion is undefined behavior, so
+    // use an explicit placeholder when no source string is available.
+    char const *src = "(null)";
+    if (loc != NULL && loc->psource != NULL)
+      src = loc->psource;
+    kmp_str_buf_t name;
+    __kmp_str_buf_init(&name);
+    __kmp_str_buf_print(&name, "OMP Single-%s", src);
+    KMP_ITT_DEBUG_LOCK();
+    thr->th.th_itt_mark_single = __itt_mark_create(name.str);
+    KMP_ITT_DEBUG_PRINT("[sin sta] mcre( \"%s\") -> %d\n", name.str,
+                        thr->th.th_itt_mark_single);
+    // The name buffer is only needed while the mark is being created.
+    __kmp_str_buf_free(&name);
+    KMP_ITT_DEBUG_LOCK();
+    __itt_mark(thr->th.th_itt_mark_single, NULL);
+    KMP_ITT_DEBUG_PRINT("[sin sta] mark( %d, NULL )\n",
+                        thr->th.th_itt_mark_single);
+  }
+#endif
+} // __kmp_itt_single_start
+
+// Switches off the ITT mark stored in th_itt_mark_single of thread `gtid`.
+void __kmp_itt_single_end(int gtid) {
+#if USE_ITT_NOTIFY
+  kmp_info_t *thr = __kmp_thread_from_gtid(gtid);
+  __itt_mark_type mark = thr->th.th_itt_mark_single;
+  KMP_ITT_DEBUG_LOCK();
+  __itt_mark_off(mark);
+  KMP_ITT_DEBUG_PRINT("[sin end] moff( %d )\n", mark);
+#endif
+} // __kmp_itt_single_end
+
+/* Ordered reporting.
+ * __kmp_itt_ordered_init is called by each thread *before* first using sync
+   object. ITT team would like it to be called once, but it requires extra
+   synchronization.
+ * __kmp_itt_ordered_prep is called when thread is going to enter ordered
+   section (before synchronization).
+ * __kmp_itt_ordered_start is called just before entering user code (after
+   synchronization).
+ * __kmp_itt_ordered_end is called after returning from user code.
+
+ Sync object is th->th.th_dispatch->th_dispatch_sh_current.
+ Events are not generated in case of serialized team. */
+
+// Registers the thread's th_dispatch_sh_current with ITT as an "OMP Ordered"
+// sync object, using the source string from th_ident when one is recorded.
+void __kmp_itt_ordered_init(int gtid) {
+#if USE_ITT_NOTIFY
+  if (!__itt_sync_create_ptr)
+    return;
+  kmp_info_t *thr = __kmp_thread_from_gtid(gtid);
+  ident_t const *loc = thr->th.th_ident;
+  char const *src = NULL;
+  if (loc != NULL)
+    src = loc->psource;
+  __itt_sync_create(thr->th.th_dispatch->th_dispatch_sh_current,
+                    "OMP Ordered", src, 0);
+#endif
+} // __kmp_itt_ordered_init
+
+// Reports "prepare" on the thread's th_dispatch_sh_current before it
+// synchronizes for an ordered section. Skipped when the team is serialized.
+void __kmp_itt_ordered_prep(int gtid) {
+#if USE_ITT_NOTIFY
+  if (!__itt_sync_create_ptr)
+    return;
+  kmp_team_t *team = __kmp_team_from_gtid(gtid);
+  if (team->t.t_serialized)
+    return; // no events for a serialized team
+  kmp_info_t *thr = __kmp_thread_from_gtid(gtid);
+  __itt_sync_prepare(thr->th.th_dispatch->th_dispatch_sh_current);
+#endif
+} // __kmp_itt_ordered_prep
+
+// Reports "acquired" on the thread's th_dispatch_sh_current just before the
+// ordered section's user code runs. Skipped when the team is serialized.
+void __kmp_itt_ordered_start(int gtid) {
+#if USE_ITT_NOTIFY
+  if (!__itt_sync_create_ptr)
+    return;
+  kmp_team_t *team = __kmp_team_from_gtid(gtid);
+  if (team->t.t_serialized)
+    return; // no events for a serialized team
+  kmp_info_t *thr = __kmp_thread_from_gtid(gtid);
+  __itt_sync_acquired(thr->th.th_dispatch->th_dispatch_sh_current);
+#endif
+} // __kmp_itt_ordered_start
+
+// Reports "releasing" on the thread's th_dispatch_sh_current after the
+// ordered section's user code returns. Skipped when the team is serialized.
+void __kmp_itt_ordered_end(int gtid) {
+#if USE_ITT_NOTIFY
+  if (!__itt_sync_create_ptr)
+    return;
+  kmp_team_t *team = __kmp_team_from_gtid(gtid);
+  if (team->t.t_serialized)
+    return; // no events for a serialized team
+  kmp_info_t *thr = __kmp_thread_from_gtid(gtid);
+  __itt_sync_releasing(thr->th.th_dispatch->th_dispatch_sh_current);
+#endif
+} // __kmp_itt_ordered_end
+
+/* Threads reporting. */
+
+// Forwards to __itt_thr_ignore() for the calling thread.
+// NOTE(review): unlike its neighbours this body is not wrapped in
+// USE_ITT_NOTIFY; presumably the ITT macro is a no-op when notification is
+// disabled -- confirm against the ITT headers.
+void __kmp_itt_thread_ignore() {
+  __itt_thr_ignore();
+} // __kmp_itt_thread_ignore
+
+// Gives the calling thread an ITT name: "OMP Master Thread #<gtid>" when
+// KMP_MASTER_GTID(gtid) holds, "OMP Worker Thread #<gtid>" otherwise.
+void __kmp_itt_thread_name(int gtid) {
+#if USE_ITT_NOTIFY
+  if (!__itt_thr_name_set_ptr)
+    return;
+  kmp_str_buf_t name;
+  __kmp_str_buf_init(&name);
+  if (!KMP_MASTER_GTID(gtid)) {
+    __kmp_str_buf_print(&name, "OMP Worker Thread #%d", gtid);
+  } else {
+    __kmp_str_buf_print(&name, "OMP Master Thread #%d", gtid);
+  }
+  KMP_ITT_DEBUG_LOCK();
+  __itt_thr_name_set(name.str, name.used);
+  KMP_ITT_DEBUG_PRINT("[thr nam] name( \"%s\")\n", name.str);
+  __kmp_str_buf_free(&name);
+#endif
+} // __kmp_itt_thread_name
+
+/* System object reporting.
+   ITT catches operations with system sync objects (like Windows* OS on IA-32
+   architecture API critical sections and events). We only need to specify
+   name ("OMP Scheduler") for the object to let ITT know it is an object used
+   by OpenMP RTL for internal purposes. */
+
+// Registers the system sync object at `object` with ITT. The object type is
+// always "OMP Scheduler"; `name` is passed through as the object's name.
+void __kmp_itt_system_object_created(void *object, char const *name) {
+#if USE_ITT_NOTIFY
+  KMP_ITT_DEBUG_LOCK();
+  __itt_sync_create(object, "OMP Scheduler", name, 0);
+  KMP_ITT_DEBUG_PRINT("[sys obj] scre( %p, \"OMP Scheduler\", \"%s\", 0 )\n",
+                      object, name);
+#endif
+} // __kmp_itt_system_object_created
+
+/* Stack stitching API.
+   Master calls "create" and puts the stitching id into the team structure.
+   Workers read the stitching id and call the "enter" / "leave" API.
+   Master calls "destroy" at the end of the parallel region. */
+
+// Creates an ITT stack-stitching caller id. Returns NULL when USE_ITT_NOTIFY
+// is off or the corresponding ITT entry point is not set.
+__itt_caller __kmp_itt_stack_caller_create() {
+  __itt_caller id = NULL;
+#if USE_ITT_NOTIFY
+  if (__itt_stack_caller_create_ptr) {
+    KMP_ITT_DEBUG_LOCK();
+    id = __itt_stack_caller_create();
+    KMP_ITT_DEBUG_PRINT("[stk cre] %p\n", id);
+  }
+#endif
+  return id;
+}
+
+// Destroys the stack-stitching caller id `id`, provided the ITT entry point
+// is set.
+void __kmp_itt_stack_caller_destroy(__itt_caller id) {
+#if USE_ITT_NOTIFY
+  if (!__itt_stack_caller_destroy_ptr)
+    return;
+  KMP_ITT_DEBUG_LOCK();
+  __itt_stack_caller_destroy(id);
+  KMP_ITT_DEBUG_PRINT("[stk des] %p\n", id);
+#endif
+}
+
+// Reports a stack-stitching "callee enter" for caller id `id`, provided the
+// ITT entry point is set.
+void __kmp_itt_stack_callee_enter(__itt_caller id) {
+#if USE_ITT_NOTIFY
+  if (!__itt_stack_callee_enter_ptr)
+    return;
+  KMP_ITT_DEBUG_LOCK();
+  __itt_stack_callee_enter(id);
+  KMP_ITT_DEBUG_PRINT("[stk ent] %p\n", id);
+#endif
+}
+
+// Reports a stack-stitching "callee leave" for caller id `id`, provided the
+// ITT entry point is set.
+void __kmp_itt_stack_callee_leave(__itt_caller id) {
+#if USE_ITT_NOTIFY
+  if (!__itt_stack_callee_leave_ptr)
+    return;
+  KMP_ITT_DEBUG_LOCK();
+  __itt_stack_callee_leave(id);
+  KMP_ITT_DEBUG_PRINT("[stk lea] %p\n", id);
+#endif
+}
+
+#endif /* USE_ITT_BUILD */
diff --git a/final/runtime/src/kmp_lock.cpp b/final/runtime/src/kmp_lock.cpp
new file mode 100644
index 0000000..78d63c6
--- /dev/null
+++ b/final/runtime/src/kmp_lock.cpp
@@ -0,0 +1,3942 @@
+/*
+ * kmp_lock.cpp -- lock-related functions
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <stddef.h>
+#include <atomic>
+
+#include "kmp.h"
+#include "kmp_i18n.h"
+#include "kmp_io.h"
+#include "kmp_itt.h"
+#include "kmp_lock.h"
+#include "kmp_wait_release.h"
+#include "kmp_wrapper_getpid.h"
+
+#include "tsan_annotations.h"
+
+#if KMP_USE_FUTEX
+#include <sys/syscall.h>
+#include <unistd.h>
+// We should really include <futex.h>, but that causes compatibility problems on
+// different Linux* OS distributions that either require that you include (or
+// break when you try to include) <pci/types.h>. Since all we need is the two
+// macros below (which are part of the kernel ABI, so can't change) we just
+// define the constants here and don't include <futex.h>
+#ifndef FUTEX_WAIT
+#define FUTEX_WAIT 0
+#endif
+#ifndef FUTEX_WAKE
+#define FUTEX_WAKE 1
+#endif
+#endif
+
+/* Implement spin locks for internal library use.             */
+/* The algorithm implemented is Lamport's bakery lock [1974]. */
+
+// Sanity checks the lock code relies on: 32-bit unsigned subtraction must
+// give the right difference across wrap-around, and tail_id must sit at an
+// 8-byte-aligned offset inside kmp_base_queuing_lock.
+void __kmp_validate_locks(void) {
+  /* Check that unsigned arithmetic wraps properly: start just below the
+     maximum value so that both counters overflow during the loop. */
+  kmp_uint32 x = ~((kmp_uint32)0) - 2;
+  kmp_uint32 y = x - 2;
+
+  int i = 0;
+  while (i < 8) {
+    kmp_uint32 z = (x - y);
+    KMP_ASSERT(z == 2);
+    ++i;
+    ++x;
+    ++y;
+  }
+
+  KMP_ASSERT(offsetof(kmp_base_queuing_lock, tail_id) % 8 == 0);
+}
+
+/* ------------------------------------------------------------------------ */
+/* test and set locks */
+
+// For the non-nested locks, we can only assume that the first 4 bytes were
+// allocated, since gcc only allocates 4 bytes for omp_lock_t, and the Intel
+// compiler only allocates a 4 byte pointer on IA-32 architecture.  On
+// Windows* OS on Intel(R) 64, we can assume that all 8 bytes were allocated.
+//
+// gcc reserves >= 8 bytes for nested locks, so we can assume that the
+// entire 8 bytes were allocated for nested locks on all 64-bit platforms.
+
+// Returns the gtid encoded in the lock word (stored as gtid + 1), or -1 when
+// the stripped word is zero. The word is read with a relaxed atomic load.
+static kmp_int32 __kmp_get_tas_lock_owner(kmp_tas_lock_t *lck) {
+  kmp_int32 owner_plus_one = KMP_LOCK_STRIP(KMP_ATOMIC_LD_RLX(&lck->lk.poll));
+  return owner_plus_one - 1;
+}
+
+// True when the lock's depth_locked field is anything other than -1.
+static inline bool __kmp_is_tas_lock_nestable(kmp_tas_lock_t *lck) {
+  return lck->lk.depth_locked != -1;
+}
+
+// Acquires the test-and-set lock `lck` for thread `gtid`, spinning until the
+// lock word can be switched from KMP_LOCK_FREE(tas) to
+// KMP_LOCK_BUSY(gtid + 1, tas) with an acquire compare-and-store.
+// Always returns KMP_LOCK_ACQUIRED_FIRST.
+__forceinline static int
+__kmp_acquire_tas_lock_timed_template(kmp_tas_lock_t *lck, kmp_int32 gtid) {
+  KMP_MB();
+
+#ifdef USE_LOCK_PROFILE
+  // Profiling aid: report when the lock is currently held by another thread.
+  kmp_uint32 curr = KMP_LOCK_STRIP(lck->lk.poll);
+  if ((curr != 0) && (curr != gtid + 1))
+    __kmp_printf("LOCK CONTENTION: %p\n", lck);
+/* else __kmp_printf( "." );*/
+#endif /* USE_LOCK_PROFILE */
+
+  kmp_int32 tas_free = KMP_LOCK_FREE(tas);
+  kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);
+
+  // Fast path: one attempt before any spin/backoff state is set up. The
+  // compare-and-store is only tried when a relaxed load sees the lock free.
+  if (KMP_ATOMIC_LD_RLX(&lck->lk.poll) == tas_free &&
+      __kmp_atomic_compare_store_acq(&lck->lk.poll, tas_free, tas_busy)) {
+    KMP_FSYNC_ACQUIRED(lck);
+    return KMP_LOCK_ACQUIRED_FIRST;
+  }
+
+  // Contended path: back off, then yield or spin, between attempts.
+  kmp_uint32 spins;
+  KMP_FSYNC_PREPARE(lck);
+  KMP_INIT_YIELD(spins);
+  // Local copy, so backoff progress is private to this acquisition.
+  kmp_backoff_t backoff = __kmp_spin_backoff_params;
+  do {
+    __kmp_spin_backoff(&backoff);
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+  } while (KMP_ATOMIC_LD_RLX(&lck->lk.poll) != tas_free ||
+           !__kmp_atomic_compare_store_acq(&lck->lk.poll, tas_free, tas_busy));
+  KMP_FSYNC_ACQUIRED(lck);
+  return KMP_LOCK_ACQUIRED_FIRST;
+}
+
+// Acquires the (non-nested) test-and-set lock for `gtid`, records the
+// acquisition via ANNOTATE_TAS_ACQUIRED, and returns the template's result.
+int __kmp_acquire_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
+  const int status = __kmp_acquire_tas_lock_timed_template(lck, gtid);
+  ANNOTATE_TAS_ACQUIRED(lck);
+  return status;
+}
+
+// Checked variant of __kmp_acquire_tas_lock ("omp_set_lock"). Raises a fatal
+// error when a nestable lock is used as a simple one or when `gtid` already
+// owns the lock; otherwise acquires it.
+static int __kmp_acquire_tas_lock_with_checks(kmp_tas_lock_t *lck,
+                                              kmp_int32 gtid) {
+  char const *const api_name = "omp_set_lock";
+  // depth_locked is only inspected when the whole lock structure fits into
+  // OMP_LOCK_T_SIZE.
+  if (sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) {
+    if (__kmp_is_tas_lock_nestable(lck)) {
+      KMP_FATAL(LockNestableUsedAsSimple, api_name);
+    }
+  }
+  if (gtid >= 0 && __kmp_get_tas_lock_owner(lck) == gtid) {
+    KMP_FATAL(LockIsAlreadyOwned, api_name);
+  }
+  return __kmp_acquire_tas_lock(lck, gtid);
+}
+
+// Makes a single, non-blocking attempt to take the test-and-set lock for
+// `gtid`. Returns TRUE when the lock was obtained, FALSE otherwise.
+int __kmp_test_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
+  kmp_int32 tas_free = KMP_LOCK_FREE(tas);
+  kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);
+  // The compare-and-store is attempted only if the relaxed load sees "free".
+  if (KMP_ATOMIC_LD_RLX(&lck->lk.poll) != tas_free ||
+      !__kmp_atomic_compare_store_acq(&lck->lk.poll, tas_free, tas_busy)) {
+    return FALSE;
+  }
+  KMP_FSYNC_ACQUIRED(lck);
+  return TRUE;
+}
+
+// Checked variant of __kmp_test_tas_lock ("omp_test_lock"). Raises a fatal
+// error when a nestable lock is used as a simple one.
+static int __kmp_test_tas_lock_with_checks(kmp_tas_lock_t *lck,
+                                           kmp_int32 gtid) {
+  char const *const api_name = "omp_test_lock";
+  if (sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) {
+    if (__kmp_is_tas_lock_nestable(lck)) {
+      KMP_FATAL(LockNestableUsedAsSimple, api_name);
+    }
+  }
+  return __kmp_test_tas_lock(lck, gtid);
+}
+
+// Releases the test-and-set lock by storing KMP_LOCK_FREE(tas) into the lock
+// word with release ordering, then yields if KMP_YIELD_OVERSUB decides to.
+// `gtid` is not used by the body. Always returns KMP_LOCK_RELEASED.
+int __kmp_release_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+  // Tool notifications are issued before the lock word is actually cleared.
+  KMP_FSYNC_RELEASING(lck);
+  ANNOTATE_TAS_RELEASED(lck);
+  KMP_ATOMIC_ST_REL(&lck->lk.poll, KMP_LOCK_FREE(tas));
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+  KMP_YIELD_OVERSUB();
+  return KMP_LOCK_RELEASED;
+}
+
+// Checked variant of __kmp_release_tas_lock ("omp_unset_lock"). Raises a
+// fatal error when a nestable lock is used as a simple one, when the lock is
+// not held, or when it is held by a thread other than `gtid`.
+static int __kmp_release_tas_lock_with_checks(kmp_tas_lock_t *lck,
+                                              kmp_int32 gtid) {
+  char const *const api_name = "omp_unset_lock";
+  KMP_MB(); /* in case another processor initialized lock */
+  if (sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) {
+    if (__kmp_is_tas_lock_nestable(lck)) {
+      KMP_FATAL(LockNestableUsedAsSimple, api_name);
+    }
+  }
+  if (__kmp_get_tas_lock_owner(lck) == -1) {
+    KMP_FATAL(LockUnsettingFree, api_name);
+  }
+  // The ownership test applies only to a known caller (gtid >= 0) and a
+  // lock that currently has an owner.
+  if (gtid >= 0 && __kmp_get_tas_lock_owner(lck) >= 0 &&
+      __kmp_get_tas_lock_owner(lck) != gtid) {
+    KMP_FATAL(LockUnsettingSetByAnother, api_name);
+  }
+  return __kmp_release_tas_lock(lck, gtid);
+}
+
+// Initializes a test-and-set lock by setting its lock word to
+// KMP_LOCK_FREE(tas). depth_locked is not touched here.
+void __kmp_init_tas_lock(kmp_tas_lock_t *lck) {
+  lck->lk.poll = KMP_LOCK_FREE(tas);
+}
+
+// Destroys a test-and-set lock by zeroing its lock word.
+void __kmp_destroy_tas_lock(kmp_tas_lock_t *lck) {
+  lck->lk.poll = 0;
+}
+
+// Checked variant of __kmp_destroy_tas_lock ("omp_destroy_lock"). Raises a
+// fatal error when a nestable lock is used as a simple one or when the lock
+// is still owned.
+static void __kmp_destroy_tas_lock_with_checks(kmp_tas_lock_t *lck) {
+  char const *const api_name = "omp_destroy_lock";
+  if (sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) {
+    if (__kmp_is_tas_lock_nestable(lck)) {
+      KMP_FATAL(LockNestableUsedAsSimple, api_name);
+    }
+  }
+  if (__kmp_get_tas_lock_owner(lck) != -1) {
+    KMP_FATAL(LockStillOwned, api_name);
+  }
+  __kmp_destroy_tas_lock(lck);
+}
+
+// nested test and set locks
+
+// Acquires a nestable test-and-set lock. If `gtid` already owns the lock the
+// nesting depth is bumped and KMP_LOCK_ACQUIRED_NEXT is returned; otherwise
+// the lock is acquired, the depth is set to 1 and KMP_LOCK_ACQUIRED_FIRST is
+// returned.
+int __kmp_acquire_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
+  KMP_DEBUG_ASSERT(gtid >= 0);
+
+  if (__kmp_get_tas_lock_owner(lck) == gtid) {
+    // Re-entry by the owner: only the depth changes.
+    ++lck->lk.depth_locked;
+    return KMP_LOCK_ACQUIRED_NEXT;
+  }
+  __kmp_acquire_tas_lock_timed_template(lck, gtid);
+  ANNOTATE_TAS_ACQUIRED(lck);
+  lck->lk.depth_locked = 1;
+  return KMP_LOCK_ACQUIRED_FIRST;
+}
+
+// Checked variant of __kmp_acquire_nested_tas_lock ("omp_set_nest_lock").
+// Raises a fatal error when a simple lock is used as a nestable one.
+static int __kmp_acquire_nested_tas_lock_with_checks(kmp_tas_lock_t *lck,
+                                                     kmp_int32 gtid) {
+  char const *const api_name = "omp_set_nest_lock";
+  const bool nestable = __kmp_is_tas_lock_nestable(lck);
+  if (!nestable) {
+    KMP_FATAL(LockSimpleUsedAsNestable, api_name);
+  }
+  return __kmp_acquire_nested_tas_lock(lck, gtid);
+}
+
+// Non-blocking acquire of a nestable test-and-set lock. Returns the new
+// nesting depth on success (incremented for the current owner, 1 for a fresh
+// acquisition) and 0 when the lock could not be taken.
+int __kmp_test_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
+  KMP_DEBUG_ASSERT(gtid >= 0);
+
+  if (__kmp_get_tas_lock_owner(lck) == gtid) {
+    return ++lck->lk.depth_locked;
+  }
+  if (!__kmp_test_tas_lock(lck, gtid)) {
+    return 0;
+  }
+  KMP_MB();
+  lck->lk.depth_locked = 1;
+  return 1;
+}
+
+// Checked variant of __kmp_test_nested_tas_lock ("omp_test_nest_lock").
+// Raises a fatal error when a simple lock is used as a nestable one.
+static int __kmp_test_nested_tas_lock_with_checks(kmp_tas_lock_t *lck,
+                                                  kmp_int32 gtid) {
+  char const *const api_name = "omp_test_nest_lock";
+  const bool nestable = __kmp_is_tas_lock_nestable(lck);
+  if (!nestable) {
+    KMP_FATAL(LockSimpleUsedAsNestable, api_name);
+  }
+  return __kmp_test_nested_tas_lock(lck, gtid);
+}
+
+// Drops one nesting level of a nestable test-and-set lock. When the depth
+// reaches zero the underlying lock is released and KMP_LOCK_RELEASED is
+// returned; otherwise KMP_LOCK_STILL_HELD.
+int __kmp_release_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) {
+  KMP_DEBUG_ASSERT(gtid >= 0);
+
+  KMP_MB();
+  if (--(lck->lk.depth_locked) != 0) {
+    return KMP_LOCK_STILL_HELD;
+  }
+  __kmp_release_tas_lock(lck, gtid);
+  return KMP_LOCK_RELEASED;
+}
+
+// Checked variant of __kmp_release_nested_tas_lock ("omp_unset_nest_lock").
+// Raises a fatal error when a simple lock is used as a nestable one, when the
+// lock is not held, or when it is held by a thread other than `gtid`.
+static int __kmp_release_nested_tas_lock_with_checks(kmp_tas_lock_t *lck,
+                                                     kmp_int32 gtid) {
+  char const *const api_name = "omp_unset_nest_lock";
+  KMP_MB(); /* in case another processor initialized lock */
+  const bool nestable = __kmp_is_tas_lock_nestable(lck);
+  if (!nestable) {
+    KMP_FATAL(LockSimpleUsedAsNestable, api_name);
+  }
+  if (__kmp_get_tas_lock_owner(lck) == -1) {
+    KMP_FATAL(LockUnsettingFree, api_name);
+  }
+  if (__kmp_get_tas_lock_owner(lck) != gtid) {
+    KMP_FATAL(LockUnsettingSetByAnother, api_name);
+  }
+  return __kmp_release_nested_tas_lock(lck, gtid);
+}
+
+// Initializes a nestable test-and-set lock: the lock word is set up as for a
+// simple lock, then the nesting depth is reset to zero.
+void __kmp_init_nested_tas_lock(kmp_tas_lock_t *lck) {
+  __kmp_init_tas_lock(lck);
+  lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
+}
+
+// Destroys a nestable test-and-set lock: zeroes the lock word and resets the
+// nesting depth to zero.
+void __kmp_destroy_nested_tas_lock(kmp_tas_lock_t *lck) {
+  __kmp_destroy_tas_lock(lck);
+  lck->lk.depth_locked = 0;
+}
+
+// Checked variant of __kmp_destroy_nested_tas_lock
+// ("omp_destroy_nest_lock"). Raises a fatal error when a simple lock is used
+// as a nestable one or when the lock is still owned.
+static void __kmp_destroy_nested_tas_lock_with_checks(kmp_tas_lock_t *lck) {
+  char const *const api_name = "omp_destroy_nest_lock";
+  const bool nestable = __kmp_is_tas_lock_nestable(lck);
+  if (!nestable) {
+    KMP_FATAL(LockSimpleUsedAsNestable, api_name);
+  }
+  if (__kmp_get_tas_lock_owner(lck) != -1) {
+    KMP_FATAL(LockStillOwned, api_name);
+  }
+  __kmp_destroy_nested_tas_lock(lck);
+}
+
+#if KMP_USE_FUTEX
+
+/* ------------------------------------------------------------------------ */
+/* futex locks */
+
+// futex locks are really just test and set locks, with a different method
+// of handling contention.  They take the same amount of space as test and
+// set locks, and are allocated the same way (i.e. use the area allocated by
+// the compiler for non-nested locks / allocate nested locks on the heap).
+
+// Returns the gtid encoded in the futex lock word, or -1 when no owner is
+// encoded. Bit 0 of the word is shifted out before decoding; the owner is
+// stored as gtid + 1.
+static kmp_int32 __kmp_get_futex_lock_owner(kmp_futex_lock_t *lck) {
+  kmp_int32 owner_plus_one = KMP_LOCK_STRIP((TCR_4(lck->lk.poll) >> 1));
+  return owner_plus_one - 1;
+}
+
+// True when the lock's depth_locked field is anything other than -1.
+static inline bool __kmp_is_futex_lock_nestable(kmp_futex_lock_t *lck) {
+  return lck->lk.depth_locked != -1;
+}
+
+// Acquires the futex lock `lck` for thread `gtid`. The lock word holds
+// (gtid + 1) << 1 for the owner; bit 0 is used as a "wake-up needed" flag.
+// The loop tries to compare-and-store the word from KMP_LOCK_FREE(futex) to
+// this thread's code; on failure it makes sure bit 0 is set and then issues a
+// FUTEX_WAIT on the word. Always returns KMP_LOCK_ACQUIRED_FIRST.
+__forceinline static int
+__kmp_acquire_futex_lock_timed_template(kmp_futex_lock_t *lck, kmp_int32 gtid) {
+  // Value stored into the lock word when this thread becomes the owner.
+  kmp_int32 gtid_code = (gtid + 1) << 1;
+
+  KMP_MB();
+
+#ifdef USE_LOCK_PROFILE
+  // Profiling aid: report when the lock word is neither free nor ours.
+  kmp_uint32 curr = KMP_LOCK_STRIP(TCR_4(lck->lk.poll));
+  if ((curr != 0) && (curr != gtid_code))
+    __kmp_printf("LOCK CONTENTION: %p\n", lck);
+/* else __kmp_printf( "." );*/
+#endif /* USE_LOCK_PROFILE */
+
+  KMP_FSYNC_PREPARE(lck);
+  KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d entering\n",
+                  lck, lck->lk.poll, gtid));
+
+  kmp_int32 poll_val;
+
+  // Each iteration attempts free -> busy; poll_val receives the value seen in
+  // the lock word, so the loop ends once that value was "free".
+  while ((poll_val = KMP_COMPARE_AND_STORE_RET32(
+              &(lck->lk.poll), KMP_LOCK_FREE(futex),
+              KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {
+
+    // cond is non-zero when bit 0 is already set in the observed value.
+    kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;
+    KA_TRACE(
+        1000,
+        ("__kmp_acquire_futex_lock: lck:%p, T#%d poll_val = 0x%x cond = 0x%x\n",
+         lck, gtid, poll_val, cond));
+
+    // NOTE: if you try to use the following condition for this branch
+    //
+    // if ( poll_val & 1 == 0 )
+    //
+    // Then the 12.0 compiler has a bug where the following block will
+    // always be skipped, regardless of the value of the LSB of poll_val.
+    if (!cond) {
+      // Try to set the lsb in the poll to indicate to the owner
+      // thread that they need to wake this thread up.
+      if (!KMP_COMPARE_AND_STORE_REL32(&(lck->lk.poll), poll_val,
+                                       poll_val | KMP_LOCK_BUSY(1, futex))) {
+        KA_TRACE(
+            1000,
+            ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d can't set bit 0\n",
+             lck, lck->lk.poll, gtid));
+        // The word changed underneath us; start over with a fresh attempt.
+        continue;
+      }
+      // Keep the local copy in step with what was just stored, since it is
+      // passed to the futex wait below.
+      poll_val |= KMP_LOCK_BUSY(1, futex);
+
+      KA_TRACE(1000,
+               ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d bit 0 set\n", lck,
+                lck->lk.poll, gtid));
+    }
+
+    KA_TRACE(
+        1000,
+        ("__kmp_acquire_futex_lock: lck:%p, T#%d before futex_wait(0x%x)\n",
+         lck, gtid, poll_val));
+
+    // Wait on the lock word, passing poll_val as the value argument; a
+    // non-zero return code simply triggers another acquisition attempt.
+    kmp_int32 rc;
+    if ((rc = syscall(__NR_futex, &(lck->lk.poll), FUTEX_WAIT, poll_val, NULL,
+                      NULL, 0)) != 0) {
+      KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p, T#%d futex_wait(0x%x) "
+                      "failed (rc=%d errno=%d)\n",
+                      lck, gtid, poll_val, rc, errno));
+      continue;
+    }
+
+    KA_TRACE(1000,
+             ("__kmp_acquire_futex_lock: lck:%p, T#%d after futex_wait(0x%x)\n",
+              lck, gtid, poll_val));
+    // This thread has now done a successful futex wait call and was entered on
+    // the OS futex queue.  We must now perform a futex wake call when releasing
+    // the lock, as we have no idea how many other threads are in the queue.
+    gtid_code |= 1;
+  }
+
+  KMP_FSYNC_ACQUIRED(lck);
+  KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d exiting\n", lck,
+                  lck->lk.poll, gtid));
+  return KMP_LOCK_ACQUIRED_FIRST;
+}
+
+// Acquires the (non-nested) futex lock for `gtid`, records the acquisition
+// via ANNOTATE_FUTEX_ACQUIRED, and returns the template's result.
+int __kmp_acquire_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
+  const int status = __kmp_acquire_futex_lock_timed_template(lck, gtid);
+  ANNOTATE_FUTEX_ACQUIRED(lck);
+  return status;
+}
+
+// Checked variant of __kmp_acquire_futex_lock ("omp_set_lock"). Raises a
+// fatal error when a nestable lock is used as a simple one or when `gtid`
+// already owns the lock; otherwise acquires it.
+static int __kmp_acquire_futex_lock_with_checks(kmp_futex_lock_t *lck,
+                                                kmp_int32 gtid) {
+  char const *const api_name = "omp_set_lock";
+  // depth_locked is only inspected when the whole lock structure fits into
+  // OMP_LOCK_T_SIZE.
+  if (sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) {
+    if (__kmp_is_futex_lock_nestable(lck)) {
+      KMP_FATAL(LockNestableUsedAsSimple, api_name);
+    }
+  }
+  if (gtid >= 0 && __kmp_get_futex_lock_owner(lck) == gtid) {
+    KMP_FATAL(LockIsAlreadyOwned, api_name);
+  }
+  return __kmp_acquire_futex_lock(lck, gtid);
+}
+
+// Makes a single, non-blocking attempt to take the futex lock for `gtid` by
+// compare-and-storing the lock word from free to (gtid + 1) << 1.
+// Returns TRUE when the lock was obtained, FALSE otherwise.
+int __kmp_test_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
+  if (!KMP_COMPARE_AND_STORE_ACQ32(&(lck->lk.poll), KMP_LOCK_FREE(futex),
+                                   KMP_LOCK_BUSY((gtid + 1) << 1, futex))) {
+    return FALSE;
+  }
+  KMP_FSYNC_ACQUIRED(lck);
+  return TRUE;
+}
+
+// Checked variant of __kmp_test_futex_lock ("omp_test_lock"). Raises a fatal
+// error when a nestable lock is used as a simple one.
+static int __kmp_test_futex_lock_with_checks(kmp_futex_lock_t *lck,
+                                             kmp_int32 gtid) {
+  char const *const api_name = "omp_test_lock";
+  if (sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) {
+    if (__kmp_is_futex_lock_nestable(lck)) {
+      KMP_FATAL(LockNestableUsedAsSimple, api_name);
+    }
+  }
+  return __kmp_test_futex_lock(lck, gtid);
+}
+
+// Releases the futex lock: atomically exchanges the lock word with
+// KMP_LOCK_FREE(futex) and, if bit 0 was set in the previous value, issues a
+// FUTEX_WAKE on the word. Always returns KMP_LOCK_RELEASED.
+int __kmp_release_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+  KA_TRACE(1000, ("__kmp_release_futex_lock: lck:%p(0x%x), T#%d entering\n",
+                  lck, lck->lk.poll, gtid));
+
+  // Tool notifications are issued before the lock word is actually cleared.
+  KMP_FSYNC_RELEASING(lck);
+  ANNOTATE_FUTEX_RELEASED(lck);
+
+  // poll_val is the word as it was while the lock was still held.
+  kmp_int32 poll_val = KMP_XCHG_FIXED32(&(lck->lk.poll), KMP_LOCK_FREE(futex));
+
+  KA_TRACE(1000,
+           ("__kmp_release_futex_lock: lck:%p, T#%d released poll_val = 0x%x\n",
+            lck, gtid, poll_val));
+
+  // Bit 0 set means a wake-up was requested (set by the acquire path).
+  if (KMP_LOCK_STRIP(poll_val) & 1) {
+    KA_TRACE(1000,
+             ("__kmp_release_futex_lock: lck:%p, T#%d futex_wake 1 thread\n",
+              lck, gtid));
+    syscall(__NR_futex, &(lck->lk.poll), FUTEX_WAKE, KMP_LOCK_BUSY(1, futex),
+            NULL, NULL, 0);
+  }
+
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+  KA_TRACE(1000, ("__kmp_release_futex_lock: lck:%p(0x%x), T#%d exiting\n", lck,
+                  lck->lk.poll, gtid));
+
+  KMP_YIELD_OVERSUB();
+  return KMP_LOCK_RELEASED;
+}
+
+// Checked variant of __kmp_release_futex_lock ("omp_unset_lock"). Raises a
+// fatal error when a nestable lock is used as a simple one, when the lock is
+// not held, or when it is held by a thread other than `gtid`.
+static int __kmp_release_futex_lock_with_checks(kmp_futex_lock_t *lck,
+                                                kmp_int32 gtid) {
+  char const *const api_name = "omp_unset_lock";
+  KMP_MB(); /* in case another processor initialized lock */
+  if (sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) {
+    if (__kmp_is_futex_lock_nestable(lck)) {
+      KMP_FATAL(LockNestableUsedAsSimple, api_name);
+    }
+  }
+  if (__kmp_get_futex_lock_owner(lck) == -1) {
+    KMP_FATAL(LockUnsettingFree, api_name);
+  }
+  // The ownership test applies only to a known caller (gtid >= 0) and a
+  // lock that currently has an owner.
+  if (gtid >= 0 && __kmp_get_futex_lock_owner(lck) >= 0 &&
+      __kmp_get_futex_lock_owner(lck) != gtid) {
+    KMP_FATAL(LockUnsettingSetByAnother, api_name);
+  }
+  return __kmp_release_futex_lock(lck, gtid);
+}
+
+// Initializes a futex lock by writing KMP_LOCK_FREE(futex) into its lock
+// word. depth_locked is not touched here.
+void __kmp_init_futex_lock(kmp_futex_lock_t *lck) {
+  TCW_4(lck->lk.poll, KMP_LOCK_FREE(futex));
+}
+
+// Destroys a futex lock by zeroing its lock word.
+void __kmp_destroy_futex_lock(kmp_futex_lock_t *lck) {
+  lck->lk.poll = 0;
+}
+
+// Checked variant of __kmp_destroy_futex_lock ("omp_destroy_lock"). Raises a
+// fatal error when a nestable lock is used as a simple one or when the lock
+// is still owned.
+static void __kmp_destroy_futex_lock_with_checks(kmp_futex_lock_t *lck) {
+  char const *const api_name = "omp_destroy_lock";
+  if (sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) {
+    if (__kmp_is_futex_lock_nestable(lck)) {
+      KMP_FATAL(LockNestableUsedAsSimple, api_name);
+    }
+  }
+  if (__kmp_get_futex_lock_owner(lck) != -1) {
+    KMP_FATAL(LockStillOwned, api_name);
+  }
+  __kmp_destroy_futex_lock(lck);
+}
+
+// nested futex locks
+
+// Acquires a nestable futex lock. If `gtid` already owns the lock the nesting
+// depth is bumped and KMP_LOCK_ACQUIRED_NEXT is returned; otherwise the lock
+// is acquired, the depth is set to 1 and KMP_LOCK_ACQUIRED_FIRST is returned.
+int __kmp_acquire_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
+  KMP_DEBUG_ASSERT(gtid >= 0);
+
+  if (__kmp_get_futex_lock_owner(lck) == gtid) {
+    // Re-entry by the owner: only the depth changes.
+    ++lck->lk.depth_locked;
+    return KMP_LOCK_ACQUIRED_NEXT;
+  }
+  __kmp_acquire_futex_lock_timed_template(lck, gtid);
+  ANNOTATE_FUTEX_ACQUIRED(lck);
+  lck->lk.depth_locked = 1;
+  return KMP_LOCK_ACQUIRED_FIRST;
+}
+
+// Checked variant of __kmp_acquire_nested_futex_lock ("omp_set_nest_lock").
+// Raises a fatal error when a simple lock is used as a nestable one.
+static int __kmp_acquire_nested_futex_lock_with_checks(kmp_futex_lock_t *lck,
+                                                       kmp_int32 gtid) {
+  char const *const api_name = "omp_set_nest_lock";
+  const bool nestable = __kmp_is_futex_lock_nestable(lck);
+  if (!nestable) {
+    KMP_FATAL(LockSimpleUsedAsNestable, api_name);
+  }
+  return __kmp_acquire_nested_futex_lock(lck, gtid);
+}
+
+// Non-blocking acquire of a nestable futex lock. Returns the new nesting
+// depth when the lock is obtained (incremented depth for the current owner,
+// 1 for a fresh acquisition) and 0 when __kmp_test_futex_lock fails.
+int __kmp_test_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
+  KMP_DEBUG_ASSERT(gtid >= 0);
+
+  // Owner re-entering: just bump the depth.
+  if (__kmp_get_futex_lock_owner(lck) == gtid) {
+    return ++lck->lk.depth_locked;
+  }
+
+  // Someone else holds it (or the attempt lost a race).
+  if (!__kmp_test_futex_lock(lck, gtid)) {
+    return 0;
+  }
+
+  // Fresh acquisition.
+  KMP_MB();
+  return (lck->lk.depth_locked = 1);
+}
+
+// Checked entry for the nested futex test, reported as "omp_test_nest_lock".
+// Fatal with LockSimpleUsedAsNestable when the lock is not nestable.
+static int __kmp_test_nested_futex_lock_with_checks(kmp_futex_lock_t *lck,
+                                                    kmp_int32 gtid) {
+  char const *const api_name = "omp_test_nest_lock";
+  const bool nestable = __kmp_is_futex_lock_nestable(lck);
+  if (!nestable) {
+    KMP_FATAL(LockSimpleUsedAsNestable, api_name);
+  }
+  return __kmp_test_nested_futex_lock(lck, gtid);
+}
+
+// Drop one nesting level of a nestable futex lock. While the decremented
+// depth is still non-zero the lock stays held (KMP_LOCK_STILL_HELD); once it
+// reaches zero the underlying futex lock is released (KMP_LOCK_RELEASED).
+int __kmp_release_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
+  KMP_DEBUG_ASSERT(gtid >= 0);
+
+  KMP_MB();
+  if (--(lck->lk.depth_locked) != 0) {
+    return KMP_LOCK_STILL_HELD;
+  }
+
+  // Outermost level released: hand the real lock back.
+  __kmp_release_futex_lock(lck, gtid);
+  return KMP_LOCK_RELEASED;
+}
+
+// Checked entry for the nested futex release, reported as
+// "omp_unset_nest_lock". Fatal when the lock is not nestable, when the owner
+// query yields -1, or when the owner differs from gtid.
+static int __kmp_release_nested_futex_lock_with_checks(kmp_futex_lock_t *lck,
+                                                       kmp_int32 gtid) {
+  char const *const api_name = "omp_unset_nest_lock";
+  KMP_MB(); /* in case another processor initialized lock */
+  const bool nestable = __kmp_is_futex_lock_nestable(lck);
+  if (!nestable) {
+    KMP_FATAL(LockSimpleUsedAsNestable, api_name);
+  }
+  if (-1 == __kmp_get_futex_lock_owner(lck)) {
+    KMP_FATAL(LockUnsettingFree, api_name);
+  }
+  if (gtid != __kmp_get_futex_lock_owner(lck)) {
+    KMP_FATAL(LockUnsettingSetByAnother, api_name);
+  }
+  return __kmp_release_nested_futex_lock(lck, gtid);
+}
+
+// Initialize a nestable futex lock: run the simple-lock initializer, then set
+// the nesting depth to 0 to mark the lock as nestable and currently unheld.
+void __kmp_init_nested_futex_lock(kmp_futex_lock_t *lck) {
+  __kmp_init_futex_lock(lck);
+  lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
+}
+
+// Destroy a nestable futex lock: run the simple-lock teardown, then reset the
+// nesting depth to 0.
+void __kmp_destroy_nested_futex_lock(kmp_futex_lock_t *lck) {
+  __kmp_destroy_futex_lock(lck);
+  lck->lk.depth_locked = 0;
+}
+
+// Checked entry for the nested futex destroy, reported as
+// "omp_destroy_nest_lock". Fatal when the lock is not nestable or when the
+// owner query yields anything other than -1.
+static void __kmp_destroy_nested_futex_lock_with_checks(kmp_futex_lock_t *lck) {
+  char const *const api_name = "omp_destroy_nest_lock";
+  const bool nestable = __kmp_is_futex_lock_nestable(lck);
+  if (!nestable) {
+    KMP_FATAL(LockSimpleUsedAsNestable, api_name);
+  }
+  if (-1 != __kmp_get_futex_lock_owner(lck)) {
+    KMP_FATAL(LockStillOwned, api_name);
+  }
+  __kmp_destroy_nested_futex_lock(lck);
+}
+
+#endif // KMP_USE_FUTEX
+
+/* ------------------------------------------------------------------------ */
+/* ticket (bakery) locks */
+
+// Return the stored owner id minus one (relaxed load). The owner_id field
+// holds gtid + 1 while owned and 0 otherwise, so an unowned lock yields -1.
+static kmp_int32 __kmp_get_ticket_lock_owner(kmp_ticket_lock_t *lck) {
+  return lck->lk.owner_id.load(std::memory_order_relaxed) - 1;
+}
+
+// A ticket lock is nestable when its depth_locked field (relaxed load) is
+// anything other than the -1 marker used for simple locks.
+static inline bool __kmp_is_ticket_lock_nestable(kmp_ticket_lock_t *lck) {
+  return lck->lk.depth_locked.load(std::memory_order_relaxed) != -1;
+}
+
+// Wait predicate for ticket locks: acquire-load the counter that now_serving
+// points at and report (non-zero) whether it equals my_ticket.
+static kmp_uint32 __kmp_bakery_check(void *now_serving, kmp_uint32 my_ticket) {
+  std::atomic<unsigned> *serving_p =
+      static_cast<std::atomic<unsigned> *>(now_serving);
+  return serving_p->load(std::memory_order_acquire) == my_ticket;
+}
+
+// Core ticket-lock acquire. Draws a ticket from next_ticket, then returns as
+// soon as now_serving equals that ticket, waiting through KMP_WAIT_PTR with
+// __kmp_bakery_check when it does not match immediately. Always returns
+// KMP_LOCK_ACQUIRED_FIRST. gtid is not used here.
+__forceinline static int
+__kmp_acquire_ticket_lock_timed_template(kmp_ticket_lock_t *lck,
+                                         kmp_int32 gtid) {
+  const kmp_uint32 ticket =
+      lck->lk.next_ticket.fetch_add(1U, std::memory_order_relaxed);
+
+#ifdef USE_LOCK_PROFILE
+  if (lck->lk.now_serving.load(std::memory_order_relaxed) != ticket)
+    __kmp_printf("LOCK CONTENTION: %p\n", lck);
+#endif /* USE_LOCK_PROFILE */
+
+  // Only enter the wait loop when the lock is not already ours.
+  if (lck->lk.now_serving.load(std::memory_order_acquire) != ticket) {
+    KMP_WAIT_PTR(&lck->lk.now_serving, ticket, __kmp_bakery_check, lck);
+  }
+  return KMP_LOCK_ACQUIRED_FIRST;
+}
+
+// Acquire a simple ticket lock: run the core acquire, emit the
+// ANNOTATE_TICKET_ACQUIRED annotation, and pass the core's status through.
+int __kmp_acquire_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
+  const int status = __kmp_acquire_ticket_lock_timed_template(lck, gtid);
+  ANNOTATE_TICKET_ACQUIRED(lck);
+  return status;
+}
+
+// Checked acquire of a simple ticket lock, reported as "omp_set_lock". Fatal
+// when the lock is uninitialized (flag clear or self pointer mismatch), is
+// nestable, or is already owned by a non-negative gtid. On success records
+// gtid + 1 as owner and returns KMP_LOCK_ACQUIRED_FIRST.
+static int __kmp_acquire_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
+                                                 kmp_int32 gtid) {
+  char const *const api_name = "omp_set_lock";
+
+  if (!lck->lk.initialized.load(std::memory_order_relaxed) ||
+      lck->lk.self != lck) {
+    KMP_FATAL(LockIsUninitialized, api_name);
+  }
+  if (__kmp_is_ticket_lock_nestable(lck)) {
+    KMP_FATAL(LockNestableUsedAsSimple, api_name);
+  }
+  if (gtid >= 0) {
+    if (__kmp_get_ticket_lock_owner(lck) == gtid) {
+      KMP_FATAL(LockIsAlreadyOwned, api_name);
+    }
+  }
+
+  __kmp_acquire_ticket_lock(lck, gtid);
+
+  lck->lk.owner_id.store(gtid + 1, std::memory_order_relaxed);
+  return KMP_LOCK_ACQUIRED_FIRST;
+}
+
+// Non-blocking acquire of a simple ticket lock. Succeeds (TRUE) only when
+// now_serving equals the current next_ticket and a strong compare-exchange
+// then advances next_ticket by one; returns FALSE otherwise. gtid is unused.
+int __kmp_test_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
+  kmp_uint32 expected = lck->lk.next_ticket.load(std::memory_order_relaxed);
+
+  // Lock is busy if the ticket being served is not the next one to hand out.
+  if (lck->lk.now_serving.load(std::memory_order_relaxed) != expected) {
+    return FALSE;
+  }
+
+  const kmp_uint32 desired = expected + 1;
+  if (lck->lk.next_ticket.compare_exchange_strong(expected, desired,
+                                                  std::memory_order_acquire,
+                                                  std::memory_order_acquire)) {
+    return TRUE;
+  }
+  return FALSE;
+}
+
+// Checked non-blocking acquire of a simple ticket lock, reported as
+// "omp_test_lock". Fatal when the lock is uninitialized or nestable. When the
+// test succeeds gtid + 1 is stored as owner; the test result is returned.
+static int __kmp_test_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
+                                              kmp_int32 gtid) {
+  char const *const api_name = "omp_test_lock";
+
+  if (!lck->lk.initialized.load(std::memory_order_relaxed) ||
+      lck->lk.self != lck) {
+    KMP_FATAL(LockIsUninitialized, api_name);
+  }
+  if (__kmp_is_ticket_lock_nestable(lck)) {
+    KMP_FATAL(LockNestableUsedAsSimple, api_name);
+  }
+
+  const int acquired = __kmp_test_ticket_lock(lck, gtid);
+  if (!acquired) {
+    return acquired;
+  }
+
+  lck->lk.owner_id.store(gtid + 1, std::memory_order_relaxed);
+  return acquired;
+}
+
+// Release a simple ticket lock by advancing now_serving (release ordering).
+// The gap between next_ticket and now_serving, sampled before the increment,
+// is passed to KMP_YIELD as "more outstanding tickets than processors".
+// Always returns KMP_LOCK_RELEASED. gtid is unused.
+int __kmp_release_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
+  const kmp_uint32 issued =
+      lck->lk.next_ticket.load(std::memory_order_relaxed);
+  const kmp_uint32 serving =
+      lck->lk.now_serving.load(std::memory_order_relaxed);
+  const kmp_uint32 distance = issued - serving;
+
+  ANNOTATE_TICKET_RELEASED(lck);
+  lck->lk.now_serving.fetch_add(1U, std::memory_order_release);
+
+  KMP_YIELD(distance >
+            (kmp_uint32)(__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
+  return KMP_LOCK_RELEASED;
+}
+
+// Checked release of a simple ticket lock, reported as "omp_unset_lock".
+// Fatal when the lock is uninitialized, nestable, unowned (owner -1), or
+// owned by a different non-negative gtid. Clears owner_id to 0 before
+// forwarding to __kmp_release_ticket_lock.
+static int __kmp_release_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
+                                                 kmp_int32 gtid) {
+  char const *const api_name = "omp_unset_lock";
+
+  if (!lck->lk.initialized.load(std::memory_order_relaxed) ||
+      lck->lk.self != lck) {
+    KMP_FATAL(LockIsUninitialized, api_name);
+  }
+  if (__kmp_is_ticket_lock_nestable(lck)) {
+    KMP_FATAL(LockNestableUsedAsSimple, api_name);
+  }
+  if (-1 == __kmp_get_ticket_lock_owner(lck)) {
+    KMP_FATAL(LockUnsettingFree, api_name);
+  }
+  if (gtid >= 0) {
+    if (__kmp_get_ticket_lock_owner(lck) >= 0 &&
+        __kmp_get_ticket_lock_owner(lck) != gtid) {
+      KMP_FATAL(LockUnsettingSetByAnother, api_name);
+    }
+  }
+  lck->lk.owner_id.store(0, std::memory_order_relaxed);
+  return __kmp_release_ticket_lock(lck, gtid);
+}
+
+// Initialize a simple ticket lock: clear the location, point self at the lock
+// itself, zero both ticket counters, mark it unowned (owner_id 0) and simple
+// (depth_locked -1), and finally publish initialized = true with release
+// ordering.
+void __kmp_init_ticket_lock(kmp_ticket_lock_t *lck) {
+  lck->lk.location = NULL;
+  lck->lk.self = lck;
+  lck->lk.next_ticket.store(0U, std::memory_order_relaxed);
+  lck->lk.now_serving.store(0U, std::memory_order_relaxed);
+  // no thread owns the lock.
+  lck->lk.owner_id.store(0, std::memory_order_relaxed);
+  // -1 => not a nested lock.
+  lck->lk.depth_locked.store(-1, std::memory_order_relaxed);
+  lck->lk.initialized.store(true, std::memory_order_release);
+}
+
+// Destroy a simple ticket lock: first clear the initialized flag (release
+// ordering), then null self and location, zero both ticket counters and the
+// owner id, and set depth_locked back to -1.
+void __kmp_destroy_ticket_lock(kmp_ticket_lock_t *lck) {
+  lck->lk.initialized.store(false, std::memory_order_release);
+  lck->lk.self = NULL;
+  lck->lk.location = NULL;
+  lck->lk.next_ticket.store(0U, std::memory_order_relaxed);
+  lck->lk.now_serving.store(0U, std::memory_order_relaxed);
+  lck->lk.owner_id.store(0, std::memory_order_relaxed);
+  lck->lk.depth_locked.store(-1, std::memory_order_relaxed);
+}
+
+// Checked destroy of a simple ticket lock, reported as "omp_destroy_lock".
+// Fatal when the lock is uninitialized, nestable, or still owned (owner query
+// not -1).
+static void __kmp_destroy_ticket_lock_with_checks(kmp_ticket_lock_t *lck) {
+  char const *const api_name = "omp_destroy_lock";
+
+  if (!lck->lk.initialized.load(std::memory_order_relaxed) ||
+      lck->lk.self != lck) {
+    KMP_FATAL(LockIsUninitialized, api_name);
+  }
+  if (__kmp_is_ticket_lock_nestable(lck)) {
+    KMP_FATAL(LockNestableUsedAsSimple, api_name);
+  }
+  if (-1 != __kmp_get_ticket_lock_owner(lck)) {
+    KMP_FATAL(LockStillOwned, api_name);
+  }
+  __kmp_destroy_ticket_lock(lck);
+}
+
+// nested ticket locks
+
+// Acquire a nestable ticket lock for thread gtid (asserted non-negative in
+// debug builds). A non-owner takes the underlying ticket lock, then sets the
+// depth to 1 and the owner to gtid + 1 (KMP_LOCK_ACQUIRED_FIRST); the current
+// owner only has its depth incremented (KMP_LOCK_ACQUIRED_NEXT).
+int __kmp_acquire_nested_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
+  KMP_DEBUG_ASSERT(gtid >= 0);
+
+  if (__kmp_get_ticket_lock_owner(lck) != gtid) {
+    // Not the owner: take the real lock, then record ownership.
+    __kmp_acquire_ticket_lock_timed_template(lck, gtid);
+    ANNOTATE_TICKET_ACQUIRED(lck);
+    lck->lk.depth_locked.store(1, std::memory_order_relaxed);
+    lck->lk.owner_id.store(gtid + 1, std::memory_order_relaxed);
+    return KMP_LOCK_ACQUIRED_FIRST;
+  }
+
+  // Re-entrant acquire by the owner.
+  lck->lk.depth_locked.fetch_add(1, std::memory_order_relaxed);
+  return KMP_LOCK_ACQUIRED_NEXT;
+}
+
+// Checked entry for the nested ticket acquire, reported as
+// "omp_set_nest_lock". Fatal when the lock is uninitialized or not nestable.
+static int __kmp_acquire_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
+                                                        kmp_int32 gtid) {
+  char const *const api_name = "omp_set_nest_lock";
+
+  if (!lck->lk.initialized.load(std::memory_order_relaxed) ||
+      lck->lk.self != lck) {
+    KMP_FATAL(LockIsUninitialized, api_name);
+  }
+  const bool nestable = __kmp_is_ticket_lock_nestable(lck);
+  if (!nestable) {
+    KMP_FATAL(LockSimpleUsedAsNestable, api_name);
+  }
+  return __kmp_acquire_nested_ticket_lock(lck, gtid);
+}
+
+// Non-blocking acquire of a nestable ticket lock. Returns the new nesting
+// depth on success (previous depth + 1 for the current owner, 1 for a fresh
+// acquisition, which also records gtid + 1 as owner) and 0 when
+// __kmp_test_ticket_lock fails.
+int __kmp_test_nested_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
+  KMP_DEBUG_ASSERT(gtid >= 0);
+
+  // Owner re-entering: bump the depth and report the new value.
+  if (__kmp_get_ticket_lock_owner(lck) == gtid) {
+    return lck->lk.depth_locked.fetch_add(1, std::memory_order_relaxed) + 1;
+  }
+
+  if (!__kmp_test_ticket_lock(lck, gtid)) {
+    return 0;
+  }
+
+  // Fresh acquisition.
+  lck->lk.depth_locked.store(1, std::memory_order_relaxed);
+  lck->lk.owner_id.store(gtid + 1, std::memory_order_relaxed);
+  return 1;
+}
+
+// Checked entry for the nested ticket test, reported as "omp_test_nest_lock".
+// Fatal when the lock is uninitialized or not nestable.
+static int __kmp_test_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
+                                                     kmp_int32 gtid) {
+  char const *const api_name = "omp_test_nest_lock";
+
+  if (!lck->lk.initialized.load(std::memory_order_relaxed) ||
+      lck->lk.self != lck) {
+    KMP_FATAL(LockIsUninitialized, api_name);
+  }
+  const bool nestable = __kmp_is_ticket_lock_nestable(lck);
+  if (!nestable) {
+    KMP_FATAL(LockSimpleUsedAsNestable, api_name);
+  }
+  return __kmp_test_nested_ticket_lock(lck, gtid);
+}
+
+// Drop one nesting level of a nestable ticket lock. If the depth before the
+// decrement was not 1 the lock stays held (KMP_LOCK_STILL_HELD); otherwise the
+// owner id is cleared and the underlying ticket lock released
+// (KMP_LOCK_RELEASED).
+int __kmp_release_nested_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
+  KMP_DEBUG_ASSERT(gtid >= 0);
+
+  const int depth_before =
+      lck->lk.depth_locked.fetch_sub(1, std::memory_order_relaxed);
+  if (depth_before != 1) {
+    return KMP_LOCK_STILL_HELD;
+  }
+
+  // Outermost level released.
+  lck->lk.owner_id.store(0, std::memory_order_relaxed);
+  __kmp_release_ticket_lock(lck, gtid);
+  return KMP_LOCK_RELEASED;
+}
+
+// Checked entry for the nested ticket release, reported as
+// "omp_unset_nest_lock". Fatal when the lock is uninitialized, not nestable,
+// unowned (owner -1), or owned by a thread other than gtid.
+static int __kmp_release_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
+                                                        kmp_int32 gtid) {
+  char const *const api_name = "omp_unset_nest_lock";
+
+  if (!lck->lk.initialized.load(std::memory_order_relaxed) ||
+      lck->lk.self != lck) {
+    KMP_FATAL(LockIsUninitialized, api_name);
+  }
+  const bool nestable = __kmp_is_ticket_lock_nestable(lck);
+  if (!nestable) {
+    KMP_FATAL(LockSimpleUsedAsNestable, api_name);
+  }
+  if (-1 == __kmp_get_ticket_lock_owner(lck)) {
+    KMP_FATAL(LockUnsettingFree, api_name);
+  }
+  if (gtid != __kmp_get_ticket_lock_owner(lck)) {
+    KMP_FATAL(LockUnsettingSetByAnother, api_name);
+  }
+  return __kmp_release_nested_ticket_lock(lck, gtid);
+}
+
+// Initialize a nestable ticket lock: run the simple-lock initializer, then
+// replace the depth it set with 0 (>= 0 for nestable locks, -1 for simple
+// locks).
+void __kmp_init_nested_ticket_lock(kmp_ticket_lock_t *lck) {
+  __kmp_init_ticket_lock(lck);
+  lck->lk.depth_locked.store(0, std::memory_order_relaxed);
+}
+
+// Destroy a nestable ticket lock: run the simple-lock teardown, then store 0
+// into the nesting depth.
+void __kmp_destroy_nested_ticket_lock(kmp_ticket_lock_t *lck) {
+  __kmp_destroy_ticket_lock(lck);
+  lck->lk.depth_locked.store(0, std::memory_order_relaxed);
+}
+
+// Checked entry for the nested ticket destroy, reported as
+// "omp_destroy_nest_lock". Fatal when the lock is uninitialized, not
+// nestable, or still owned (owner query not -1).
+static void
+__kmp_destroy_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck) {
+  char const *const api_name = "omp_destroy_nest_lock";
+
+  if (!lck->lk.initialized.load(std::memory_order_relaxed) ||
+      lck->lk.self != lck) {
+    KMP_FATAL(LockIsUninitialized, api_name);
+  }
+  const bool nestable = __kmp_is_ticket_lock_nestable(lck);
+  if (!nestable) {
+    KMP_FATAL(LockSimpleUsedAsNestable, api_name);
+  }
+  if (-1 != __kmp_get_ticket_lock_owner(lck)) {
+    KMP_FATAL(LockStillOwned, api_name);
+  }
+  __kmp_destroy_nested_ticket_lock(lck);
+}
+
+// access functions to fields which don't exist for all lock kinds.
+
+// Return the location pointer stored in the ticket lock.
+static const ident_t *__kmp_get_ticket_lock_location(kmp_ticket_lock_t *lck) {
+  return lck->lk.location;
+}
+
+// Store loc as the ticket lock's location pointer.
+static void __kmp_set_ticket_lock_location(kmp_ticket_lock_t *lck,
+                                           const ident_t *loc) {
+  lck->lk.location = loc;
+}
+
+// Return the flags value stored in the ticket lock.
+static kmp_lock_flags_t __kmp_get_ticket_lock_flags(kmp_ticket_lock_t *lck) {
+  return lck->lk.flags;
+}
+
+// Store flags into the ticket lock's flags field.
+static void __kmp_set_ticket_lock_flags(kmp_ticket_lock_t *lck,
+                                        kmp_lock_flags_t flags) {
+  lck->lk.flags = flags;
+}
+
+/* ------------------------------------------------------------------------ */
+/* queuing locks */
+
+/* First the states
+   (head,tail) =              0, 0  means lock is unheld, nobody on queue
+                 UINT_MAX or -1, 0  means lock is held, nobody on queue
+                              h, h  means lock held or about to transition,
+                                    1 element on queue
+                              h, t  h <> t, means lock is held or about to
+                                    transition, >1 elements on queue
+
+   Now the transitions
+      Acquire(0,0)  = -1 ,0
+      Release(0,0)  = Error
+      Acquire(-1,0) =  h ,h    h > 0
+      Release(-1,0) =  0 ,0
+      Acquire(h,h)  =  h ,t    h > 0, t > 0, h <> t
+      Release(h,h)  = -1 ,0    h > 0
+      Acquire(h,t)  =  h ,t'   h > 0, t > 0, t' > 0, h <> t, h <> t', t <> t'
+      Release(h,t)  =  h',t    h > 0, t > 0, h <> t, h <> h', h' maybe = t
+
+   And pictorially
+
+           +-----+
+           | 0, 0|------- release -------> Error
+           +-----+
+             |  ^
+      acquire|  |release
+             |  |
+             |  |
+             v  |
+           +-----+
+           |-1, 0|
+           +-----+
+             |  ^
+      acquire|  |release
+             |  |
+             |  |
+             v  |
+           +-----+
+           | h, h|
+           +-----+
+             |  ^
+      acquire|  |release
+             |  |
+             |  |
+             v  |
+           +-----+
+           | h, t|----- acquire, release loopback ---+
+           +-----+                                   |
+                ^                                    |
+                |                                    |
+                +------------------------------------+
+ */
+
+#ifdef DEBUG_QUEUING_LOCKS
+
+/* Stuff for circular trace buffer */
+// traces holds TRACE_BUF_ELE entries of up to 128 characters each; tc counts
+// every entry written and is reduced modulo TRACE_BUF_ELE to pick the slot.
+// NOTE(review): tc is a plain int incremented without synchronization --
+// presumably acceptable for a debug-only aid; confirm before relying on the
+// ordering of trace entries.
+#define TRACE_BUF_ELE 1024
+static char traces[TRACE_BUF_ELE][128] = {0};
+static int tc = 0;
+// TRACE_LOCK:    record "t<X> at <Y>".
+// TRACE_LOCK_T:  record "t<X> at <Y><Z>" with Z an integer.
+// TRACE_LOCK_HT: record "t<X> at <Y> <Z>,<Q>" with Z and Q integers.
+// Each macro expansion already ends in a semicolon.
+#define TRACE_LOCK(X, Y)                                                       \
+  KMP_SNPRINTF(traces[tc++ % TRACE_BUF_ELE], 128, "t%d at %s\n", X, Y);
+#define TRACE_LOCK_T(X, Y, Z)                                                  \
+  KMP_SNPRINTF(traces[tc++ % TRACE_BUF_ELE], 128, "t%d at %s%d\n", X, Y, Z);
+#define TRACE_LOCK_HT(X, Y, Z, Q)                                              \
+  KMP_SNPRINTF(traces[tc++ % TRACE_BUF_ELE], 128, "t%d at %s %d,%d\n", X, Y,   \
+               Z, Q);
+
+// Debug dump of the trace buffer and of the queue attached to lck.
+//   this_thr, gtid   calling thread; its spin flag and next-waiting link are
+//                    printed.
+//   head_id, tail_id head/tail values as observed by the caller (printed
+//                    alongside the values currently stored in lck).
+// Output goes through __kmp_printf_no_lock.
+static void __kmp_dump_queuing_lock(kmp_info_t *this_thr, kmp_int32 gtid,
+                                    kmp_queuing_lock_t *lck, kmp_int32 head_id,
+                                    kmp_int32 tail_id) {
+  kmp_int32 t, i;
+
+  __kmp_printf_no_lock("\n__kmp_dump_queuing_lock: TRACE BEGINS HERE! \n");
+
+  // Print every slot once, starting at the slot the next trace would use
+  // and wrapping around the ring.
+  i = tc % TRACE_BUF_ELE;
+  __kmp_printf_no_lock("%s\n", traces[i]);
+  i = (i + 1) % TRACE_BUF_ELE;
+  while (i != (tc % TRACE_BUF_ELE)) {
+    __kmp_printf_no_lock("%s", traces[i]);
+    i = (i + 1) % TRACE_BUF_ELE;
+  }
+  __kmp_printf_no_lock("\n");
+
+  __kmp_printf_no_lock("\n__kmp_dump_queuing_lock: gtid+1:%d, spin_here:%d, "
+                       "next_wait:%d, head_id:%d, tail_id:%d\n",
+                       gtid + 1, this_thr->th.th_spin_here,
+                       this_thr->th.th_next_waiting, head_id, tail_id);
+
+  __kmp_printf_no_lock("\t\thead: %d ", lck->lk.head_id);
+
+  // Walk the waiter chain through th_next_waiting; ids are gtid + 1, so the
+  // thread table is indexed with id - 1 and the walk stops at an id <= 0.
+  if (lck->lk.head_id >= 1) {
+    t = __kmp_threads[lck->lk.head_id - 1]->th.th_next_waiting;
+    while (t > 0) {
+      __kmp_printf_no_lock("-> %d ", t);
+      t = __kmp_threads[t - 1]->th.th_next_waiting;
+    }
+  }
+  __kmp_printf_no_lock(";  tail: %d ", lck->lk.tail_id);
+  __kmp_printf_no_lock("\n\n");
+}
+
+#endif /* DEBUG_QUEUING_LOCKS */
+
+// Return the stored owner id (read through TCR_4) minus one. The checked
+// entry points store gtid + 1 when owned and 0 when released, so an unowned
+// lock yields -1.
+static kmp_int32 __kmp_get_queuing_lock_owner(kmp_queuing_lock_t *lck) {
+  return TCR_4(lck->lk.owner_id) - 1;
+}
+
+// A queuing lock is treated as nestable when depth_locked is not -1.
+static inline bool __kmp_is_queuing_lock_nestable(kmp_queuing_lock_t *lck) {
+  return lck->lk.depth_locked != -1;
+}
+
+/* Acquire a lock using the queuing lock implementation.
+
+   lck   lock to acquire.
+   gtid  global thread id of the caller; ids stored in the queue are gtid + 1.
+
+   The caller first raises its own th_spin_here flag, then loops on the
+   current head value:
+     head ==  0  try (0,0)->(-1,0); on success the lock is held without
+                 queuing and the function returns.
+     head == -1  try (-1,0)->(tid,tid) with one 64-bit compare-and-store over
+                 the (head,tail) pair.
+     otherwise   try (h,t)->(h,tid) on the tail alone, unless tail reads 0.
+   Once enqueued, the previous tail (if any) is linked to this thread through
+   th_next_waiting and the caller waits until th_spin_here is cleared by the
+   releasing thread. A failed attempt yields (when oversubscribed) and
+   retries.
+
+   With OMPT support compiled in and enabled, the thread state is switched to
+   ompt_state_wait_lock (wait_id = lck) before waiting and restored on exit.
+
+   Always returns KMP_LOCK_ACQUIRED_FIRST. */
+template <bool takeTime>
+/* [TLW] The unused template above is left behind because of what BEB believes
+   is a potential compiler problem with __forceinline. */
+__forceinline static int
+__kmp_acquire_queuing_lock_timed_template(kmp_queuing_lock_t *lck,
+                                          kmp_int32 gtid) {
+  kmp_info_t *this_thr = __kmp_thread_from_gtid(gtid);
+  volatile kmp_int32 *head_id_p = &lck->lk.head_id;
+  volatile kmp_int32 *tail_id_p = &lck->lk.tail_id;
+  volatile kmp_uint32 *spin_here_p;
+
+#if OMPT_SUPPORT
+  ompt_state_t prev_state = ompt_state_undefined;
+#endif
+
+  KA_TRACE(1000,
+           ("__kmp_acquire_queuing_lock: lck:%p, T#%d entering\n", lck, gtid));
+
+  KMP_FSYNC_PREPARE(lck);
+  KMP_DEBUG_ASSERT(this_thr != NULL);
+  spin_here_p = &this_thr->th.th_spin_here;
+
+#ifdef DEBUG_QUEUING_LOCKS
+  TRACE_LOCK(gtid + 1, "acq ent");
+  if (*spin_here_p)
+    __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
+  if (this_thr->th.th_next_waiting != 0)
+    __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
+#endif
+  KMP_DEBUG_ASSERT(!*spin_here_p);
+  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
+
+  /* The following st.rel to spin_here_p needs to precede the cmpxchg.acq to
+     head_id_p that may follow, not just in execution order, but also in
+     visibility order. This way, when a releasing thread observes the changes to
+     the queue by this thread, it can rightly assume that spin_here_p has
+     already been set to TRUE, so that when it sets spin_here_p to FALSE, it is
+     not premature.  If the releasing thread sets spin_here_p to FALSE before
+     this thread sets it to TRUE, this thread will hang. */
+  *spin_here_p = TRUE; /* before enqueuing to prevent race */
+
+  while (1) {
+    kmp_int32 enqueued;
+    kmp_int32 head;
+    kmp_int32 tail;
+
+    head = *head_id_p;
+
+    switch (head) {
+
+    case -1: {
+#ifdef DEBUG_QUEUING_LOCKS
+      tail = *tail_id_p;
+      TRACE_LOCK_HT(gtid + 1, "acq read: ", head, tail);
+#endif
+      tail = 0; /* to make sure next link asynchronously read is not set
+                accidentally; this assignment prevents us from entering the
+                if ( t > 0 ) condition in the enqueued case below, which is not
+                necessary for this state transition */
+
+      /* try (-1,0)->(tid,tid) */
+      enqueued = KMP_COMPARE_AND_STORE_ACQ64((volatile kmp_int64 *)tail_id_p,
+                                             KMP_PACK_64(-1, 0),
+                                             KMP_PACK_64(gtid + 1, gtid + 1));
+#ifdef DEBUG_QUEUING_LOCKS
+      if (enqueued)
+        TRACE_LOCK(gtid + 1, "acq enq: (-1,0)->(tid,tid)");
+#endif
+    } break;
+
+    default: {
+      tail = *tail_id_p;
+      KMP_DEBUG_ASSERT(tail != gtid + 1);
+
+#ifdef DEBUG_QUEUING_LOCKS
+      TRACE_LOCK_HT(gtid + 1, "acq read: ", head, tail);
+#endif
+
+      if (tail == 0) {
+        /* head and tail were read separately and do not describe a queue we
+           can append to; go around again */
+        enqueued = FALSE;
+      } else {
+        /* try (h,t) or (h,h)->(h,tid) */
+        enqueued = KMP_COMPARE_AND_STORE_ACQ32(tail_id_p, tail, gtid + 1);
+
+#ifdef DEBUG_QUEUING_LOCKS
+        if (enqueued)
+          TRACE_LOCK(gtid + 1, "acq enq: (h,t)->(h,tid)");
+#endif
+      }
+    } break;
+
+    case 0: /* empty queue */
+    {
+      kmp_int32 grabbed_lock;
+
+#ifdef DEBUG_QUEUING_LOCKS
+      tail = *tail_id_p;
+      TRACE_LOCK_HT(gtid + 1, "acq read: ", head, tail);
+#endif
+      /* try (0,0)->(-1,0) */
+
+      /* only legal transition out of head = 0 is head = -1 with no change to
+       * tail */
+      grabbed_lock = KMP_COMPARE_AND_STORE_ACQ32(head_id_p, 0, -1);
+
+      if (grabbed_lock) {
+
+        *spin_here_p = FALSE;
+
+        KA_TRACE(
+            1000,
+            ("__kmp_acquire_queuing_lock: lck:%p, T#%d exiting: no queuing\n",
+             lck, gtid));
+#ifdef DEBUG_QUEUING_LOCKS
+        TRACE_LOCK_HT(gtid + 1, "acq exit: ", head, 0);
+#endif
+
+#if OMPT_SUPPORT
+        if (ompt_enabled.enabled && prev_state != ompt_state_undefined) {
+          /* change the state before clearing wait_id */
+          this_thr->th.ompt_thread_info.state = prev_state;
+          this_thr->th.ompt_thread_info.wait_id = 0;
+        }
+#endif
+
+        KMP_FSYNC_ACQUIRED(lck);
+        return KMP_LOCK_ACQUIRED_FIRST; /* lock holder cannot be on queue */
+      }
+      enqueued = FALSE;
+    } break;
+    }
+
+#if OMPT_SUPPORT
+    if (ompt_enabled.enabled && prev_state == ompt_state_undefined) {
+      /* this thread will spin; set wait_id before entering wait state */
+      prev_state = this_thr->th.ompt_thread_info.state;
+      this_thr->th.ompt_thread_info.wait_id = (uint64_t)lck;
+      this_thr->th.ompt_thread_info.state = ompt_state_wait_lock;
+    }
+#endif
+
+    if (enqueued) {
+      if (tail > 0) {
+        kmp_info_t *tail_thr = __kmp_thread_from_gtid(tail - 1);
+        KMP_ASSERT(tail_thr != NULL);
+        tail_thr->th.th_next_waiting = gtid + 1;
+        /* corresponding wait for this write in release code */
+      }
+      KA_TRACE(1000,
+               ("__kmp_acquire_queuing_lock: lck:%p, T#%d waiting for lock\n",
+                lck, gtid));
+
+      KMP_MB();
+      // ToDo: Use __kmp_wait_sleep or similar when blocktime != inf
+      KMP_WAIT(spin_here_p, FALSE, KMP_EQ, lck);
+
+#ifdef DEBUG_QUEUING_LOCKS
+      TRACE_LOCK(gtid + 1, "acq spin");
+
+      if (this_thr->th.th_next_waiting != 0)
+        __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
+#endif
+      KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
+      KA_TRACE(1000, ("__kmp_acquire_queuing_lock: lck:%p, T#%d exiting: after "
+                      "waiting on queue\n",
+                      lck, gtid));
+
+#ifdef DEBUG_QUEUING_LOCKS
+      TRACE_LOCK(gtid + 1, "acq exit 2");
+#endif
+
+#if OMPT_SUPPORT
+      /* Restore the saved state only when OMPT is enabled, mirroring the
+         guarded save above; otherwise prev_state still holds
+         ompt_state_undefined and must not be written into the thread. */
+      if (ompt_enabled.enabled) {
+        /* change the state before clearing wait_id */
+        this_thr->th.ompt_thread_info.state = prev_state;
+        this_thr->th.ompt_thread_info.wait_id = 0;
+      }
+#endif
+
+      /* got lock, we were dequeued by the thread that released lock */
+      return KMP_LOCK_ACQUIRED_FIRST;
+    }
+
+    /* Yield if number of threads > number of logical processors */
+    /* ToDo: Not sure why this should only be in oversubscription case,
+       maybe should be traditional YIELD_INIT/YIELD_WHEN loop */
+    KMP_YIELD_OVERSUB();
+
+#ifdef DEBUG_QUEUING_LOCKS
+    TRACE_LOCK(gtid + 1, "acq retry");
+#endif
+  }
+  KMP_ASSERT2(0, "should not get here");
+  return KMP_LOCK_ACQUIRED_FIRST;
+}
+
+// Acquire a simple queuing lock for thread gtid (asserted non-negative in
+// debug builds): run the core acquire, emit ANNOTATE_QUEUING_ACQUIRED, and
+// pass the core's status through.
+int __kmp_acquire_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
+  KMP_DEBUG_ASSERT(gtid >= 0);
+
+  const int status =
+      __kmp_acquire_queuing_lock_timed_template<false>(lck, gtid);
+  ANNOTATE_QUEUING_ACQUIRED(lck);
+  return status;
+}
+
+// Checked acquire of a simple queuing lock, reported as "omp_set_lock". Fatal
+// when the lock is uninitialized (initialized pointer does not point at the
+// lock), is nestable, or is already owned by gtid. On success records
+// gtid + 1 as owner and returns KMP_LOCK_ACQUIRED_FIRST.
+static int __kmp_acquire_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
+                                                  kmp_int32 gtid) {
+  char const *const api_name = "omp_set_lock";
+  if (lck != lck->lk.initialized) {
+    KMP_FATAL(LockIsUninitialized, api_name);
+  }
+  const bool nestable = __kmp_is_queuing_lock_nestable(lck);
+  if (nestable) {
+    KMP_FATAL(LockNestableUsedAsSimple, api_name);
+  }
+  if (gtid == __kmp_get_queuing_lock_owner(lck)) {
+    KMP_FATAL(LockIsAlreadyOwned, api_name);
+  }
+
+  __kmp_acquire_queuing_lock(lck, gtid);
+
+  lck->lk.owner_id = gtid + 1;
+  return KMP_LOCK_ACQUIRED_FIRST;
+}
+
+// Non-blocking acquire of a simple queuing lock. Succeeds (TRUE) only when
+// head_id reads 0 and the (0,0)->(-1,0) compare-and-store on head_id wins;
+// returns FALSE otherwise. Debug builds additionally assert that the calling
+// thread exists and is not currently spinning on a lock.
+int __kmp_test_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
+  volatile kmp_int32 *const head_p = &lck->lk.head_id;
+
+  KA_TRACE(1000, ("__kmp_test_queuing_lock: T#%d entering\n", gtid));
+  KMP_DEBUG_ASSERT(gtid >= 0);
+#ifdef KMP_DEBUG
+  {
+    kmp_info_t *self_thr = __kmp_thread_from_gtid(gtid);
+    KMP_DEBUG_ASSERT(self_thr != NULL);
+    KMP_DEBUG_ASSERT(!self_thr->th.th_spin_here);
+  }
+#endif
+
+  /* nobody on queue, nobody holding: try (0,0)->(-1,0) */
+  if (*head_p == 0 && KMP_COMPARE_AND_STORE_ACQ32(head_p, 0, -1)) {
+    KA_TRACE(1000,
+             ("__kmp_test_queuing_lock: T#%d exiting: holding lock\n", gtid));
+    KMP_FSYNC_ACQUIRED(lck);
+    ANNOTATE_QUEUING_ACQUIRED(lck);
+    return TRUE;
+  }
+
+  KA_TRACE(1000,
+           ("__kmp_test_queuing_lock: T#%d exiting: without lock\n", gtid));
+  return FALSE;
+}
+
+// Checked non-blocking acquire of a simple queuing lock, reported as
+// "omp_test_lock". Fatal when the lock is uninitialized or nestable. When the
+// test succeeds gtid + 1 is stored as owner; the test result is returned.
+static int __kmp_test_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
+                                               kmp_int32 gtid) {
+  char const *const api_name = "omp_test_lock";
+  if (lck != lck->lk.initialized) {
+    KMP_FATAL(LockIsUninitialized, api_name);
+  }
+  const bool nestable = __kmp_is_queuing_lock_nestable(lck);
+  if (nestable) {
+    KMP_FATAL(LockNestableUsedAsSimple, api_name);
+  }
+
+  const int acquired = __kmp_test_queuing_lock(lck, gtid);
+  if (!acquired) {
+    return acquired;
+  }
+
+  lck->lk.owner_id = gtid + 1;
+  return acquired;
+}
+
+// Release a queuing lock held by thread gtid.
+//
+// Loops until one of the transitions named in the inline comments succeeds:
+//   (-1,0)->(0,0)   no waiter: compare-and-store head_id from -1 to 0, return.
+//   (h,h)->(-1,0)   single waiter: one 64-bit compare-and-store over the
+//                   (head,tail) pair.
+//   (h,t)->(h',t)   several waiters: head_id becomes the old head thread's
+//                   th_next_waiting once KMP_WAIT observes it non-zero.
+// After a successful dequeue the old head thread's th_next_waiting is zeroed
+// and its th_spin_here flag cleared, which is the flag the acquire path waits
+// on. Always returns KMP_LOCK_RELEASED.
+int __kmp_release_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
+  kmp_info_t *this_thr;
+  volatile kmp_int32 *head_id_p = &lck->lk.head_id;
+  volatile kmp_int32 *tail_id_p = &lck->lk.tail_id;
+
+  KA_TRACE(1000,
+           ("__kmp_release_queuing_lock: lck:%p, T#%d entering\n", lck, gtid));
+  KMP_DEBUG_ASSERT(gtid >= 0);
+  this_thr = __kmp_thread_from_gtid(gtid);
+  KMP_DEBUG_ASSERT(this_thr != NULL);
+#ifdef DEBUG_QUEUING_LOCKS
+  TRACE_LOCK(gtid + 1, "rel ent");
+
+  if (this_thr->th.th_spin_here)
+    __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
+  if (this_thr->th.th_next_waiting != 0)
+    __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
+#endif
+  // The releasing thread must neither be spinning nor be linked into a queue.
+  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
+  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
+
+  KMP_FSYNC_RELEASING(lck);
+  ANNOTATE_QUEUING_RELEASED(lck);
+
+  while (1) {
+    kmp_int32 dequeued;
+    kmp_int32 head;
+    kmp_int32 tail;
+
+    head = *head_id_p;
+
+#ifdef DEBUG_QUEUING_LOCKS
+    tail = *tail_id_p;
+    TRACE_LOCK_HT(gtid + 1, "rel read: ", head, tail);
+    if (head == 0)
+      __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail);
+#endif
+    KMP_DEBUG_ASSERT(head !=
+                     0); /* holding the lock, head must be -1 or queue head */
+
+    if (head == -1) { /* nobody on queue */
+      /* try (-1,0)->(0,0) */
+      if (KMP_COMPARE_AND_STORE_REL32(head_id_p, -1, 0)) {
+        KA_TRACE(
+            1000,
+            ("__kmp_release_queuing_lock: lck:%p, T#%d exiting: queue empty\n",
+             lck, gtid));
+#ifdef DEBUG_QUEUING_LOCKS
+        TRACE_LOCK_HT(gtid + 1, "rel exit: ", 0, 0);
+#endif
+
+#if OMPT_SUPPORT
+/* nothing to do - no other thread is trying to shift blame */
+#endif
+        return KMP_LOCK_RELEASED;
+      }
+      // The compare-and-store failed (head is no longer -1); retry.
+      dequeued = FALSE;
+    } else {
+      KMP_MB();
+      tail = *tail_id_p;
+      if (head == tail) { /* only one thread on the queue */
+#ifdef DEBUG_QUEUING_LOCKS
+        if (head <= 0)
+          __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail);
+#endif
+        KMP_DEBUG_ASSERT(head > 0);
+
+        /* try (h,h)->(-1,0) */
+        dequeued = KMP_COMPARE_AND_STORE_REL64(
+            RCAST(volatile kmp_int64 *, tail_id_p), KMP_PACK_64(head, head),
+            KMP_PACK_64(-1, 0));
+#ifdef DEBUG_QUEUING_LOCKS
+        TRACE_LOCK(gtid + 1, "rel deq: (h,h)->(-1,0)");
+#endif
+
+      } else {
+        volatile kmp_int32 *waiting_id_p;
+        kmp_info_t *head_thr = __kmp_thread_from_gtid(head - 1);
+        KMP_DEBUG_ASSERT(head_thr != NULL);
+        waiting_id_p = &head_thr->th.th_next_waiting;
+
+/* Does this require synchronous reads? */
+#ifdef DEBUG_QUEUING_LOCKS
+        if (head <= 0 || tail <= 0)
+          __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail);
+#endif
+        KMP_DEBUG_ASSERT(head > 0 && tail > 0);
+
+        /* try (h,t)->(h',t) or (t,t) */
+        KMP_MB();
+        /* make sure enqueuing thread has time to update next waiting thread
+         * field */
+        // The value KMP_WAIT returns (the successor's id once non-zero) is
+        // stored directly as the new head; no compare-and-store is used here.
+        *head_id_p =
+            KMP_WAIT((volatile kmp_uint32 *)waiting_id_p, 0, KMP_NEQ, NULL);
+#ifdef DEBUG_QUEUING_LOCKS
+        TRACE_LOCK(gtid + 1, "rel deq: (h,t)->(h',t)");
+#endif
+        dequeued = TRUE;
+      }
+    }
+
+    if (dequeued) {
+      // head still names the thread that was just removed from the queue.
+      kmp_info_t *head_thr = __kmp_thread_from_gtid(head - 1);
+      KMP_DEBUG_ASSERT(head_thr != NULL);
+
+/* Does this require synchronous reads? */
+#ifdef DEBUG_QUEUING_LOCKS
+      if (head <= 0 || tail <= 0)
+        __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail);
+#endif
+      KMP_DEBUG_ASSERT(head > 0 && tail > 0);
+
+      /* For clean code only. Thread not released until next statement prevents
+         race with acquire code. */
+      head_thr->th.th_next_waiting = 0;
+#ifdef DEBUG_QUEUING_LOCKS
+      TRACE_LOCK_T(gtid + 1, "rel nw=0 for t=", head);
+#endif
+
+      KMP_MB();
+      /* reset spin value */
+      head_thr->th.th_spin_here = FALSE;
+
+      KA_TRACE(1000, ("__kmp_release_queuing_lock: lck:%p, T#%d exiting: after "
+                      "dequeuing\n",
+                      lck, gtid));
+#ifdef DEBUG_QUEUING_LOCKS
+      TRACE_LOCK(gtid + 1, "rel exit 2");
+#endif
+      return KMP_LOCK_RELEASED;
+    }
+/* KMP_CPU_PAUSE(); don't want to make releasing thread hold up acquiring
+   threads */
+
+#ifdef DEBUG_QUEUING_LOCKS
+    TRACE_LOCK(gtid + 1, "rel retry");
+#endif
+
+  } /* while */
+  KMP_ASSERT2(0, "should not get here");
+  return KMP_LOCK_RELEASED;
+}
+
+static int __kmp_release_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
+                                                  kmp_int32 gtid) {
+  char const *const func = "omp_unset_lock";
+  KMP_MB(); /* in case another processor initialized lock */
+  if (lck->lk.initialized != lck) {
+    KMP_FATAL(LockIsUninitialized, func);
+  }
+  if (__kmp_is_queuing_lock_nestable(lck)) {
+    KMP_FATAL(LockNestableUsedAsSimple, func);
+  }
+  if (__kmp_get_queuing_lock_owner(lck) == -1) {
+    KMP_FATAL(LockUnsettingFree, func);
+  }
+  if (__kmp_get_queuing_lock_owner(lck) != gtid) {
+    KMP_FATAL(LockUnsettingSetByAnother, func);
+  }
+  lck->lk.owner_id = 0;
+  return __kmp_release_queuing_lock(lck, gtid);
+}
+
+// Put a queuing lock into its initial, unowned, simple (non-nestable) state.
+// lk.initialized is written last and set to the lock's own address, which is
+// what the *_with_checks routines compare against.
+void __kmp_init_queuing_lock(kmp_queuing_lock_t *lck) {
+  auto &base = lck->lk;
+
+  base.location = NULL;
+  base.head_id = 0;
+  base.tail_id = 0;
+  base.next_ticket = 0;
+  base.now_serving = 0;
+  base.owner_id = 0; // no thread owns the lock.
+  base.depth_locked = -1; // >= 0 for nestable locks, -1 for simple locks.
+  base.initialized = lck;
+
+  KA_TRACE(1000, ("__kmp_init_queuing_lock: lock %p initialized\n", lck));
+}
+
+// Tear down a queuing lock: clear lk.initialized first so the lock no longer
+// passes the "initialized" check, then reset the remaining fields to the same
+// values __kmp_init_queuing_lock() uses.
+void __kmp_destroy_queuing_lock(kmp_queuing_lock_t *lck) {
+  auto &base = lck->lk;
+
+  base.initialized = NULL;
+  base.location = NULL;
+  base.head_id = 0;
+  base.tail_id = 0;
+  base.next_ticket = 0;
+  base.now_serving = 0;
+  base.owner_id = 0;
+  base.depth_locked = -1;
+}
+
+static void __kmp_destroy_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
+  char const *const func = "omp_destroy_lock";
+  if (lck->lk.initialized != lck) {
+    KMP_FATAL(LockIsUninitialized, func);
+  }
+  if (__kmp_is_queuing_lock_nestable(lck)) {
+    KMP_FATAL(LockNestableUsedAsSimple, func);
+  }
+  if (__kmp_get_queuing_lock_owner(lck) != -1) {
+    KMP_FATAL(LockStillOwned, func);
+  }
+  __kmp_destroy_queuing_lock(lck);
+}
+
+// nested queuing locks
+
+// Acquire a nestable queuing lock.
+// Returns KMP_LOCK_ACQUIRED_NEXT when the caller already owns the lock (only
+// the nesting depth is bumped) and KMP_LOCK_ACQUIRED_FIRST when the underlying
+// lock had to be acquired.
+int __kmp_acquire_nested_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
+  KMP_DEBUG_ASSERT(gtid >= 0);
+
+  // Re-entrant acquisition by the current owner: just go one level deeper.
+  if (gtid == __kmp_get_queuing_lock_owner(lck)) {
+    lck->lk.depth_locked += 1;
+    return KMP_LOCK_ACQUIRED_NEXT;
+  }
+
+  // First acquisition: take the real lock, then publish depth and owner.
+  __kmp_acquire_queuing_lock_timed_template<false>(lck, gtid);
+  ANNOTATE_QUEUING_ACQUIRED(lck);
+  KMP_MB();
+  lck->lk.depth_locked = 1;
+  KMP_MB();
+  lck->lk.owner_id = gtid + 1; // owner is stored as gtid + 1
+  return KMP_LOCK_ACQUIRED_FIRST;
+}
+
+static int
+__kmp_acquire_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
+                                              kmp_int32 gtid) {
+  char const *const func = "omp_set_nest_lock";
+  if (lck->lk.initialized != lck) {
+    KMP_FATAL(LockIsUninitialized, func);
+  }
+  if (!__kmp_is_queuing_lock_nestable(lck)) {
+    KMP_FATAL(LockSimpleUsedAsNestable, func);
+  }
+  return __kmp_acquire_nested_queuing_lock(lck, gtid);
+}
+
+// Non-blocking acquire of a nestable queuing lock.
+// Returns the new nesting depth on success (1 for a first acquisition, >1 for
+// a re-entrant one) and 0 if the lock could not be taken.
+int __kmp_test_nested_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
+  KMP_DEBUG_ASSERT(gtid >= 0);
+
+  // Already ours: bump and report the nesting depth.
+  if (gtid == __kmp_get_queuing_lock_owner(lck)) {
+    return ++lck->lk.depth_locked;
+  }
+
+  // Not ours: a single attempt on the underlying lock.
+  if (!__kmp_test_queuing_lock(lck, gtid)) {
+    return 0;
+  }
+
+  // Fresh acquisition: publish depth, then owner.
+  KMP_MB();
+  int depth = lck->lk.depth_locked = 1;
+  KMP_MB();
+  lck->lk.owner_id = gtid + 1;
+  return depth;
+}
+
+// Checked non-blocking acquire of a nestable queuing lock. Reports a fatal
+// error if the lock is uninitialized or is a simple lock, then returns the
+// result of __kmp_test_nested_queuing_lock().
+static int __kmp_test_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
+                                                      kmp_int32 gtid) {
+  char const *const api_name = "omp_test_nest_lock";
+
+  if (lck != lck->lk.initialized) {
+    KMP_FATAL(LockIsUninitialized, api_name);
+  }
+  if (!__kmp_is_queuing_lock_nestable(lck)) {
+    KMP_FATAL(LockSimpleUsedAsNestable, api_name);
+  }
+
+  return __kmp_test_nested_queuing_lock(lck, gtid);
+}
+
+// Release one nesting level of a nestable queuing lock.
+// Returns KMP_LOCK_STILL_HELD while the depth stays above zero and
+// KMP_LOCK_RELEASED once the last level is dropped and the underlying lock has
+// been released.
+int __kmp_release_nested_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
+  KMP_DEBUG_ASSERT(gtid >= 0);
+
+  KMP_MB();
+  // Outer levels remain: the lock stays with the caller.
+  if (--(lck->lk.depth_locked) != 0) {
+    return KMP_LOCK_STILL_HELD;
+  }
+
+  // Last level: clear the owner, then release the real lock.
+  KMP_MB();
+  lck->lk.owner_id = 0;
+  __kmp_release_queuing_lock(lck, gtid);
+  return KMP_LOCK_RELEASED;
+}
+
+static int
+__kmp_release_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
+                                              kmp_int32 gtid) {
+  char const *const func = "omp_unset_nest_lock";
+  KMP_MB(); /* in case another processor initialized lock */
+  if (lck->lk.initialized != lck) {
+    KMP_FATAL(LockIsUninitialized, func);
+  }
+  if (!__kmp_is_queuing_lock_nestable(lck)) {
+    KMP_FATAL(LockSimpleUsedAsNestable, func);
+  }
+  if (__kmp_get_queuing_lock_owner(lck) == -1) {
+    KMP_FATAL(LockUnsettingFree, func);
+  }
+  if (__kmp_get_queuing_lock_owner(lck) != gtid) {
+    KMP_FATAL(LockUnsettingSetByAnother, func);
+  }
+  return __kmp_release_nested_queuing_lock(lck, gtid);
+}
+
+// Initialize a nestable queuing lock: perform the simple-lock initialization,
+// then overwrite depth_locked with 0 so the lock is recognized as nestable
+// (simple locks keep -1).
+void __kmp_init_nested_queuing_lock(kmp_queuing_lock_t *lck) {
+  __kmp_init_queuing_lock(lck);
+  lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
+}
+
+// Destroy a nestable queuing lock: run the simple-lock teardown, then set
+// depth_locked back to 0 (the teardown itself leaves it at -1).
+void __kmp_destroy_nested_queuing_lock(kmp_queuing_lock_t *lck) {
+  __kmp_destroy_queuing_lock(lck);
+  lck->lk.depth_locked = 0;
+}
+
+static void
+__kmp_destroy_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
+  char const *const func = "omp_destroy_nest_lock";
+  if (lck->lk.initialized != lck) {
+    KMP_FATAL(LockIsUninitialized, func);
+  }
+  if (!__kmp_is_queuing_lock_nestable(lck)) {
+    KMP_FATAL(LockSimpleUsedAsNestable, func);
+  }
+  if (__kmp_get_queuing_lock_owner(lck) != -1) {
+    KMP_FATAL(LockStillOwned, func);
+  }
+  __kmp_destroy_nested_queuing_lock(lck);
+}
+
+// access functions to fields which don't exist for all lock kinds.
+
+// Read back the source-location record stored in the lock (NULL after init).
+static const ident_t *__kmp_get_queuing_lock_location(kmp_queuing_lock_t *lck) {
+  const ident_t *where = lck->lk.location;
+  return where;
+}
+
+// Store a source-location record in the lock; the pointer is kept as-is.
+static void __kmp_set_queuing_lock_location(kmp_queuing_lock_t *lck,
+                                            const ident_t *loc) {
+  (*lck).lk.location = loc;
+}
+
+// Read back the flags word stored in the lock.
+static kmp_lock_flags_t __kmp_get_queuing_lock_flags(kmp_queuing_lock_t *lck) {
+  kmp_lock_flags_t current = lck->lk.flags;
+  return current;
+}
+
+// Overwrite the flags word stored in the lock.
+static void __kmp_set_queuing_lock_flags(kmp_queuing_lock_t *lck,
+                                         kmp_lock_flags_t flags) {
+  (*lck).lk.flags = flags;
+}
+
+#if KMP_USE_ADAPTIVE_LOCKS
+
+/* RTM Adaptive locks */
+
+#if (KMP_COMPILER_ICC && __INTEL_COMPILER >= 1300) ||                          \
+    (KMP_COMPILER_MSVC && _MSC_VER >= 1700) ||                                 \
+    (KMP_COMPILER_CLANG && KMP_MSVC_COMPAT)
+
+// For these compilers the RTM status constants (and, presumably, the
+// _xbegin/_xend/_xabort intrinsics used below) come from <immintrin.h>, so
+// only the retry mask needs to be defined here.
+#include <immintrin.h>
+#define SOFT_ABORT_MASK (_XABORT_RETRY | _XABORT_CONFLICT | _XABORT_EXPLICIT)
+
+#else
+
+// Fallback for every other compiler: define the RTM status bits and hand-coded
+// versions of the RTM instructions ourselves.
+
+// Values from the status register after failed speculation.
+// _XBEGIN_STARTED is the all-ones value (i.e. -1) that _xbegin() below returns
+// when the transaction actually started.
+#define _XBEGIN_STARTED (~0u)
+#define _XABORT_EXPLICIT (1 << 0)
+#define _XABORT_RETRY (1 << 1)
+#define _XABORT_CONFLICT (1 << 2)
+#define _XABORT_CAPACITY (1 << 3)
+#define _XABORT_DEBUG (1 << 4)
+#define _XABORT_NESTED (1 << 5)
+// Extracts bits 31:24 of the status word (the byte passed to _xabort).
+#define _XABORT_CODE(x) ((unsigned char)(((x) >> 24) & 0xFF))
+
+// Aborts for which it's worth trying again immediately
+#define SOFT_ABORT_MASK (_XABORT_RETRY | _XABORT_CONFLICT | _XABORT_EXPLICIT)
+
+// Two-level stringification so that macro arguments are expanded before being
+// turned into a string literal (used by _xabort below).
+#define STRINGIZE_INTERNAL(arg) #arg
+#define STRINGIZE(arg) STRINGIZE_INTERNAL(arg)
+
+// Access to RTM instructions
+/*A version of XBegin which returns -1 on speculation, and the value of EAX on
+  an abort. This is the same definition as the compiler intrinsic that will be
+  supported at some point. */
+// Hand-encoded XBEGIN. Returns -1 (the initial value of 'res', equal to
+// _XBEGIN_STARTED) when execution continues transactionally, or the abort
+// status taken from EAX when the transaction aborts and control resumes at the
+// fallback address.
+static __inline int _xbegin() {
+  int res = -1;
+
+#if KMP_OS_WINDOWS
+#if KMP_ARCH_X86_64
+  // Bytes C7 F8 followed by a 32-bit displacement of 2. The displacement
+  // presumably skips the 2-byte 'jmp L2', so that an abort lands on the
+  // 'mov res, eax' -- TODO confirm the jmp is always encoded short.
+  _asm {
+        _emit 0xC7
+        _emit 0xF8
+        _emit 2
+        _emit 0
+        _emit 0
+        _emit 0
+        jmp   L2
+        mov   res, eax
+    L2:
+  }
+#else /* IA32 */
+  // NOTE(review): this sequence is identical to the x86_64 one above.
+  _asm {
+        _emit 0xC7
+        _emit 0xF8
+        _emit 2
+        _emit 0
+        _emit 0
+        _emit 0
+        jmp   L2
+        mov   res, eax
+    L2:
+  }
+#endif // KMP_ARCH_X86_64
+#else
+  /* Note that %eax must be noted as killed (clobbered), because the XSR is
+     returned in %eax(%rax) on abort.  Other register values are restored, so
+     don't need to be killed.
+
+     We must also mark 'res' as an input and an output, since otherwise
+     'res=-1' may be dropped as being dead, whereas we do need the assignment on
+     the successful (i.e., non-abort) path. */
+  // The .long operand is the distance from the end of the 6-byte instruction
+  // (which starts at the first label 1) to the second label 1, i.e. the abort
+  // handler that copies %eax into 'res'. The started path jumps over it to 2.
+  __asm__ volatile("1: .byte  0xC7; .byte 0xF8;\n"
+                   "   .long  1f-1b-6\n"
+                   "    jmp   2f\n"
+                   "1:  movl  %%eax,%0\n"
+                   "2:"
+                   : "+r"(res)::"memory", "%eax");
+#endif // KMP_OS_WINDOWS
+  return res;
+}
+
+/* Transaction end */
+// Hand-encoded XEND: emits the three bytes 0F 01 D5. The GNU-style variant
+// also declares a "memory" clobber so the compiler does not move memory
+// accesses across it.
+static __inline void _xend() {
+#if KMP_OS_WINDOWS
+  __asm {
+        _emit 0x0f
+        _emit 0x01
+        _emit 0xd5
+  }
+#else
+  __asm__ volatile(".byte 0x0f; .byte 0x01; .byte 0xd5" ::: "memory");
+#endif
+}
+
+/* This is a macro, the argument must be a single byte constant which can be
+   evaluated by the inline assembler, since it is emitted as a byte into the
+   assembly code. */
+// clang-format off
+// Hand-encoded XABORT: bytes C6 F8 followed by ARG as the immediate byte.
+// Note that the non-Windows definition already ends in a semicolon.
+#if KMP_OS_WINDOWS
+#define _xabort(ARG) _asm _emit 0xc6 _asm _emit 0xf8 _asm _emit ARG
+#else
+#define _xabort(ARG)                                                           \
+  __asm__ volatile(".byte 0xC6; .byte 0xF8; .byte " STRINGIZE(ARG):::"memory");
+#endif
+// clang-format on
+#endif // KMP_COMPILER_ICC && __INTEL_COMPILER >= 1300
+
+// Statistics is collected for testing purpose
+#if KMP_DEBUG_ADAPTIVE_LOCKS
+
+// We accumulate speculative lock statistics when the lock is destroyed. We
+// keep locks that haven't been destroyed in the liveLocks list so that we can
+// grab their statistics too.
+// Running totals for locks that have already been destroyed; added to by
+// __kmp_accumulate_speculative_stats().
+static kmp_adaptive_lock_statistics_t destroyedStats;
+
+// To hold the list of live locks.
+// This object is the sentinel of a circular doubly-linked list threaded
+// through stats.next / stats.prev.
+static kmp_adaptive_lock_info_t liveLocks;
+
+// A lock so we can safely update the list of locks.
+static kmp_bootstrap_lock_t chain_lock =
+    KMP_BOOTSTRAP_LOCK_INITIALIZER(chain_lock);
+
+// Initialize the list of stats.
+// Zeroes the sentinel's statistics, links the sentinel to itself (an empty
+// circular list) and (re)initializes the lock guarding the list.
+void __kmp_init_speculative_stats() {
+  kmp_adaptive_lock_info_t *sentinel = &liveLocks;
+
+  memset(CCAST(kmp_adaptive_lock_statistics_t *, &(sentinel->stats)), 0,
+         sizeof(sentinel->stats));
+
+  // An empty list is a sentinel that is its own neighbour on both sides.
+  sentinel->stats.next = sentinel;
+  sentinel->stats.prev = sentinel;
+
+  KMP_ASSERT(sentinel->stats.next->stats.prev == sentinel);
+  KMP_ASSERT(sentinel->stats.prev->stats.next == sentinel);
+
+  __kmp_init_bootstrap_lock(&chain_lock);
+}
+
+// Insert the lock into the circular list
+// The new lock is linked in directly after the liveLocks sentinel, with
+// chain_lock held for the duration of the update.
+static void __kmp_remember_lock(kmp_adaptive_lock_info_t *lck) {
+  __kmp_acquire_bootstrap_lock(&chain_lock);
+
+  // Point the new node at its future neighbours first ...
+  kmp_adaptive_lock_info_t *old_first = liveLocks.stats.next;
+  lck->stats.next = old_first;
+  lck->stats.prev = &liveLocks;
+
+  // ... then make both neighbours point back at it.
+  liveLocks.stats.next = lck;
+  old_first->stats.prev = lck;
+
+  KMP_ASSERT(lck->stats.next->stats.prev == lck);
+  KMP_ASSERT(lck->stats.prev->stats.next == lck);
+
+  __kmp_release_bootstrap_lock(&chain_lock);
+}
+
+// Unlink a lock from the circular list by joining its two neighbours. The
+// node's own next/prev pointers are left untouched. No locking is done here;
+// the visible caller (__kmp_accumulate_speculative_stats) holds chain_lock.
+static void __kmp_forget_lock(kmp_adaptive_lock_info_t *lck) {
+  // The node must currently be properly linked in.
+  KMP_ASSERT(lck->stats.next->stats.prev == lck);
+  KMP_ASSERT(lck->stats.prev->stats.next == lck);
+
+  kmp_adaptive_lock_info_t *successor = lck->stats.next;
+  kmp_adaptive_lock_info_t *predecessor = lck->stats.prev;
+
+  successor->stats.prev = predecessor;
+  predecessor->stats.next = successor;
+}
+
+// Clear a lock's statistics record and register the lock on the live list.
+static void __kmp_zero_speculative_stats(kmp_adaptive_lock_info_t *lck) {
+  const size_t stats_bytes = sizeof(lck->stats);
+  memset(CCAST(kmp_adaptive_lock_statistics_t *, &lck->stats), 0, stats_bytes);
+
+  __kmp_remember_lock(lck);
+}
+
+// Add one lock's counters into the accumulator 't'. The non-speculative
+// attempt count is taken from lck->acquire_attempts; every other counter is
+// read from lck->stats through a volatile pointer.
+static void __kmp_add_stats(kmp_adaptive_lock_statistics_t *t,
+                            kmp_adaptive_lock_info_t *lck) {
+  kmp_adaptive_lock_statistics_t volatile *src = &lck->stats;
+
+  t->nonSpeculativeAcquireAttempts =
+      t->nonSpeculativeAcquireAttempts + lck->acquire_attempts;
+  t->successfulSpeculations =
+      t->successfulSpeculations + src->successfulSpeculations;
+  t->hardFailedSpeculations =
+      t->hardFailedSpeculations + src->hardFailedSpeculations;
+  t->softFailedSpeculations =
+      t->softFailedSpeculations + src->softFailedSpeculations;
+  t->nonSpeculativeAcquires =
+      t->nonSpeculativeAcquires + src->nonSpeculativeAcquires;
+  t->lemmingYields = t->lemmingYields + src->lemmingYields;
+}
+
+// Called when a lock is destroyed: with chain_lock held, fold the lock's
+// counters into destroyedStats and unlink the lock from the live list.
+static void __kmp_accumulate_speculative_stats(kmp_adaptive_lock_info_t *lck) {
+  __kmp_acquire_bootstrap_lock(&chain_lock);
+
+  // Capture the counters before the node is removed from the list.
+  __kmp_add_stats(&destroyedStats, lck);
+  __kmp_forget_lock(lck);
+
+  __kmp_release_bootstrap_lock(&chain_lock);
+}
+
+// count as a percentage of total; defined as 0 when total is 0.
+static float percent(kmp_uint32 count, kmp_uint32 total) {
+  if (total == 0) {
+    return 0.0;
+  }
+  return (100.0 * count) / total;
+}
+
+// Pick the stream the statistics are written to. A file name of "-" selects
+// stdout. Otherwise __kmp_speculative_statsfile is used as a printf-style
+// format that receives the process id, and the resulting path is opened for
+// writing; stdout is the fallback if that fails.
+static FILE *__kmp_open_stats_file() {
+  if (0 == strcmp(__kmp_speculative_statsfile, "-"))
+    return stdout;
+
+  // Room for the template plus 20 extra characters for the expanded pid.
+  size_t nameCapacity = KMP_STRLEN(__kmp_speculative_statsfile) + 20;
+  char fileName[nameCapacity];
+  KMP_SNPRINTF(&fileName[0], nameCapacity, __kmp_speculative_statsfile,
+               (kmp_int32)getpid());
+  FILE *out = fopen(&fileName[0], "w");
+
+  // Maybe we should issue a warning here...
+  if (!out)
+    return stdout;
+  return out;
+}
+
+// Print a summary of speculative-lock statistics. Totals are the sum of the
+// counters saved from destroyed locks and those of every lock still on the
+// live list. Nothing is printed when no critical section was ever recorded.
+void __kmp_print_speculative_stats() {
+  kmp_adaptive_lock_statistics_t sum = destroyedStats;
+
+  // Walk the circular list until we are back at the sentinel.
+  kmp_adaptive_lock_info_t *cursor = liveLocks.stats.next;
+  while (cursor != &liveLocks) {
+    __kmp_add_stats(&sum, cursor);
+    cursor = cursor->stats.next;
+  }
+
+  kmp_uint32 sectionCount =
+      sum.nonSpeculativeAcquires + sum.successfulSpeculations;
+  kmp_uint32 speculationCount = sum.successfulSpeculations +
+                                sum.hardFailedSpeculations +
+                                sum.softFailedSpeculations;
+  if (sectionCount == 0)
+    return;
+
+  FILE *out = __kmp_open_stats_file();
+
+  fprintf(out, "Speculative lock statistics (all approximate!)\n");
+  fprintf(out, " Lock parameters: \n"
+               "   max_soft_retries               : %10d\n"
+               "   max_badness                    : %10d\n",
+          __kmp_adaptive_backoff_params.max_soft_retries,
+          __kmp_adaptive_backoff_params.max_badness);
+  fprintf(out, " Non-speculative acquire attempts : %10d\n",
+          sum.nonSpeculativeAcquireAttempts);
+  fprintf(out, " Total critical sections          : %10d\n", sectionCount);
+  fprintf(out, " Successful speculations          : %10d (%5.1f%%)\n",
+          sum.successfulSpeculations,
+          percent(sum.successfulSpeculations, sectionCount));
+  fprintf(out, " Non-speculative acquires         : %10d (%5.1f%%)\n",
+          sum.nonSpeculativeAcquires,
+          percent(sum.nonSpeculativeAcquires, sectionCount));
+  fprintf(out, " Lemming yields                   : %10d\n\n",
+          sum.lemmingYields);
+
+  // Second table: outcomes relative to the number of speculation attempts.
+  fprintf(out, " Speculative acquire attempts     : %10d\n", speculationCount);
+  fprintf(out, " Successes                        : %10d (%5.1f%%)\n",
+          sum.successfulSpeculations,
+          percent(sum.successfulSpeculations, speculationCount));
+  fprintf(out, " Soft failures                    : %10d (%5.1f%%)\n",
+          sum.softFailedSpeculations,
+          percent(sum.softFailedSpeculations, speculationCount));
+  fprintf(out, " Hard failures                    : %10d (%5.1f%%)\n",
+          sum.hardFailedSpeculations,
+          percent(sum.hardFailedSpeculations, speculationCount));
+
+  // Only close streams we opened ourselves.
+  if (out != stdout)
+    fclose(out);
+}
+
+// Statistics enabled: post-increment the named counter in the lock's adaptive
+// statistics record. 'lck' is used unparenthesized, so pass a plain pointer
+// expression.
+#define KMP_INC_STAT(lck, stat) (lck->lk.adaptive.stats.stat++)
+#else
+// Statistics disabled: the macro expands to nothing.
+#define KMP_INC_STAT(lck, stat)
+
+#endif // KMP_DEBUG_ADAPTIVE_LOCKS
+
+// Report whether the queuing lock currently looks free. The read of head_id is
+// followed by a full memory fence before the result is returned.
+static inline bool __kmp_is_unlocked_queuing_lock(kmp_queuing_lock_t *lck) {
+  // It is enough to check that the head_id is zero.
+  // We don't also need to check the tail.
+  const bool unlocked = (0 == lck->lk.head_id);
+
+// We need a fence here, since we must ensure that no memory operations
+// from later in this thread float above that read.
+#if KMP_COMPILER_ICC
+  _mm_mfence();
+#else
+  __sync_synchronize();
+#endif
+
+  return unlocked;
+}
+
+// Functions for manipulating the badness
+// Called after a speculative critical section completed (see
+// __kmp_release_adaptive_lock): clears the badness mask and, when statistics
+// are compiled in, counts the success.
+static __inline void
+__kmp_update_badness_after_success(kmp_adaptive_lock_t *lck) {
+  // Reset the badness to zero so we eagerly try to speculate again
+  lck->lk.adaptive.badness = 0;
+  KMP_INC_STAT(lck, successfulSpeculations);
+}
+
+// Create a bit mask with one more set bit.
+// Shifts a 1 into the low end of the badness mask. The update is dropped when
+// the new value would exceed max_badness, so the mask saturates there.
+static __inline void __kmp_step_badness(kmp_adaptive_lock_t *lck) {
+  kmp_uint32 widened = (lck->lk.adaptive.badness << 1) | 1;
+  if (!(widened > lck->lk.adaptive.max_badness)) {
+    lck->lk.adaptive.badness = widened;
+  }
+}
+
+// Check whether speculation should be attempted.
+// Speculation is attempted only when none of the bits set in the badness mask
+// are set in the count of non-speculative acquire attempts; with badness == 0
+// the answer is always yes. 'gtid' is not used.
+static __inline int __kmp_should_speculate(kmp_adaptive_lock_t *lck,
+                                           kmp_int32 gtid) {
+  kmp_uint32 mask = lck->lk.adaptive.badness;
+  kmp_uint32 tries = lck->lk.adaptive.acquire_attempts;
+  return (tries & mask) == 0;
+}
+
+// Attempt to acquire only the speculative lock.
+// Does not back off to the non-speculative lock.
+// Returns 1 if the caller is now executing speculatively "inside" the lock,
+// and 0 if speculation failed (in which case the badness has been stepped).
+// 'gtid' is not used.
+static int __kmp_test_adaptive_lock_only(kmp_adaptive_lock_t *lck,
+                                         kmp_int32 gtid) {
+  // Because the loop below is do/while with a post-decrement test, up to
+  // max_soft_retries + 1 attempts are made.
+  int retries = lck->lk.adaptive.max_soft_retries;
+
+  // We don't explicitly count the start of speculation, rather we record the
+  // results (success, hard fail, soft fail). The sum of all of those is the
+  // total number of times we started speculation since all speculations must
+  // end one of those ways.
+  do {
+    kmp_uint32 status = _xbegin();
+    // Switch this in to disable actual speculation but exercise at least some
+    // of the rest of the code. Useful for debugging...
+    // kmp_uint32 status = _XABORT_NESTED;
+
+    if (status == _XBEGIN_STARTED) {
+      /* We have successfully started speculation. Check that no-one acquired
+         the lock for real between when we last looked and now. This also gets
+         the lock cache line into our read-set, which we need so that we'll
+         abort if anyone later claims it for real. */
+      if (!__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(lck))) {
+        // Lock is now visibly acquired, so someone beat us to it. Abort the
+        // transaction so we'll restart from _xbegin with the failure status.
+        _xabort(0x01);
+        KMP_ASSERT2(0, "should not get here");
+      }
+      return 1; // Lock has been acquired (speculatively)
+    } else {
+      // We have aborted, update the statistics
+      // Any bit in SOFT_ABORT_MASK (retry, conflict, explicit) counts as a
+      // soft failure; everything else is treated as hard.
+      if (status & SOFT_ABORT_MASK) {
+        KMP_INC_STAT(lck, softFailedSpeculations);
+        // and loop round to retry.
+      } else {
+        KMP_INC_STAT(lck, hardFailedSpeculations);
+        // Give up if we had a hard failure.
+        break;
+      }
+    }
+  } while (retries--); // Loop while we have retries, and didn't fail hard.
+
+  // Either we had a hard failure or we didn't succeed softly after
+  // the full set of attempts, so back off the badness.
+  __kmp_step_badness(lck);
+  return 0;
+}
+
+// Attempt to acquire the speculative lock, or back off to the non-speculative
+// one if the speculative lock cannot be acquired.
+// We can succeed speculatively, non-speculatively, or fail.
+// Returns 1 when the lock was obtained (speculatively or for real) and 0 when
+// it is already visibly held by someone else.
+static int __kmp_test_adaptive_lock(kmp_adaptive_lock_t *lck, kmp_int32 gtid) {
+  // Speculative path, taken only when the badness heuristic allows it.
+  if (__kmp_should_speculate(lck, gtid) &&
+      __kmp_test_adaptive_lock_only(lck, gtid)) {
+    return 1;
+  }
+
+  // Falling back to the real lock counts as a non-speculative attempt.
+  lck->lk.adaptive.acquire_attempts++;
+
+  // One try on the base queuing lock.
+  if (!__kmp_test_queuing_lock(GET_QLK_PTR(lck), gtid)) {
+    return 0; // Failed to acquire the lock, it's already visibly locked.
+  }
+
+  KMP_INC_STAT(lck, nonSpeculativeAcquires);
+  return 1; // Lock is acquired (non-speculatively)
+}
+
+static int __kmp_test_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck,
+                                                kmp_int32 gtid) {
+  char const *const func = "omp_test_lock";
+  if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
+    KMP_FATAL(LockIsUninitialized, func);
+  }
+
+  int retval = __kmp_test_adaptive_lock(lck, gtid);
+
+  if (retval) {
+    lck->lk.qlk.owner_id = gtid + 1;
+  }
+  return retval;
+}
+
+// Block until we can acquire a speculative, adaptive lock. We check whether we
+// should be trying to speculate. If we should be, we check the real lock to see
+// if it is free, and, if not, pause without attempting to acquire it until it
+// is. Then we try the speculative acquire. This means that although we suffer
+// from lemmings a little (because we can't acquire the lock speculatively
+// until the queue of threads waiting has cleared), we don't get into a state
+// where we can never acquire the lock speculatively (because we force the queue
+// to clear by preventing new arrivals from entering the queue). This does mean
+// that when we're trying to break lemmings, the lock is no longer fair. However
+// OpenMP makes no guarantee that its locks are fair, so this isn't a real
+// problem.
+static void __kmp_acquire_adaptive_lock(kmp_adaptive_lock_t *lck,
+                                        kmp_int32 gtid) {
+  if (__kmp_should_speculate(lck, gtid)) {
+    if (!__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(lck))) {
+      // The real lock is held, so speculation cannot start yet. Yield here
+      // (without joining the queuing lock's wait queue, so that the queue can
+      // drain) until the lock looks free. Every other thread sees the same
+      // answer from __kmp_should_speculate and waits the same way.
+      while (!__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(lck))) {
+        KMP_INC_STAT(lck, lemmingYields);
+        KMP_YIELD(TRUE);
+      }
+    }
+
+    // The lock looked free a moment ago: one round of speculation.
+    if (__kmp_test_adaptive_lock_only(lck, gtid))
+      return;
+    // Speculation was tried and failed; use the real lock below.
+  }
+
+  // Non-speculative path. Count the attempt before blocking on the lock.
+  lck->lk.adaptive.acquire_attempts++;
+
+  __kmp_acquire_queuing_lock_timed_template<FALSE>(GET_QLK_PTR(lck), gtid);
+  // The base lock is now ours, so count that too.
+  KMP_INC_STAT(lck, nonSpeculativeAcquires);
+  ANNOTATE_QUEUING_ACQUIRED(lck);
+}
+
+static void __kmp_acquire_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck,
+                                                    kmp_int32 gtid) {
+  char const *const func = "omp_set_lock";
+  if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
+    KMP_FATAL(LockIsUninitialized, func);
+  }
+  if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) == gtid) {
+    KMP_FATAL(LockIsAlreadyOwned, func);
+  }
+
+  __kmp_acquire_adaptive_lock(lck, gtid);
+
+  lck->lk.qlk.owner_id = gtid + 1;
+}
+
+// Release an adaptive lock, choosing the release path from the visible state
+// of the underlying queuing lock. Always returns KMP_LOCK_RELEASED.
+static int __kmp_release_adaptive_lock(kmp_adaptive_lock_t *lck,
+                                       kmp_int32 gtid) {
+  if (!__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(lck))) {
+    // The lock *is* visibly locked, so we are not speculating and must use
+    // the underlying lock's release scheme.
+    __kmp_release_queuing_lock(GET_QLK_PTR(lck), gtid);
+  } else {
+    // The lock doesn't look claimed, so we must be speculating.
+    // (Or the user's code is buggy and they're releasing without locking;
+    // if we had XTEST we'd be able to check that case...)
+    _xend(); // Exit speculation
+    __kmp_update_badness_after_success(lck);
+  }
+  return KMP_LOCK_RELEASED;
+}
+
+static int __kmp_release_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck,
+                                                   kmp_int32 gtid) {
+  char const *const func = "omp_unset_lock";
+  KMP_MB(); /* in case another processor initialized lock */
+  if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
+    KMP_FATAL(LockIsUninitialized, func);
+  }
+  if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) == -1) {
+    KMP_FATAL(LockUnsettingFree, func);
+  }
+  if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) != gtid) {
+    KMP_FATAL(LockUnsettingSetByAnother, func);
+  }
+  lck->lk.qlk.owner_id = 0;
+  __kmp_release_adaptive_lock(lck, gtid);
+  return KMP_LOCK_RELEASED;
+}
+
+// Initialize an adaptive lock: set up the embedded queuing lock, then seed the
+// speculation state from the global back-off parameters.
+static void __kmp_init_adaptive_lock(kmp_adaptive_lock_t *lck) {
+  __kmp_init_queuing_lock(GET_QLK_PTR(lck));
+
+  auto &spec = lck->lk.adaptive;
+  spec.badness = 0;
+  spec.acquire_attempts = 0; // nonSpeculativeAcquireAttempts = 0;
+  spec.max_soft_retries = __kmp_adaptive_backoff_params.max_soft_retries;
+  spec.max_badness = __kmp_adaptive_backoff_params.max_badness;
+#if KMP_DEBUG_ADAPTIVE_LOCKS
+  // Clears the counters and links the lock onto the live-lock list.
+  __kmp_zero_speculative_stats(&spec);
+#endif
+  KA_TRACE(1000, ("__kmp_init_adaptive_lock: lock %p initialized\n", lck));
+}
+
+// Destroy an adaptive lock. With statistics compiled in, the lock's counters
+// are first folded into the global totals and the lock is unlinked from the
+// live-lock list; then the embedded queuing lock is destroyed.
+static void __kmp_destroy_adaptive_lock(kmp_adaptive_lock_t *lck) {
+#if KMP_DEBUG_ADAPTIVE_LOCKS
+  __kmp_accumulate_speculative_stats(&lck->lk.adaptive);
+#endif
+  __kmp_destroy_queuing_lock(GET_QLK_PTR(lck));
+  // Nothing needed for the speculative part.
+}
+
+static void __kmp_destroy_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck) {
+  char const *const func = "omp_destroy_lock";
+  if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
+    KMP_FATAL(LockIsUninitialized, func);
+  }
+  if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) != -1) {
+    KMP_FATAL(LockStillOwned, func);
+  }
+  __kmp_destroy_adaptive_lock(lck);
+}
+
+#endif // KMP_USE_ADAPTIVE_LOCKS
+
+/* ------------------------------------------------------------------------ */
+/* DRDPA ticket locks                                                */
+/* "DRDPA" means Dynamically Reconfigurable Distributed Polling Area */
+
+// The owner is stored as gtid + 1, so a stored 0 ("no owner") maps to -1.
+static kmp_int32 __kmp_get_drdpa_lock_owner(kmp_drdpa_lock_t *lck) {
+  return -1 + lck->lk.owner_id;
+}
+
+// A lock is nestable unless depth_locked holds the value -1.
+static inline bool __kmp_is_drdpa_lock_nestable(kmp_drdpa_lock_t *lck) {
+  return !(lck->lk.depth_locked == -1);
+}
+
+// Blocking acquire of a DRDPA ticket lock.
+//
+// The caller draws a ticket, spins on the polling slot selected by
+// (ticket & mask) until that slot reaches its ticket, and then -- while
+// holding the lock -- frees a retired polling area if it is safe to do so and
+// possibly resizes the polling area (contracting it to one slot under
+// oversubscription, growing it when more threads are waiting than there are
+// slots). 'gtid' is not used here. Always returns KMP_LOCK_ACQUIRED_FIRST.
+__forceinline static int
+__kmp_acquire_drdpa_lock_timed_template(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
+  kmp_uint64 ticket = KMP_ATOMIC_INC(&lck->lk.next_ticket);
+  kmp_uint64 mask = lck->lk.mask; // atomic load
+  std::atomic<kmp_uint64> *polls = lck->lk.polls;
+
+#ifdef USE_LOCK_PROFILE
+  if (polls[ticket & mask] != ticket)
+    __kmp_printf("LOCK CONTENTION: %p\n", lck);
+/* else __kmp_printf( "." );*/
+#endif /* USE_LOCK_PROFILE */
+
+  // Now spin-wait, but reload the polls pointer and mask, in case the
+  // polling area has been reconfigured.  Unless it is reconfigured, the
+  // reloads stay in L1 cache and are cheap.
+  //
+  // Keep this code in sync with KMP_WAIT, in kmp_dispatch.cpp !!!
+  // The current implementation of KMP_WAIT doesn't allow for mask
+  // and poll to be re-read every spin iteration.
+  kmp_uint32 spins;
+  KMP_FSYNC_PREPARE(lck);
+  KMP_INIT_YIELD(spins);
+  while (polls[ticket & mask] < ticket) { // atomic load
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+    // Re-read the mask and the poll pointer from the lock structure.
+    //
+    // Make certain that "mask" is read before "polls" !!!
+    //
+    // If another thread reconfigures the polling area and updates these
+    // values, and we get the new value of mask and the old polls pointer, we
+    // could access memory beyond the end of the old polling area.
+    mask = lck->lk.mask; // atomic load
+    polls = lck->lk.polls; // atomic load
+  }
+
+  // Critical section starts here
+  KMP_FSYNC_ACQUIRED(lck);
+  KA_TRACE(1000, ("__kmp_acquire_drdpa_lock: ticket #%lld acquired lock %p\n",
+                  ticket, lck));
+  lck->lk.now_serving = ticket; // non-volatile store
+
+  // Deallocate a garbage polling area if we know that we are the last
+  // thread that could possibly access it.
+  //
+  // The >= check is in case __kmp_test_drdpa_lock() allocated the cleanup
+  // ticket.
+  if ((lck->lk.old_polls != NULL) && (ticket >= lck->lk.cleanup_ticket)) {
+    __kmp_free(lck->lk.old_polls);
+    lck->lk.old_polls = NULL;
+    lck->lk.cleanup_ticket = 0;
+  }
+
+  // Check to see if we should reconfigure the polling area.
+  // If there is still a garbage polling area to be deallocated from a
+  // previous reconfiguration, let a later thread reconfigure it.
+  if (lck->lk.old_polls == NULL) {
+    bool reconfigure = false;
+    std::atomic<kmp_uint64> *old_polls = polls;
+    kmp_uint32 num_polls = TCR_4(lck->lk.num_polls);
+
+    if (TCR_4(__kmp_nth) >
+        (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {
+      // We are in oversubscription mode.  Contract the polling area
+      // down to a single location, if that hasn't been done already.
+      if (num_polls > 1) {
+        reconfigure = true;
+        mask = 0;
+        num_polls = 1;
+        polls = (std::atomic<kmp_uint64> *)__kmp_allocate(num_polls *
+                                                          sizeof(*polls));
+        // The single slot must already show our ticket as being served.
+        polls[0] = ticket;
+      }
+    } else {
+      // We are in under/fully subscribed mode.  Check the number of
+      // threads waiting on the lock.  The size of the polling area
+      // should be at least the number of threads waiting.
+      kmp_uint64 num_waiting = TCR_8(lck->lk.next_ticket) - ticket - 1;
+      if (num_waiting > num_polls) {
+        kmp_uint32 old_num_polls = num_polls;
+        reconfigure = true;
+        // Double the area (and widen the mask by one bit) until it is
+        // strictly larger than the number of waiters.
+        do {
+          mask = (mask << 1) | 1;
+          num_polls *= 2;
+        } while (num_polls <= num_waiting);
+
+        // Allocate the new polling area, and copy the relevant portion
+        // of the old polling area to the new area.  __kmp_allocate()
+        // zeroes the memory it allocates, and most of the old area is
+        // just zero padding, so we only copy the release counters.
+        polls = (std::atomic<kmp_uint64> *)__kmp_allocate(num_polls *
+                                                          sizeof(*polls));
+        kmp_uint32 i;
+        for (i = 0; i < old_num_polls; i++) {
+          polls[i].store(old_polls[i]);
+        }
+      }
+    }
+
+    if (reconfigure) {
+      // Now write the updated fields back to the lock structure.
+      //
+      // Make certain that "polls" is written before "mask" !!!
+      //
+      // If another thread picks up the new value of mask and the old polls
+      // pointer, it could access memory beyond the end of the old polling
+      // area.
+      //
+      // On x86, we need memory fences.
+      KA_TRACE(1000, ("__kmp_acquire_drdpa_lock: ticket #%lld reconfiguring "
+                      "lock %p to %d polls\n",
+                      ticket, lck, num_polls));
+
+      lck->lk.old_polls = old_polls;
+      lck->lk.polls = polls; // atomic store
+
+      KMP_MB();
+
+      lck->lk.num_polls = num_polls;
+      lck->lk.mask = mask; // atomic store
+
+      KMP_MB();
+
+      // Only after the new polling area and mask have been flushed
+      // to main memory can we update the cleanup ticket field.
+      //
+      // volatile load / non-volatile store
+      lck->lk.cleanup_ticket = lck->lk.next_ticket;
+    }
+  }
+  return KMP_LOCK_ACQUIRED_FIRST;
+}
+
+// Acquires a simple DRDPA lock through the shared acquire template, then
+// reports the acquisition via ANNOTATE_DRDPA_ACQUIRED.  The template's result
+// is passed through unchanged.
+int __kmp_acquire_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
+  const int acquire_status = __kmp_acquire_drdpa_lock_timed_template(lck, gtid);
+  ANNOTATE_DRDPA_ACQUIRED(lck);
+  return acquire_status;
+}
+
+// Checked flavour of the simple DRDPA acquire; errors are reported under the
+// name "omp_set_lock".  Raises KMP_FATAL when the lock is uninitialized, is a
+// nestable lock, or is already owned by the caller.  Otherwise the lock is
+// acquired and the caller is recorded as owner (owner_id stores gtid + 1).
+static int __kmp_acquire_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
+                                                kmp_int32 gtid) {
+  char const *const api_name = "omp_set_lock";
+
+  // An initialized lock stores its own address in the initialized field.
+  if (lck != lck->lk.initialized) {
+    KMP_FATAL(LockIsUninitialized, api_name);
+  }
+  if (__kmp_is_drdpa_lock_nestable(lck)) {
+    KMP_FATAL(LockNestableUsedAsSimple, api_name);
+  }
+  // The self-ownership test is only made for a non-negative gtid.
+  if (gtid >= 0) {
+    if (gtid == __kmp_get_drdpa_lock_owner(lck)) {
+      KMP_FATAL(LockIsAlreadyOwned, api_name);
+    }
+  }
+
+  __kmp_acquire_drdpa_lock(lck, gtid);
+  lck->lk.owner_id = gtid + 1;
+
+  return KMP_LOCK_ACQUIRED_FIRST;
+}
+
+// Non-blocking attempt to take a simple DRDPA lock.
+//
+// Reads next_ticket and checks whether the polling slot selected by
+// (ticket & mask) already holds that ticket value.  If it does, the routine
+// tries to claim the ticket by advancing next_ticket with a compare-and-store;
+// on success the ticket is recorded in now_serving.
+//
+// Returns TRUE when the lock was acquired and FALSE otherwise.  The gtid
+// argument is not used by this routine.
+int __kmp_test_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
+  // First get a ticket, then read the polls pointer and the mask.
+  // The polls pointer must be read before the mask!!! (See above)
+  kmp_uint64 ticket = lck->lk.next_ticket; // atomic load
+  std::atomic<kmp_uint64> *polls = lck->lk.polls;
+  kmp_uint64 mask = lck->lk.mask; // atomic load
+  if (polls[ticket & mask] == ticket) {
+    kmp_uint64 next_ticket = ticket + 1;
+    // Claim the ticket only if nobody else advanced next_ticket meanwhile.
+    if (__kmp_atomic_compare_store_acq(&lck->lk.next_ticket, ticket,
+                                       next_ticket)) {
+      KMP_FSYNC_ACQUIRED(lck);
+      KA_TRACE(1000, ("__kmp_test_drdpa_lock: ticket #%lld acquired lock %p\n",
+                      ticket, lck));
+      lck->lk.now_serving = ticket; // non-volatile store
+
+      // Since no threads are waiting, there is no possibility that we would
+      // want to reconfigure the polling area.  We might have the cleanup ticket
+      // value (which says that it is now safe to deallocate old_polls), but
+      // we'll let a later thread which calls __kmp_acquire_lock do that - this
+      // routine isn't supposed to block, and we would risk blocks if we called
+      // __kmp_free() to do the deallocation.
+      return TRUE;
+    }
+  }
+  return FALSE;
+}
+
+// Checked flavour of the simple DRDPA test; errors are reported under the name
+// "omp_test_lock".  Raises KMP_FATAL when the lock is uninitialized or is a
+// nestable lock.  When the test succeeds the caller is recorded as owner
+// (owner_id stores gtid + 1).  Returns the result of __kmp_test_drdpa_lock.
+static int __kmp_test_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
+                                             kmp_int32 gtid) {
+  char const *const api_name = "omp_test_lock";
+
+  if (lck != lck->lk.initialized) {
+    KMP_FATAL(LockIsUninitialized, api_name);
+  }
+  if (__kmp_is_drdpa_lock_nestable(lck)) {
+    KMP_FATAL(LockNestableUsedAsSimple, api_name);
+  }
+
+  const int acquired = __kmp_test_drdpa_lock(lck, gtid);
+  if (acquired) {
+    lck->lk.owner_id = gtid + 1;
+  }
+
+  return acquired;
+}
+
+// Releases a simple DRDPA lock by publishing the next ticket number
+// (now_serving + 1) into the polling slot selected by (ticket & mask).
+// Always returns KMP_LOCK_RELEASED.  The gtid argument is not used here.
+int __kmp_release_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
+  // Read the ticket value from the lock data struct, then the polls pointer and
+  // the mask.  The polls pointer must be read before the mask!!! (See above)
+  kmp_uint64 ticket = lck->lk.now_serving + 1; // non-atomic load
+  std::atomic<kmp_uint64> *polls = lck->lk.polls; // atomic load
+  kmp_uint64 mask = lck->lk.mask; // atomic load
+  // The trace reports the ticket being released, i.e. ticket - 1.
+  KA_TRACE(1000, ("__kmp_release_drdpa_lock: ticket #%lld released lock %p\n",
+                  ticket - 1, lck));
+  KMP_FSYNC_RELEASING(lck);
+  ANNOTATE_DRDPA_RELEASED(lck);
+  polls[ticket & mask] = ticket; // atomic store
+  return KMP_LOCK_RELEASED;
+}
+
+// Checked flavour of the simple DRDPA release; errors are reported under the
+// name "omp_unset_lock".  Raises KMP_FATAL when the lock is uninitialized, is
+// a nestable lock, has no owner, or (for a non-negative gtid) is owned by a
+// different thread.  Otherwise clears owner_id and releases the lock,
+// returning the result of __kmp_release_drdpa_lock.
+static int __kmp_release_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
+                                                kmp_int32 gtid) {
+  char const *const api_name = "omp_unset_lock";
+
+  KMP_MB(); /* in case another processor initialized lock */
+
+  if (lck != lck->lk.initialized) {
+    KMP_FATAL(LockIsUninitialized, api_name);
+  }
+  if (__kmp_is_drdpa_lock_nestable(lck)) {
+    KMP_FATAL(LockNestableUsedAsSimple, api_name);
+  }
+  if (-1 == __kmp_get_drdpa_lock_owner(lck)) {
+    KMP_FATAL(LockUnsettingFree, api_name);
+  }
+  // Ownership mismatch is only diagnosed when both the caller's gtid and the
+  // recorded owner are non-negative.
+  if ((gtid >= 0) && (__kmp_get_drdpa_lock_owner(lck) >= 0)) {
+    if (gtid != __kmp_get_drdpa_lock_owner(lck)) {
+      KMP_FATAL(LockUnsettingSetByAnother, api_name);
+    }
+  }
+
+  lck->lk.owner_id = 0;
+  return __kmp_release_drdpa_lock(lck, gtid);
+}
+
+// Initializes a simple DRDPA lock: a one-entry polling area (mask 0), ticket
+// counters at zero, no owner, and depth_locked == -1 to mark it as a simple
+// (non-nestable) lock.
+void __kmp_init_drdpa_lock(kmp_drdpa_lock_t *lck) {
+  lck->lk.location = NULL;
+  lck->lk.mask = 0;
+  lck->lk.num_polls = 1;
+  // The polling area is sized from num_polls, which was just set above.
+  lck->lk.polls = (std::atomic<kmp_uint64> *)__kmp_allocate(
+      lck->lk.num_polls * sizeof(*(lck->lk.polls)));
+  lck->lk.cleanup_ticket = 0;
+  lck->lk.old_polls = NULL;
+  lck->lk.next_ticket = 0;
+  lck->lk.now_serving = 0;
+  lck->lk.owner_id = 0; // no thread owns the lock.
+  lck->lk.depth_locked = -1; // >= 0 for nestable locks, -1 for simple locks.
+  // Written last: the *_with_checks routines treat initialized == lck as the
+  // sign that the lock has been set up.
+  lck->lk.initialized = lck;
+
+  KA_TRACE(1000, ("__kmp_init_drdpa_lock: lock %p initialized\n", lck));
+}
+
+// Tears down a simple DRDPA lock: marks it uninitialized, frees the current
+// and the old polling areas if present, and resets the remaining fields.
+// No ownership or validity checks are made here.
+void __kmp_destroy_drdpa_lock(kmp_drdpa_lock_t *lck) {
+  // Cleared first so the *_with_checks routines see the lock as uninitialized.
+  lck->lk.initialized = NULL;
+  lck->lk.location = NULL;
+  if (lck->lk.polls.load() != NULL) {
+    __kmp_free(lck->lk.polls.load());
+    lck->lk.polls = NULL;
+  }
+  // old_polls may still hold a polling area left over from a reconfiguration.
+  if (lck->lk.old_polls != NULL) {
+    __kmp_free(lck->lk.old_polls);
+    lck->lk.old_polls = NULL;
+  }
+  lck->lk.mask = 0;
+  lck->lk.num_polls = 0;
+  lck->lk.cleanup_ticket = 0;
+  lck->lk.next_ticket = 0;
+  lck->lk.now_serving = 0;
+  lck->lk.owner_id = 0;
+  lck->lk.depth_locked = -1;
+}
+
+// Checked flavour of the simple DRDPA destroy; errors are reported under the
+// name "omp_destroy_lock".  Raises KMP_FATAL when the lock is uninitialized,
+// is a nestable lock, or still has an owner; otherwise destroys the lock.
+static void __kmp_destroy_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
+  char const *const api_name = "omp_destroy_lock";
+
+  if (lck != lck->lk.initialized) {
+    KMP_FATAL(LockIsUninitialized, api_name);
+  }
+  if (__kmp_is_drdpa_lock_nestable(lck)) {
+    KMP_FATAL(LockNestableUsedAsSimple, api_name);
+  }
+  // An owner value of -1 means nobody holds the lock.
+  if (-1 != __kmp_get_drdpa_lock_owner(lck)) {
+    KMP_FATAL(LockStillOwned, api_name);
+  }
+
+  __kmp_destroy_drdpa_lock(lck);
+}
+
+// nested drdpa ticket locks
+
+// Acquires a nestable DRDPA lock.
+//
+// If the calling thread already owns the lock, only the nesting depth is
+// incremented and KMP_LOCK_ACQUIRED_NEXT is returned.  Otherwise the lock is
+// acquired through the shared template, the depth is set to 1, the caller is
+// recorded as owner (owner_id stores gtid + 1) and KMP_LOCK_ACQUIRED_FIRST is
+// returned.  gtid must be non-negative (debug-asserted).
+int __kmp_acquire_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
+  KMP_DEBUG_ASSERT(gtid >= 0);
+
+  if (__kmp_get_drdpa_lock_owner(lck) == gtid) {
+    lck->lk.depth_locked += 1;
+    return KMP_LOCK_ACQUIRED_NEXT;
+  } else {
+    __kmp_acquire_drdpa_lock_timed_template(lck, gtid);
+    ANNOTATE_DRDPA_ACQUIRED(lck);
+    // Barriers separate the acquisition, the depth store and the owner store.
+    KMP_MB();
+    lck->lk.depth_locked = 1;
+    KMP_MB();
+    lck->lk.owner_id = gtid + 1;
+    return KMP_LOCK_ACQUIRED_FIRST;
+  }
+}
+
+// Checked flavour of the nestable DRDPA acquire; errors are reported under the
+// name "omp_set_nest_lock".  Raises KMP_FATAL when the lock is uninitialized
+// or is not a nestable lock; otherwise acquires it.  The acquire result is
+// discarded because this routine returns void.
+static void __kmp_acquire_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
+                                                        kmp_int32 gtid) {
+  char const *const api_name = "omp_set_nest_lock";
+
+  if (lck != lck->lk.initialized) {
+    KMP_FATAL(LockIsUninitialized, api_name);
+  }
+  if (!__kmp_is_drdpa_lock_nestable(lck)) {
+    KMP_FATAL(LockSimpleUsedAsNestable, api_name);
+  }
+
+  __kmp_acquire_nested_drdpa_lock(lck, gtid);
+}
+
+// Non-blocking attempt to take a nestable DRDPA lock.
+//
+// Returns the new nesting depth on success: the incremented depth when the
+// caller already owns the lock, or 1 when the lock was free and has just been
+// taken.  Returns 0 when the lock could not be acquired.  gtid must be
+// non-negative (debug-asserted).
+int __kmp_test_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
+  int retval;
+
+  KMP_DEBUG_ASSERT(gtid >= 0);
+
+  if (__kmp_get_drdpa_lock_owner(lck) == gtid) {
+    // Re-entrant acquisition by the current owner.
+    retval = ++lck->lk.depth_locked;
+  } else if (!__kmp_test_drdpa_lock(lck, gtid)) {
+    retval = 0;
+  } else {
+    // First acquisition: record depth, then owner, with barriers in between.
+    KMP_MB();
+    retval = lck->lk.depth_locked = 1;
+    KMP_MB();
+    lck->lk.owner_id = gtid + 1;
+  }
+  return retval;
+}
+
+// Checked flavour of the nestable DRDPA test; errors are reported under the
+// name "omp_test_nest_lock".  Raises KMP_FATAL when the lock is uninitialized
+// or is not a nestable lock; otherwise returns the result of
+// __kmp_test_nested_drdpa_lock.
+static int __kmp_test_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
+                                                    kmp_int32 gtid) {
+  char const *const api_name = "omp_test_nest_lock";
+
+  if (lck != lck->lk.initialized) {
+    KMP_FATAL(LockIsUninitialized, api_name);
+  }
+  if (!__kmp_is_drdpa_lock_nestable(lck)) {
+    KMP_FATAL(LockSimpleUsedAsNestable, api_name);
+  }
+
+  const int depth = __kmp_test_nested_drdpa_lock(lck, gtid);
+  return depth;
+}
+
+// Releases one nesting level of a nestable DRDPA lock.
+//
+// Decrements the nesting depth; when it reaches zero the owner is cleared, the
+// underlying lock is released and KMP_LOCK_RELEASED is returned.  Otherwise
+// the lock stays held and KMP_LOCK_STILL_HELD is returned.  gtid must be
+// non-negative (debug-asserted).
+int __kmp_release_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
+  KMP_DEBUG_ASSERT(gtid >= 0);
+
+  KMP_MB();
+  if (--(lck->lk.depth_locked) == 0) {
+    KMP_MB();
+    // Clear ownership before handing the lock to the next ticket holder.
+    lck->lk.owner_id = 0;
+    __kmp_release_drdpa_lock(lck, gtid);
+    return KMP_LOCK_RELEASED;
+  }
+  return KMP_LOCK_STILL_HELD;
+}
+
+// Checked flavour of the nestable DRDPA release; errors are reported under the
+// name "omp_unset_nest_lock".  Raises KMP_FATAL when the lock is
+// uninitialized, is not a nestable lock, has no owner, or is owned by a thread
+// other than the caller.  Otherwise returns the result of
+// __kmp_release_nested_drdpa_lock.
+static int __kmp_release_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
+                                                       kmp_int32 gtid) {
+  char const *const api_name = "omp_unset_nest_lock";
+
+  KMP_MB(); /* in case another processor initialized lock */
+
+  if (lck != lck->lk.initialized) {
+    KMP_FATAL(LockIsUninitialized, api_name);
+  }
+  if (!__kmp_is_drdpa_lock_nestable(lck)) {
+    KMP_FATAL(LockSimpleUsedAsNestable, api_name);
+  }
+  // An owner value of -1 means nobody holds the lock.
+  if (-1 == __kmp_get_drdpa_lock_owner(lck)) {
+    KMP_FATAL(LockUnsettingFree, api_name);
+  }
+  if (gtid != __kmp_get_drdpa_lock_owner(lck)) {
+    KMP_FATAL(LockUnsettingSetByAnother, api_name);
+  }
+
+  return __kmp_release_nested_drdpa_lock(lck, gtid);
+}
+
+// Initializes a nestable DRDPA lock: performs the simple-lock initialization,
+// then overrides depth_locked with 0 to mark the lock as nestable and unheld.
+void __kmp_init_nested_drdpa_lock(kmp_drdpa_lock_t *lck) {
+  __kmp_init_drdpa_lock(lck);
+  lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
+}
+
+// Destroys a nestable DRDPA lock: performs the simple-lock teardown, then
+// resets depth_locked to 0 (the simple teardown leaves it at -1).
+void __kmp_destroy_nested_drdpa_lock(kmp_drdpa_lock_t *lck) {
+  __kmp_destroy_drdpa_lock(lck);
+  lck->lk.depth_locked = 0;
+}
+
+// Checked flavour of the nestable DRDPA destroy; errors are reported under the
+// name "omp_destroy_nest_lock".  Raises KMP_FATAL when the lock is
+// uninitialized, is not a nestable lock, or still has an owner; otherwise
+// destroys the lock.
+static void __kmp_destroy_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
+  char const *const api_name = "omp_destroy_nest_lock";
+
+  if (lck != lck->lk.initialized) {
+    KMP_FATAL(LockIsUninitialized, api_name);
+  }
+  if (!__kmp_is_drdpa_lock_nestable(lck)) {
+    KMP_FATAL(LockSimpleUsedAsNestable, api_name);
+  }
+  // An owner value of -1 means nobody holds the lock.
+  if (-1 != __kmp_get_drdpa_lock_owner(lck)) {
+    KMP_FATAL(LockStillOwned, api_name);
+  }
+
+  __kmp_destroy_nested_drdpa_lock(lck);
+}
+
+// Access functions for fields which don't exist for all lock kinds.
+
+// Returns the source-location descriptor stored in the lock.
+static const ident_t *__kmp_get_drdpa_lock_location(kmp_drdpa_lock_t *lck) {
+  return lck->lk.location;
+}
+
+// Stores a source-location descriptor in the lock.
+static void __kmp_set_drdpa_lock_location(kmp_drdpa_lock_t *lck,
+                                          const ident_t *loc) {
+  lck->lk.location = loc;
+}
+
+// Returns the flags word stored in the lock.
+static kmp_lock_flags_t __kmp_get_drdpa_lock_flags(kmp_drdpa_lock_t *lck) {
+  return lck->lk.flags;
+}
+
+// Stores a flags word in the lock.
+static void __kmp_set_drdpa_lock_flags(kmp_drdpa_lock_t *lck,
+                                       kmp_lock_flags_t flags) {
+  lck->lk.flags = flags;
+}
+
+// Time stamp counter
+// __kmp_tsc() is the clock source used by __kmp_spin_backoff().  The default
+// backoff parameters are defined alongside it because their tick values are
+// expressed in the units of the selected clock.
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+#define __kmp_tsc() __kmp_hardware_timestamp()
+// Runtime's default backoff parameters
+kmp_backoff_t __kmp_spin_backoff_params = {1, 4096, 100};
+#else
+// Use nanoseconds for other platforms
+extern kmp_uint64 __kmp_now_nsec();
+kmp_backoff_t __kmp_spin_backoff_params = {1, 256, 100};
+#define __kmp_tsc() __kmp_now_nsec()
+#endif
+
+// A useful predicate for dealing with timestamps that may wrap.
+// Is a before b? Since the timestamps may wrap, this is asking whether it's
+// shorter to go clockwise from a to b around the clock-face, or anti-clockwise.
+// Times where going clockwise is less distance than going anti-clockwise
+// are in the future, others are in the past. e.g. a = MAX-1, b = MAX+1 (=0),
+// then a > b (true) does not mean a reached b; whereas the signed distance
+// from a to b is +2, which captures the actual difference.
+//
+// Returns true when b lies strictly ahead of a by less than half the 64-bit
+// range, false otherwise (including when a == b).
+static inline bool before(kmp_uint64 a, kmp_uint64 b) {
+  // Subtract in the unsigned domain, where wrap-around is well defined, and
+  // only then reinterpret the distance as signed.  Converting each operand
+  // first and subtracting as kmp_int64 overflows (undefined behaviour) when
+  // the two stamps are more than half the range apart.
+  return (kmp_int64)(b - a) > 0;
+}
+
+// Truncated binary exponential backoff function.
+//
+// Spins for boff->step rounds, each lasting until __kmp_tsc() passes a goal
+// set boff->min_tick ticks ahead, then grows the step to (2 * step + 1)
+// masked by (max_backoff - 1).
+void __kmp_spin_backoff(kmp_backoff_t *boff) {
+  // The rounds are kept as a nested loop rather than one flattened wait; the
+  // nested form was observed to give better results.
+  kmp_uint32 rounds_left = boff->step;
+  while (rounds_left > 0) {
+    kmp_uint64 deadline = __kmp_tsc() + boff->min_tick;
+    do {
+      KMP_CPU_PAUSE();
+    } while (before(__kmp_tsc(), deadline));
+    rounds_left--;
+  }
+  boff->step = ((boff->step << 1) | 1) & (boff->max_backoff - 1);
+}
+
+#if KMP_USE_DYNAMIC_LOCK
+
+// Direct lock initializer. It simply writes the tag derived from the lock
+// sequence (KMP_GET_D_TAG(seq)) into the lock word.
+static void __kmp_init_direct_lock(kmp_dyna_lock_t *lck,
+                                   kmp_dyna_lockseq_t seq) {
+  TCW_4(*lck, KMP_GET_D_TAG(seq));
+  KA_TRACE(
+      20,
+      ("__kmp_init_direct_lock: initialized direct lock with type#%d\n", seq));
+}
+
+#if KMP_USE_TSX
+
+// HLE lock functions - imported from the testbed runtime.
+// Raw prefix bytes emitted in front of the locking instructions below.
+// NOTE(review): presumably the x86 XACQUIRE / XRELEASE prefixes - confirm
+// against the instruction-set reference.
+#define HLE_ACQUIRE ".byte 0xf2;"
+#define HLE_RELEASE ".byte 0xf3;"
+
+// Exchanges v with *p using an HLE_ACQUIRE-prefixed xchg and returns the value
+// that was previously stored at *p.
+static inline kmp_uint32 swap4(kmp_uint32 volatile *p, kmp_uint32 v) {
+  __asm__ volatile(HLE_ACQUIRE "xchg %1,%0" : "+r"(v), "+m"(*p) : : "memory");
+  return v;
+}
+
+// Destroys an HLE lock by writing 0 to the lock word.
+static void __kmp_destroy_hle_lock(kmp_dyna_lock_t *lck) { TCW_4(*lck, 0); }
+
+// Checked flavour of the HLE destroy; currently performs no extra checks and
+// just writes 0 to the lock word.
+static void __kmp_destroy_hle_lock_with_checks(kmp_dyna_lock_t *lck) {
+  TCW_4(*lck, 0);
+}
+
+// Acquires an HLE lock.  Each attempt swaps the busy value into the lock word;
+// if the previous value was not the free value, the routine spins with a small
+// growing pause count (capped at 7) until the word reads free, then tries the
+// swap again.
+static void __kmp_acquire_hle_lock(kmp_dyna_lock_t *lck, kmp_int32 gtid) {
+  // Use gtid for KMP_LOCK_BUSY if necessary
+  int pause_count = 1;
+  while (swap4(lck, KMP_LOCK_BUSY(1, hle)) != KMP_LOCK_FREE(hle)) {
+    // Wait on plain reads until the lock looks free before swapping again.
+    while (*(kmp_uint32 volatile *)lck != KMP_LOCK_FREE(hle)) {
+      int remaining = pause_count;
+      while (remaining != 0) {
+        KMP_CPU_PAUSE();
+        --remaining;
+      }
+      pause_count = ((pause_count << 1) | 1) & 7;
+    }
+  }
+}
+
+// Checked flavour of the HLE acquire; currently just forwards to the
+// unchecked routine.
+static void __kmp_acquire_hle_lock_with_checks(kmp_dyna_lock_t *lck,
+                                               kmp_int32 gtid) {
+  __kmp_acquire_hle_lock(lck, gtid); // TODO: add checks
+}
+
+// Releases an HLE lock by storing the free value into the lock word with an
+// HLE_RELEASE-prefixed movl.  Always returns KMP_LOCK_RELEASED; gtid is not
+// used.
+static int __kmp_release_hle_lock(kmp_dyna_lock_t *lck, kmp_int32 gtid) {
+  __asm__ volatile(HLE_RELEASE "movl %1,%0"
+                   : "=m"(*lck)
+                   : "r"(KMP_LOCK_FREE(hle))
+                   : "memory");
+  return KMP_LOCK_RELEASED;
+}
+
+// Checked flavour of the HLE release; currently just forwards to the
+// unchecked routine.
+static int __kmp_release_hle_lock_with_checks(kmp_dyna_lock_t *lck,
+                                              kmp_int32 gtid) {
+  return __kmp_release_hle_lock(lck, gtid); // TODO: add checks
+}
+
+// Single non-blocking acquisition attempt: swaps the busy value into the lock
+// word and returns non-zero if the previous value was the free value.
+static int __kmp_test_hle_lock(kmp_dyna_lock_t *lck, kmp_int32 gtid) {
+  return swap4(lck, KMP_LOCK_BUSY(1, hle)) == KMP_LOCK_FREE(hle);
+}
+
+// Checked flavour of the HLE test; currently just forwards to the unchecked
+// routine.
+static int __kmp_test_hle_lock_with_checks(kmp_dyna_lock_t *lck,
+                                           kmp_int32 gtid) {
+  return __kmp_test_hle_lock(lck, gtid); // TODO: add checks
+}
+
+// An RTM lock is backed by a queuing lock; initialization simply initializes
+// that queuing lock.
+static void __kmp_init_rtm_lock(kmp_queuing_lock_t *lck) {
+  __kmp_init_queuing_lock(lck);
+}
+
+// Destroys the queuing lock backing an RTM lock.
+static void __kmp_destroy_rtm_lock(kmp_queuing_lock_t *lck) {
+  __kmp_destroy_queuing_lock(lck);
+}
+
+// Checked flavour of the RTM destroy; forwards to the checked queuing-lock
+// destroy.
+static void __kmp_destroy_rtm_lock_with_checks(kmp_queuing_lock_t *lck) {
+  __kmp_destroy_queuing_lock_with_checks(lck);
+}
+
+// Acquires an RTM lock, preferring speculative execution.
+//
+// Makes up to four attempts (retries starts at 3 and is post-decremented) to
+// start a transaction with _xbegin().  Inside a started transaction, if the
+// backing queuing lock is unlocked the routine returns with the transaction
+// still open; otherwise it aborts explicitly with code 0xff.  After such an
+// explicit abort it waits for the lock to become free before retrying.  For
+// any other abort it stops retrying unless the status has _XABORT_RETRY set.
+// When speculation is given up, the queuing lock is acquired for real.
+static void __kmp_acquire_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
+  unsigned retries = 3, status;
+  do {
+    status = _xbegin();
+    if (status == _XBEGIN_STARTED) {
+      if (__kmp_is_unlocked_queuing_lock(lck))
+        return;
+      _xabort(0xff);
+    }
+    if ((status & _XABORT_EXPLICIT) && _XABORT_CODE(status) == 0xff) {
+      // Wait until lock becomes free
+      while (!__kmp_is_unlocked_queuing_lock(lck)) {
+        KMP_YIELD(TRUE);
+      }
+    } else if (!(status & _XABORT_RETRY))
+      break;
+  } while (retries--);
+
+  // Fall-back non-speculative lock (xchg)
+  __kmp_acquire_queuing_lock(lck, gtid);
+}
+
+// Checked flavour of the RTM acquire; performs no extra checks and forwards to
+// the unchecked routine.
+static void __kmp_acquire_rtm_lock_with_checks(kmp_queuing_lock_t *lck,
+                                               kmp_int32 gtid) {
+  __kmp_acquire_rtm_lock(lck, gtid);
+}
+
+// Releases an RTM lock.  If the backing queuing lock is actually held, it is
+// released for real; if it reads as unlocked, the caller is taken to be
+// running speculatively and the transaction is ended with _xend().  Always
+// returns KMP_LOCK_RELEASED.
+static int __kmp_release_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
+  if (!__kmp_is_unlocked_queuing_lock(lck)) {
+    // The fall-back path took the real lock, so release it.
+    __kmp_release_queuing_lock(lck, gtid);
+  } else {
+    // Lock word is untouched: commit the speculative region.
+    _xend();
+  }
+  return KMP_LOCK_RELEASED;
+}
+
+// Checked flavour of the RTM release; performs no extra checks and forwards to
+// the unchecked routine.
+static int __kmp_release_rtm_lock_with_checks(kmp_queuing_lock_t *lck,
+                                              kmp_int32 gtid) {
+  return __kmp_release_rtm_lock(lck, gtid);
+}
+
+// Non-blocking attempt to take an RTM lock.
+//
+// Makes up to four attempts (retries starts at 3 and is post-decremented) to
+// start a transaction.  If a transaction starts and the backing queuing lock
+// is unlocked, returns 1 with the transaction still open.  The loop stops
+// early when an abort status does not have _XABORT_RETRY set.
+//
+// When speculation is given up, the routine makes a real non-blocking attempt
+// on the queuing lock and returns its result.  It must not report success
+// merely because the lock looks free: the caller would then believe it holds
+// a lock it never acquired, and the matching release would see an unlocked
+// lock and execute _xend() with no transaction open.
+//
+// NOTE(review): if a transaction starts while the lock is held, the loop
+// continues without ending that transaction - confirm this is intended.
+static int __kmp_test_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
+  unsigned retries = 3, status;
+  do {
+    status = _xbegin();
+    if (status == _XBEGIN_STARTED && __kmp_is_unlocked_queuing_lock(lck)) {
+      return 1;
+    }
+    if (!(status & _XABORT_RETRY))
+      break;
+  } while (retries--);
+
+  // Fall-back: actually try to take the queuing lock without blocking.
+  return __kmp_test_queuing_lock(lck, gtid);
+}
+
+// Checked flavour of the RTM test; performs no extra checks and forwards to
+// the unchecked routine.
+static int __kmp_test_rtm_lock_with_checks(kmp_queuing_lock_t *lck,
+                                           kmp_int32 gtid) {
+  return __kmp_test_rtm_lock(lck, gtid);
+}
+
+#endif // KMP_USE_TSX
+
+// Entry functions for indirect locks (first element of direct lock jump tables)
+static void __kmp_init_indirect_lock(kmp_dyna_lock_t *l,
+                                     kmp_dyna_lockseq_t tag);
+static void __kmp_destroy_indirect_lock(kmp_dyna_lock_t *lock);
+static int __kmp_set_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32);
+static int __kmp_unset_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32);
+static int __kmp_test_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32);
+static int __kmp_set_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
+                                               kmp_int32);
+static int __kmp_unset_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
+                                                 kmp_int32);
+static int __kmp_test_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
+                                                kmp_int32);
+
+// Lock function definitions for the union parameter type
+// The macros below generate overloads that take a kmp_user_lock_p and forward
+// to the same-named function on the matching union member (&lock->lk).
+
+// Applies m(kind, a) once for each of the ticket, queuing and drdpa kinds.
+#define KMP_FOREACH_LOCK_KIND(m, a) m(ticket, a) m(queuing, a) m(drdpa, a)
+
+// expand1: void __kmp_<op>_<lk>_lock(kmp_user_lock_p) forwarding wrapper.
+#define expand1(lk, op)                                                        \
+  static void __kmp_##op##_##lk##_##lock(kmp_user_lock_p lock) {               \
+    __kmp_##op##_##lk##_##lock(&lock->lk);                                     \
+  }
+// expand2: int __kmp_<op>_<lk>_lock(kmp_user_lock_p, kmp_int32) forwarding
+// wrapper that returns the forwarded result.
+#define expand2(lk, op)                                                        \
+  static int __kmp_##op##_##lk##_##lock(kmp_user_lock_p lock,                  \
+                                        kmp_int32 gtid) {                      \
+    return __kmp_##op##_##lk##_##lock(&lock->lk, gtid);                        \
+  }
+// expand3: __kmp_set_<lk>_lock_flags wrapper; the op argument is unused.
+#define expand3(lk, op)                                                        \
+  static void __kmp_set_##lk##_##lock_flags(kmp_user_lock_p lock,              \
+                                            kmp_lock_flags_t flags) {          \
+    __kmp_set_##lk##_lock_flags(&lock->lk, flags);                             \
+  }
+// expand4: __kmp_set_<lk>_lock_location wrapper; the op argument is unused.
+#define expand4(lk, op)                                                        \
+  static void __kmp_set_##lk##_##lock_location(kmp_user_lock_p lock,           \
+                                               const ident_t *loc) {           \
+    __kmp_set_##lk##_lock_location(&lock->lk, loc);                            \
+  }
+
+// Instantiate the wrappers for every operation and every lock kind.
+KMP_FOREACH_LOCK_KIND(expand1, init)
+KMP_FOREACH_LOCK_KIND(expand1, init_nested)
+KMP_FOREACH_LOCK_KIND(expand1, destroy)
+KMP_FOREACH_LOCK_KIND(expand1, destroy_nested)
+KMP_FOREACH_LOCK_KIND(expand2, acquire)
+KMP_FOREACH_LOCK_KIND(expand2, acquire_nested)
+KMP_FOREACH_LOCK_KIND(expand2, release)
+KMP_FOREACH_LOCK_KIND(expand2, release_nested)
+KMP_FOREACH_LOCK_KIND(expand2, test)
+KMP_FOREACH_LOCK_KIND(expand2, test_nested)
+KMP_FOREACH_LOCK_KIND(expand3, )
+KMP_FOREACH_LOCK_KIND(expand4, )
+
+#undef expand1
+#undef expand2
+#undef expand3
+#undef expand4
+
+// Jump tables for the direct lock functions
+// Only fill in the odd entries, that avoids the need to shift out the low bit
+
+// init functions
+#define expand(l, op) 0, __kmp_init_direct_lock,
+void (*__kmp_direct_init[])(kmp_dyna_lock_t *, kmp_dyna_lockseq_t) = {
+    __kmp_init_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, init)};
+#undef expand
+
+// destroy functions
+#define expand(l, op) 0, (void (*)(kmp_dyna_lock_t *))__kmp_##op##_##l##_lock,
+static void (*direct_destroy[])(kmp_dyna_lock_t *) = {
+    __kmp_destroy_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, destroy)};
+#undef expand
+#define expand(l, op)                                                          \
+  0, (void (*)(kmp_dyna_lock_t *))__kmp_destroy_##l##_lock_with_checks,
+static void (*direct_destroy_check[])(kmp_dyna_lock_t *) = {
+    __kmp_destroy_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, destroy)};
+#undef expand
+
+// set/acquire functions
+#define expand(l, op)                                                          \
+  0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock,
+static int (*direct_set[])(kmp_dyna_lock_t *, kmp_int32) = {
+    __kmp_set_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, acquire)};
+#undef expand
+#define expand(l, op)                                                          \
+  0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock_with_checks,
+static int (*direct_set_check[])(kmp_dyna_lock_t *, kmp_int32) = {
+    __kmp_set_indirect_lock_with_checks, 0,
+    KMP_FOREACH_D_LOCK(expand, acquire)};
+#undef expand
+
+// unset/release and test functions
+#define expand(l, op)                                                          \
+  0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock,
+static int (*direct_unset[])(kmp_dyna_lock_t *, kmp_int32) = {
+    __kmp_unset_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, release)};
+static int (*direct_test[])(kmp_dyna_lock_t *, kmp_int32) = {
+    __kmp_test_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, test)};
+#undef expand
+#define expand(l, op)                                                          \
+  0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock_with_checks,
+static int (*direct_unset_check[])(kmp_dyna_lock_t *, kmp_int32) = {
+    __kmp_unset_indirect_lock_with_checks, 0,
+    KMP_FOREACH_D_LOCK(expand, release)};
+static int (*direct_test_check[])(kmp_dyna_lock_t *, kmp_int32) = {
+    __kmp_test_indirect_lock_with_checks, 0, KMP_FOREACH_D_LOCK(expand, test)};
+#undef expand
+
+// Exposes only one set of jump tables (*lock or *lock_with_checks).
+void (*(*__kmp_direct_destroy))(kmp_dyna_lock_t *) = 0;
+int (*(*__kmp_direct_set))(kmp_dyna_lock_t *, kmp_int32) = 0;
+int (*(*__kmp_direct_unset))(kmp_dyna_lock_t *, kmp_int32) = 0;
+int (*(*__kmp_direct_test))(kmp_dyna_lock_t *, kmp_int32) = 0;
+
+// Jump tables for the indirect lock functions
+#define expand(l, op) (void (*)(kmp_user_lock_p)) __kmp_##op##_##l##_##lock,
+void (*__kmp_indirect_init[])(kmp_user_lock_p) = {
+    KMP_FOREACH_I_LOCK(expand, init)};
+#undef expand
+
+#define expand(l, op) (void (*)(kmp_user_lock_p)) __kmp_##op##_##l##_##lock,
+static void (*indirect_destroy[])(kmp_user_lock_p) = {
+    KMP_FOREACH_I_LOCK(expand, destroy)};
+#undef expand
+#define expand(l, op)                                                          \
+  (void (*)(kmp_user_lock_p)) __kmp_##op##_##l##_##lock_with_checks,
+static void (*indirect_destroy_check[])(kmp_user_lock_p) = {
+    KMP_FOREACH_I_LOCK(expand, destroy)};
+#undef expand
+
+// set/acquire functions
+#define expand(l, op)                                                          \
+  (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock,
+static int (*indirect_set[])(kmp_user_lock_p,
+                             kmp_int32) = {KMP_FOREACH_I_LOCK(expand, acquire)};
+#undef expand
+#define expand(l, op)                                                          \
+  (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock_with_checks,
+static int (*indirect_set_check[])(kmp_user_lock_p, kmp_int32) = {
+    KMP_FOREACH_I_LOCK(expand, acquire)};
+#undef expand
+
+// unset/release and test functions
+#define expand(l, op)                                                          \
+  (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock,
+static int (*indirect_unset[])(kmp_user_lock_p, kmp_int32) = {
+    KMP_FOREACH_I_LOCK(expand, release)};
+static int (*indirect_test[])(kmp_user_lock_p,
+                              kmp_int32) = {KMP_FOREACH_I_LOCK(expand, test)};
+#undef expand
+#define expand(l, op)                                                          \
+  (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock_with_checks,
+static int (*indirect_unset_check[])(kmp_user_lock_p, kmp_int32) = {
+    KMP_FOREACH_I_LOCK(expand, release)};
+static int (*indirect_test_check[])(kmp_user_lock_p, kmp_int32) = {
+    KMP_FOREACH_I_LOCK(expand, test)};
+#undef expand
+
+// Exposes only one set of jump tables (*lock or *lock_with_checks).
+void (*(*__kmp_indirect_destroy))(kmp_user_lock_p) = 0;
+int (*(*__kmp_indirect_set))(kmp_user_lock_p, kmp_int32) = 0;
+int (*(*__kmp_indirect_unset))(kmp_user_lock_p, kmp_int32) = 0;
+int (*(*__kmp_indirect_test))(kmp_user_lock_p, kmp_int32) = 0;
+
+// Lock index table.
+kmp_indirect_lock_table_t __kmp_i_lock_table;
+
+// Size of indirect locks.
+static kmp_uint32 __kmp_indirect_lock_size[KMP_NUM_I_LOCKS] = {0};
+
+// Jump tables for lock accessor/modifier.
+void (*__kmp_indirect_set_location[KMP_NUM_I_LOCKS])(kmp_user_lock_p,
+                                                     const ident_t *) = {0};
+void (*__kmp_indirect_set_flags[KMP_NUM_I_LOCKS])(kmp_user_lock_p,
+                                                  kmp_lock_flags_t) = {0};
+const ident_t *(*__kmp_indirect_get_location[KMP_NUM_I_LOCKS])(
+    kmp_user_lock_p) = {0};
+kmp_lock_flags_t (*__kmp_indirect_get_flags[KMP_NUM_I_LOCKS])(
+    kmp_user_lock_p) = {0};
+
+// Use different lock pools for different lock types.
+static kmp_indirect_lock_t *__kmp_indirect_lock_pool[KMP_NUM_I_LOCKS] = {0};
+
+// User lock allocator for dynamically dispatched indirect locks. Every entry of
+// the indirect lock table holds the address and type of the allocated indirect
+// lock (kmp_indirect_lock_t), and the size of the table doubles when it is
+// full. A destroyed indirect lock object is returned to the reusable pool of
+// locks, unique to each lock type.
+//
+// user_lock: location of the user's lock word; receives either the table index
+//            shifted left by one (when OMP_LOCK_T_SIZE < sizeof(void *)) or
+//            the lock object's address.
+// gtid:      passed to the global lock that serializes pool/table updates.
+// tag:       indirect lock type; selects the pool and the base object size.
+// Returns the indirect lock object, with its type field set to tag.
+kmp_indirect_lock_t *__kmp_allocate_indirect_lock(void **user_lock,
+                                                  kmp_int32 gtid,
+                                                  kmp_indirect_locktag_t tag) {
+  kmp_indirect_lock_t *lck;
+  kmp_lock_index_t idx;
+
+  __kmp_acquire_lock(&__kmp_global_lock, gtid);
+
+  if (__kmp_indirect_lock_pool[tag] != NULL) {
+    // Reuse the allocated and destroyed lock object
+    lck = __kmp_indirect_lock_pool[tag];
+    // The index is only needed (and only read below) for small lock words.
+    if (OMP_LOCK_T_SIZE < sizeof(void *))
+      idx = lck->lock->pool.index;
+    __kmp_indirect_lock_pool[tag] = (kmp_indirect_lock_t *)lck->lock->pool.next;
+    KA_TRACE(20, ("__kmp_allocate_indirect_lock: reusing an existing lock %p\n",
+                  lck));
+  } else {
+    idx = __kmp_i_lock_table.next;
+    // Check capacity and double the size if it is full
+    if (idx == __kmp_i_lock_table.size) {
+      // Double up the space for block pointers
+      int row = __kmp_i_lock_table.size / KMP_I_LOCK_CHUNK;
+      kmp_indirect_lock_t **new_table = (kmp_indirect_lock_t **)__kmp_allocate(
+          2 * row * sizeof(kmp_indirect_lock_t *));
+      // Existing blocks are kept; only the array of block pointers is replaced.
+      KMP_MEMCPY(new_table, __kmp_i_lock_table.table,
+                 row * sizeof(kmp_indirect_lock_t *));
+      kmp_indirect_lock_t **old_table = __kmp_i_lock_table.table;
+      __kmp_i_lock_table.table = new_table;
+      __kmp_free(old_table);
+      // Allocate new objects in the new blocks
+      for (int i = row; i < 2 * row; ++i)
+        *(__kmp_i_lock_table.table + i) = (kmp_indirect_lock_t *)__kmp_allocate(
+            KMP_I_LOCK_CHUNK * sizeof(kmp_indirect_lock_t));
+      // idx equals the old size here, so this doubles the recorded capacity.
+      __kmp_i_lock_table.size = 2 * idx;
+    }
+    __kmp_i_lock_table.next++;
+    lck = KMP_GET_I_LOCK(idx);
+    // Allocate a new base lock object
+    lck->lock = (kmp_user_lock_p)__kmp_allocate(__kmp_indirect_lock_size[tag]);
+    KA_TRACE(20,
+             ("__kmp_allocate_indirect_lock: allocated a new lock %p\n", lck));
+  }
+
+  __kmp_release_lock(&__kmp_global_lock, gtid);
+
+  lck->type = tag;
+
+  if (OMP_LOCK_T_SIZE < sizeof(void *)) {
+    *((kmp_lock_index_t *)user_lock) = idx
+                                       << 1; // indirect lock word must be even
+  } else {
+    *((kmp_indirect_lock_t **)user_lock) = lck;
+  }
+
+  return lck;
+}
+
+// User lock lookup for dynamically dispatched locks.
+//
+// Translates the user's lock word into the indirect lock object: through the
+// lock table when OMP_LOCK_T_SIZE < sizeof(void *), otherwise by reading the
+// stored pointer.  When consistency checking is enabled, a NULL user_lock, an
+// index at or beyond the table size, or a NULL result raises KMP_FATAL
+// (LockIsUninitialized) reported under the name given in func.
+static __forceinline kmp_indirect_lock_t *
+__kmp_lookup_indirect_lock(void **user_lock, const char *func) {
+  // Fast path: no validation when consistency checking is off.
+  if (!__kmp_env_consistency_check) {
+    if (OMP_LOCK_T_SIZE < sizeof(void *)) {
+      return KMP_GET_I_LOCK(KMP_EXTRACT_I_INDEX(user_lock));
+    }
+    return *((kmp_indirect_lock_t **)user_lock);
+  }
+
+  if (user_lock == NULL) {
+    KMP_FATAL(LockIsUninitialized, func);
+  }
+
+  kmp_indirect_lock_t *found = NULL;
+  if (OMP_LOCK_T_SIZE < sizeof(void *)) {
+    kmp_lock_index_t slot = KMP_EXTRACT_I_INDEX(user_lock);
+    if (slot >= __kmp_i_lock_table.size) {
+      KMP_FATAL(LockIsUninitialized, func);
+    }
+    found = KMP_GET_I_LOCK(slot);
+  } else {
+    found = *((kmp_indirect_lock_t **)user_lock);
+  }
+
+  if (found == NULL) {
+    KMP_FATAL(LockIsUninitialized, func);
+  }
+  return found;
+}
+
+// Initializes an indirect lock for the requested lock sequence.
+//
+// When the CPU does not report RTM support, adaptive (with a warning) and rtm
+// requests are downgraded to a queuing lock.  The indirect lock object is then
+// allocated (or reused from the pool), the user's lock word is filled in by
+// the allocator, and the type-specific init function is called on the base
+// lock.
+static void __kmp_init_indirect_lock(kmp_dyna_lock_t *lock,
+                                     kmp_dyna_lockseq_t seq) {
+#if KMP_USE_ADAPTIVE_LOCKS
+  if (seq == lockseq_adaptive && !__kmp_cpuinfo.rtm) {
+    KMP_WARNING(AdaptiveNotSupported, "kmp_lockseq_t", "adaptive");
+    seq = lockseq_queuing;
+  }
+#endif
+#if KMP_USE_TSX
+  if (seq == lockseq_rtm && !__kmp_cpuinfo.rtm) {
+    seq = lockseq_queuing;
+  }
+#endif
+  kmp_indirect_locktag_t tag = KMP_GET_I_TAG(seq);
+  kmp_indirect_lock_t *l =
+      __kmp_allocate_indirect_lock((void **)lock, __kmp_entry_gtid(), tag);
+  KMP_I_LOCK_FUNC(l, init)(l->lock);
+  KA_TRACE(
+      20, ("__kmp_init_indirect_lock: initialized indirect lock with type#%d\n",
+           seq));
+}
+
+// Destroys an indirect lock and returns its object to the per-type pool.
+//
+// Looks up the indirect lock object (errors are reported under the name
+// "omp_destroy_lock"), runs the type-specific destroy function on the base
+// lock, then, under the global lock, pushes the object onto the pool for its
+// type.  The base lock's storage is reused to hold the pool link and, for
+// small lock words, the table index.
+static void __kmp_destroy_indirect_lock(kmp_dyna_lock_t *lock) {
+  // gtid is signed everywhere else in this file, including the kmp_int32
+  // value __kmp_allocate_indirect_lock passes to the same global lock; keep
+  // the type consistent rather than round-tripping through unsigned.
+  kmp_int32 gtid = __kmp_entry_gtid();
+  kmp_indirect_lock_t *l =
+      __kmp_lookup_indirect_lock((void **)lock, "omp_destroy_lock");
+  KMP_I_LOCK_FUNC(l, destroy)(l->lock);
+  kmp_indirect_locktag_t tag = l->type;
+
+  __kmp_acquire_lock(&__kmp_global_lock, gtid);
+
+  // Use the base lock's space to keep the pool chain.
+  l->lock->pool.next = (kmp_user_lock_p)__kmp_indirect_lock_pool[tag];
+  if (OMP_LOCK_T_SIZE < sizeof(void *)) {
+    l->lock->pool.index = KMP_EXTRACT_I_INDEX(lock);
+  }
+  __kmp_indirect_lock_pool[tag] = l;
+
+  __kmp_release_lock(&__kmp_global_lock, gtid);
+}
+
+// Unchecked indirect-lock entry points.  Each resolves the indirect lock
+// object with KMP_LOOKUP_I_LOCK and dispatches to the type-specific function
+// selected by KMP_I_LOCK_FUNC, returning that function's result.
+
+// Acquire (set) dispatch.
+static int __kmp_set_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32 gtid) {
+  kmp_indirect_lock_t *l = KMP_LOOKUP_I_LOCK(lock);
+  return KMP_I_LOCK_FUNC(l, set)(l->lock, gtid);
+}
+
+// Release (unset) dispatch.
+static int __kmp_unset_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32 gtid) {
+  kmp_indirect_lock_t *l = KMP_LOOKUP_I_LOCK(lock);
+  return KMP_I_LOCK_FUNC(l, unset)(l->lock, gtid);
+}
+
+// Non-blocking test dispatch.
+static int __kmp_test_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32 gtid) {
+  kmp_indirect_lock_t *l = KMP_LOOKUP_I_LOCK(lock);
+  return KMP_I_LOCK_FUNC(l, test)(l->lock, gtid);
+}
+
+// Checked indirect-lock entry points.  Each resolves the indirect lock object
+// with __kmp_lookup_indirect_lock, passing the OpenMP API name used in error
+// reports, then dispatches through KMP_I_LOCK_FUNC and returns the result.
+
+// Acquire (set) dispatch; errors are reported as "omp_set_lock".
+static int __kmp_set_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
+                                               kmp_int32 gtid) {
+  kmp_indirect_lock_t *l =
+      __kmp_lookup_indirect_lock((void **)lock, "omp_set_lock");
+  return KMP_I_LOCK_FUNC(l, set)(l->lock, gtid);
+}
+
+// Release (unset) dispatch; errors are reported as "omp_unset_lock".
+static int __kmp_unset_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
+                                                 kmp_int32 gtid) {
+  kmp_indirect_lock_t *l =
+      __kmp_lookup_indirect_lock((void **)lock, "omp_unset_lock");
+  return KMP_I_LOCK_FUNC(l, unset)(l->lock, gtid);
+}
+
+// Non-blocking test dispatch; errors are reported as "omp_test_lock".
+static int __kmp_test_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
+                                                kmp_int32 gtid) {
+  kmp_indirect_lock_t *l =
+      __kmp_lookup_indirect_lock((void **)lock, "omp_test_lock");
+  return KMP_I_LOCK_FUNC(l, test)(l->lock, gtid);
+}
+
+kmp_dyna_lockseq_t __kmp_user_lock_seq = lockseq_queuing;
+
+// This is used only in kmp_error.cpp when consistency checking is on.
+//
+// Returns the owner of lck by casting it to the concrete lock type implied by
+// seq and calling that type's owner accessor.  Simple and nested sequences of
+// the same kind share an accessor; adaptive locks use the queuing accessor.
+// Any sequence not listed yields 0.
+// NOTE(review): the *_with_checks routines in this file treat an owner value
+// of -1 as "no owner", so the default of 0 may read as an owner id - confirm
+// that callers expect this.
+kmp_int32 __kmp_get_user_lock_owner(kmp_user_lock_p lck, kmp_uint32 seq) {
+  switch (seq) {
+  case lockseq_tas:
+  case lockseq_nested_tas:
+    return __kmp_get_tas_lock_owner((kmp_tas_lock_t *)lck);
+#if KMP_USE_FUTEX
+  case lockseq_futex:
+  case lockseq_nested_futex:
+    return __kmp_get_futex_lock_owner((kmp_futex_lock_t *)lck);
+#endif
+  case lockseq_ticket:
+  case lockseq_nested_ticket:
+    return __kmp_get_ticket_lock_owner((kmp_ticket_lock_t *)lck);
+  case lockseq_queuing:
+  case lockseq_nested_queuing:
+#if KMP_USE_ADAPTIVE_LOCKS
+  case lockseq_adaptive:
+#endif
+    return __kmp_get_queuing_lock_owner((kmp_queuing_lock_t *)lck);
+  case lockseq_drdpa:
+  case lockseq_nested_drdpa:
+    return __kmp_get_drdpa_lock_owner((kmp_drdpa_lock_t *)lck);
+  default:
+    return 0;
+  }
+}
+
+// Initializes data for dynamic user locks.
+//
+// The function has two parts:
+//  1. The direct/indirect jump tables (set/unset/test/destroy) are re-pointed
+//     on every call, to either the checking or the non-checking tables,
+//     depending on __kmp_env_consistency_check.
+//  2. The remaining one-time setup (indirect lock index table, per-tag lock
+//     sizes, and the location/flags accessor tables) only runs while
+//     __kmp_init_user_locks is still false; the flag is set to TRUE at the
+//     end.
+void __kmp_init_dynamic_user_locks() {
+  // Initialize jump table for the lock functions
+  if (__kmp_env_consistency_check) {
+    __kmp_direct_set = direct_set_check;
+    __kmp_direct_unset = direct_unset_check;
+    __kmp_direct_test = direct_test_check;
+    __kmp_direct_destroy = direct_destroy_check;
+    __kmp_indirect_set = indirect_set_check;
+    __kmp_indirect_unset = indirect_unset_check;
+    __kmp_indirect_test = indirect_test_check;
+    __kmp_indirect_destroy = indirect_destroy_check;
+  } else {
+    __kmp_direct_set = direct_set;
+    __kmp_direct_unset = direct_unset;
+    __kmp_direct_test = direct_test;
+    __kmp_direct_destroy = direct_destroy;
+    __kmp_indirect_set = indirect_set;
+    __kmp_indirect_unset = indirect_unset;
+    __kmp_indirect_test = indirect_test;
+    __kmp_indirect_destroy = indirect_destroy;
+  }
+  // If the user locks have already been initialized, then return. Allow the
+  // switch between different KMP_CONSISTENCY_CHECK values, but do not allocate
+  // new lock tables if they have already been allocated.
+  if (__kmp_init_user_locks)
+    return;
+
+  // Initialize lock index table: an array holding a single chunk pointer, and
+  // that first chunk with room for KMP_I_LOCK_CHUNK indirect lock entries.
+  __kmp_i_lock_table.size = KMP_I_LOCK_CHUNK;
+  __kmp_i_lock_table.table =
+      (kmp_indirect_lock_t **)__kmp_allocate(sizeof(kmp_indirect_lock_t *));
+  *(__kmp_i_lock_table.table) = (kmp_indirect_lock_t *)__kmp_allocate(
+      KMP_I_LOCK_CHUNK * sizeof(kmp_indirect_lock_t));
+  __kmp_i_lock_table.next = 0;
+
+  // Indirect lock size: size of the lock object recorded for each lock tag.
+  // Nested tags use the same struct as their non-nested counterpart.
+  __kmp_indirect_lock_size[locktag_ticket] = sizeof(kmp_ticket_lock_t);
+  __kmp_indirect_lock_size[locktag_queuing] = sizeof(kmp_queuing_lock_t);
+#if KMP_USE_ADAPTIVE_LOCKS
+  __kmp_indirect_lock_size[locktag_adaptive] = sizeof(kmp_adaptive_lock_t);
+#endif
+  __kmp_indirect_lock_size[locktag_drdpa] = sizeof(kmp_drdpa_lock_t);
+#if KMP_USE_TSX
+  // The rtm tag is sized as a queuing lock.
+  __kmp_indirect_lock_size[locktag_rtm] = sizeof(kmp_queuing_lock_t);
+#endif
+  __kmp_indirect_lock_size[locktag_nested_tas] = sizeof(kmp_tas_lock_t);
+#if KMP_USE_FUTEX
+  __kmp_indirect_lock_size[locktag_nested_futex] = sizeof(kmp_futex_lock_t);
+#endif
+  __kmp_indirect_lock_size[locktag_nested_ticket] = sizeof(kmp_ticket_lock_t);
+  __kmp_indirect_lock_size[locktag_nested_queuing] = sizeof(kmp_queuing_lock_t);
+  __kmp_indirect_lock_size[locktag_nested_drdpa] = sizeof(kmp_drdpa_lock_t);
+
+// Initialize lock accessor/modifier
+// fill_jumps stores expand(<kind>) into the ticket/queuing/drdpa slots of
+// `table`; `sep` is pasted between "locktag" and the kind name, so `_` selects
+// the plain tags and `_nested_` the nested ones.
+#define fill_jumps(table, expand, sep)                                         \
+  {                                                                            \
+    table[locktag##sep##ticket] = expand(ticket);                              \
+    table[locktag##sep##queuing] = expand(queuing);                            \
+    table[locktag##sep##drdpa] = expand(drdpa);                                \
+  }
+
+// fill_table fills both the plain and the nested slots. With adaptive locks
+// compiled in, the adaptive slot receives the queuing lock's accessor.
+#if KMP_USE_ADAPTIVE_LOCKS
+#define fill_table(table, expand)                                              \
+  {                                                                            \
+    fill_jumps(table, expand, _);                                              \
+    table[locktag_adaptive] = expand(queuing);                                 \
+    fill_jumps(table, expand, _nested_);                                       \
+  }
+#else
+#define fill_table(table, expand)                                              \
+  {                                                                            \
+    fill_jumps(table, expand, _);                                              \
+    fill_jumps(table, expand, _nested_);                                       \
+  }
+#endif // KMP_USE_ADAPTIVE_LOCKS
+
+// `expand` is redefined before each fill_table call so that it yields the
+// per-kind accessor cast to the generic kmp_user_lock_p signature of the
+// table being filled: set-location, set-flags, get-location, get-flags.
+#define expand(l)                                                              \
+  (void (*)(kmp_user_lock_p, const ident_t *)) __kmp_set_##l##_lock_location
+  fill_table(__kmp_indirect_set_location, expand);
+#undef expand
+#define expand(l)                                                              \
+  (void (*)(kmp_user_lock_p, kmp_lock_flags_t)) __kmp_set_##l##_lock_flags
+  fill_table(__kmp_indirect_set_flags, expand);
+#undef expand
+#define expand(l)                                                              \
+  (const ident_t *(*)(kmp_user_lock_p)) __kmp_get_##l##_lock_location
+  fill_table(__kmp_indirect_get_location, expand);
+#undef expand
+#define expand(l)                                                              \
+  (kmp_lock_flags_t(*)(kmp_user_lock_p)) __kmp_get_##l##_lock_flags
+  fill_table(__kmp_indirect_get_flags, expand);
+#undef expand
+
+  __kmp_init_user_locks = TRUE;
+}
+
+// Clean up the lock table.
+//
+// Releases, in order: the lock objects parked in the per-tag pools, the lock
+// objects still referenced from the indirect lock table, the table chunks,
+// and the chunk-pointer array. Finally marks user locks as uninitialized.
+void __kmp_cleanup_indirect_user_locks() {
+  kmp_lock_index_t idx;
+  int tag;
+
+  // Pooled locks were already destroyed when they entered the pool, so only
+  // their storage has to be released.
+  for (tag = 0; tag < KMP_NUM_I_LOCKS; ++tag) {
+    kmp_indirect_lock_t *cur = __kmp_indirect_lock_pool[tag];
+    while (cur != NULL) {
+      kmp_indirect_lock_t *dead = cur;
+      // The pool link lives inside the lock object, so read it before that
+      // object is freed.
+      cur = (kmp_indirect_lock_t *)dead->lock->pool.next;
+      KA_TRACE(20, ("__kmp_cleanup_indirect_user_locks: freeing %p from pool\n",
+                    dead));
+      __kmp_free(dead->lock);
+      dead->lock = NULL;
+    }
+    __kmp_indirect_lock_pool[tag] = NULL;
+  }
+
+  // Whatever still has a lock object attached was never destroyed explicitly
+  // (pooled entries were nulled above): destroy it, then free it.
+  for (idx = 0; idx < __kmp_i_lock_table.next; ++idx) {
+    kmp_indirect_lock_t *entry = KMP_GET_I_LOCK(idx);
+    if (entry->lock == NULL)
+      continue;
+    KMP_I_LOCK_FUNC(entry, destroy)(entry->lock);
+    KA_TRACE(
+        20,
+        ("__kmp_cleanup_indirect_user_locks: destroy/freeing %p from table\n",
+         entry));
+    __kmp_free(entry->lock);
+  }
+
+  // Free every chunk of the table, then the array of chunk pointers.
+  for (idx = 0; idx < __kmp_i_lock_table.size / KMP_I_LOCK_CHUNK; ++idx)
+    __kmp_free(__kmp_i_lock_table.table[idx]);
+  __kmp_free(__kmp_i_lock_table.table);
+
+  __kmp_init_user_locks = FALSE;
+}
+
+// Kind of user lock in effect; starts out as lk_default.
+enum kmp_lock_kind __kmp_user_lock_kind = lk_default;
+// Number of locks per allocation block.
+int __kmp_num_locks_in_block = 1; // FIXME - tune this value
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+// "With checks" initializer for a TAS lock. It performs no additional
+// validation; it simply forwards to __kmp_init_tas_lock.
+static void __kmp_init_tas_lock_with_checks(kmp_tas_lock_t *lck) {
+  __kmp_init_tas_lock(lck);
+}
+
+// "With checks" initializer for a nested TAS lock. It performs no additional
+// validation; it simply forwards to __kmp_init_nested_tas_lock.
+static void __kmp_init_nested_tas_lock_with_checks(kmp_tas_lock_t *lck) {
+  __kmp_init_nested_tas_lock(lck);
+}
+
+#if KMP_USE_FUTEX
+// "With checks" initializer for a futex lock. It performs no additional
+// validation; it simply forwards to __kmp_init_futex_lock.
+static void __kmp_init_futex_lock_with_checks(kmp_futex_lock_t *lck) {
+  __kmp_init_futex_lock(lck);
+}
+
+// "With checks" initializer for a nested futex lock. It performs no
+// additional validation; it simply forwards to __kmp_init_nested_futex_lock.
+static void __kmp_init_nested_futex_lock_with_checks(kmp_futex_lock_t *lck) {
+  __kmp_init_nested_futex_lock(lck);
+}
+#endif
+
+// Returns nonzero when the ticket lock's lk.self field holds the address of
+// the lock itself, which is the test used here for "has been initialized".
+static int __kmp_is_ticket_lock_initialized(kmp_ticket_lock_t *lck) {
+  return lck == lck->lk.self;
+}
+
+// "With checks" initializer for a ticket lock. It performs no additional
+// validation; it simply forwards to __kmp_init_ticket_lock.
+static void __kmp_init_ticket_lock_with_checks(kmp_ticket_lock_t *lck) {
+  __kmp_init_ticket_lock(lck);
+}
+
+// "With checks" initializer for a nested ticket lock. It performs no
+// additional validation; it simply forwards to __kmp_init_nested_ticket_lock.
+static void __kmp_init_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck) {
+  __kmp_init_nested_ticket_lock(lck);
+}
+
+// Returns nonzero when the queuing lock's lk.initialized field holds the
+// address of the lock itself, which is the test used here for "has been
+// initialized".
+static int __kmp_is_queuing_lock_initialized(kmp_queuing_lock_t *lck) {
+  return lck == lck->lk.initialized;
+}
+
+// "With checks" initializer for a queuing lock. It performs no additional
+// validation; it simply forwards to __kmp_init_queuing_lock.
+static void __kmp_init_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
+  __kmp_init_queuing_lock(lck);
+}
+
+// "With checks" initializer for a nested queuing lock. It performs no
+// additional validation; it simply forwards to
+// __kmp_init_nested_queuing_lock.
+static void
+__kmp_init_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck) {
+  __kmp_init_nested_queuing_lock(lck);
+}
+
+#if KMP_USE_ADAPTIVE_LOCKS
+// "With checks" initializer for an adaptive lock. It performs no additional
+// validation; it simply forwards to __kmp_init_adaptive_lock.
+static void __kmp_init_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck) {
+  __kmp_init_adaptive_lock(lck);
+}
+#endif
+
+// Returns nonzero when the DRDPA lock's lk.initialized field holds the
+// address of the lock itself, which is the test used here for "has been
+// initialized".
+static int __kmp_is_drdpa_lock_initialized(kmp_drdpa_lock_t *lck) {
+  return lck == lck->lk.initialized;
+}
+
+// "With checks" initializer for a DRDPA lock. It performs no additional
+// validation; it simply forwards to __kmp_init_drdpa_lock.
+static void __kmp_init_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
+  __kmp_init_drdpa_lock(lck);
+}
+
+// "With checks" initializer for a nested DRDPA lock. It performs no
+// additional validation; it simply forwards to __kmp_init_nested_drdpa_lock.
+static void __kmp_init_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
+  __kmp_init_nested_drdpa_lock(lck);
+}
+
+/* user locks
+ * They are implemented as a table of function pointers which are set to the
+ * lock functions of the appropriate kind, once that has been determined.
+ * Every pointer below starts out NULL; __kmp_set_user_lock_vptrs assigns them
+ * (some directly, the rest presumably through the KMP_BIND_* macros). */
+
+// Kind of user lock in effect; starts out as lk_default.
+enum kmp_lock_kind __kmp_user_lock_kind = lk_default;
+
+// Sizes of the base lock struct and of the full lock object for the selected
+// kind; both are 0 until a kind has been bound.
+size_t __kmp_base_user_lock_size = 0;
+size_t __kmp_user_lock_size = 0;
+
+// Entry points for simple (non-nested) user locks.
+kmp_int32 (*__kmp_get_user_lock_owner_)(kmp_user_lock_p lck) = NULL;
+int (*__kmp_acquire_user_lock_with_checks_)(kmp_user_lock_p lck,
+                                            kmp_int32 gtid) = NULL;
+
+int (*__kmp_test_user_lock_with_checks_)(kmp_user_lock_p lck,
+                                         kmp_int32 gtid) = NULL;
+int (*__kmp_release_user_lock_with_checks_)(kmp_user_lock_p lck,
+                                            kmp_int32 gtid) = NULL;
+void (*__kmp_init_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL;
+void (*__kmp_destroy_user_lock_)(kmp_user_lock_p lck) = NULL;
+void (*__kmp_destroy_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL;
+// Entry points for nested user locks.
+int (*__kmp_acquire_nested_user_lock_with_checks_)(kmp_user_lock_p lck,
+                                                   kmp_int32 gtid) = NULL;
+
+int (*__kmp_test_nested_user_lock_with_checks_)(kmp_user_lock_p lck,
+                                                kmp_int32 gtid) = NULL;
+int (*__kmp_release_nested_user_lock_with_checks_)(kmp_user_lock_p lck,
+                                                   kmp_int32 gtid) = NULL;
+void (*__kmp_init_nested_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL;
+void (*__kmp_destroy_nested_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL;
+
+// Optional accessors: these stay NULL for lock kinds that do not provide
+// them, so callers must test the pointer before calling through it.
+int (*__kmp_is_user_lock_initialized_)(kmp_user_lock_p lck) = NULL;
+const ident_t *(*__kmp_get_user_lock_location_)(kmp_user_lock_p lck) = NULL;
+void (*__kmp_set_user_lock_location_)(kmp_user_lock_p lck,
+                                      const ident_t *loc) = NULL;
+kmp_lock_flags_t (*__kmp_get_user_lock_flags_)(kmp_user_lock_p lck) = NULL;
+void (*__kmp_set_user_lock_flags_)(kmp_user_lock_p lck,
+                                   kmp_lock_flags_t flags) = NULL;
+
+// Binds the global user-lock function pointers and size variables to the
+// implementation selected by `user_lock_kind`.
+//
+// For every supported kind this sets __kmp_base_user_lock_size and
+// __kmp_user_lock_size, the owner accessor, the destroy routine, and the
+// optional initialized/location/flags accessors. The acquire/test/release/
+// init entry points are bound through the KMP_BIND_* macros, using the
+// *_WITH_CHECKS flavour when __kmp_env_consistency_check is set.
+void __kmp_set_user_lock_vptrs(kmp_lock_kind_t user_lock_kind) {
+  switch (user_lock_kind) {
+  // lk_default and unknown kinds have no bindings and are flagged with
+  // KMP_ASSERT(0).
+  // NOTE(review): there is no break after the assertion, so control would
+  // continue into the lk_tas case if KMP_ASSERT returned -- presumably it does
+  // not; confirm.
+  case lk_default:
+  default:
+    KMP_ASSERT(0);
+
+  case lk_tas: {
+    __kmp_base_user_lock_size = sizeof(kmp_base_tas_lock_t);
+    __kmp_user_lock_size = sizeof(kmp_tas_lock_t);
+
+    __kmp_get_user_lock_owner_ =
+        (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_tas_lock_owner);
+
+    if (__kmp_env_consistency_check) {
+      KMP_BIND_USER_LOCK_WITH_CHECKS(tas);
+      KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(tas);
+    } else {
+      KMP_BIND_USER_LOCK(tas);
+      KMP_BIND_NESTED_USER_LOCK(tas);
+    }
+
+    __kmp_destroy_user_lock_ =
+        (void (*)(kmp_user_lock_p))(&__kmp_destroy_tas_lock);
+
+    // No initialized/location/flags accessors are bound for TAS locks; the
+    // optional hooks are cleared.
+    __kmp_is_user_lock_initialized_ = (int (*)(kmp_user_lock_p))NULL;
+
+    __kmp_get_user_lock_location_ = (const ident_t *(*)(kmp_user_lock_p))NULL;
+
+    __kmp_set_user_lock_location_ =
+        (void (*)(kmp_user_lock_p, const ident_t *))NULL;
+
+    __kmp_get_user_lock_flags_ = (kmp_lock_flags_t(*)(kmp_user_lock_p))NULL;
+
+    __kmp_set_user_lock_flags_ =
+        (void (*)(kmp_user_lock_p, kmp_lock_flags_t))NULL;
+  } break;
+
+#if KMP_USE_FUTEX
+
+  case lk_futex: {
+    __kmp_base_user_lock_size = sizeof(kmp_base_futex_lock_t);
+    __kmp_user_lock_size = sizeof(kmp_futex_lock_t);
+
+    __kmp_get_user_lock_owner_ =
+        (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_futex_lock_owner);
+
+    if (__kmp_env_consistency_check) {
+      KMP_BIND_USER_LOCK_WITH_CHECKS(futex);
+      KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(futex);
+    } else {
+      KMP_BIND_USER_LOCK(futex);
+      KMP_BIND_NESTED_USER_LOCK(futex);
+    }
+
+    __kmp_destroy_user_lock_ =
+        (void (*)(kmp_user_lock_p))(&__kmp_destroy_futex_lock);
+
+    // As for TAS locks, the optional hooks are cleared for futex locks.
+    __kmp_is_user_lock_initialized_ = (int (*)(kmp_user_lock_p))NULL;
+
+    __kmp_get_user_lock_location_ = (const ident_t *(*)(kmp_user_lock_p))NULL;
+
+    __kmp_set_user_lock_location_ =
+        (void (*)(kmp_user_lock_p, const ident_t *))NULL;
+
+    __kmp_get_user_lock_flags_ = (kmp_lock_flags_t(*)(kmp_user_lock_p))NULL;
+
+    __kmp_set_user_lock_flags_ =
+        (void (*)(kmp_user_lock_p, kmp_lock_flags_t))NULL;
+  } break;
+
+#endif // KMP_USE_FUTEX
+
+  case lk_ticket: {
+    __kmp_base_user_lock_size = sizeof(kmp_base_ticket_lock_t);
+    __kmp_user_lock_size = sizeof(kmp_ticket_lock_t);
+
+    __kmp_get_user_lock_owner_ =
+        (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_ticket_lock_owner);
+
+    if (__kmp_env_consistency_check) {
+      KMP_BIND_USER_LOCK_WITH_CHECKS(ticket);
+      KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(ticket);
+    } else {
+      KMP_BIND_USER_LOCK(ticket);
+      KMP_BIND_NESTED_USER_LOCK(ticket);
+    }
+
+    __kmp_destroy_user_lock_ =
+        (void (*)(kmp_user_lock_p))(&__kmp_destroy_ticket_lock);
+
+    __kmp_is_user_lock_initialized_ =
+        (int (*)(kmp_user_lock_p))(&__kmp_is_ticket_lock_initialized);
+
+    __kmp_get_user_lock_location_ =
+        (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_ticket_lock_location);
+
+    __kmp_set_user_lock_location_ = (void (*)(
+        kmp_user_lock_p, const ident_t *))(&__kmp_set_ticket_lock_location);
+
+    __kmp_get_user_lock_flags_ =
+        (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_ticket_lock_flags);
+
+    __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))(
+        &__kmp_set_ticket_lock_flags);
+  } break;
+
+  case lk_queuing: {
+    __kmp_base_user_lock_size = sizeof(kmp_base_queuing_lock_t);
+    __kmp_user_lock_size = sizeof(kmp_queuing_lock_t);
+
+    __kmp_get_user_lock_owner_ =
+        (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_owner);
+
+    if (__kmp_env_consistency_check) {
+      KMP_BIND_USER_LOCK_WITH_CHECKS(queuing);
+      KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(queuing);
+    } else {
+      KMP_BIND_USER_LOCK(queuing);
+      KMP_BIND_NESTED_USER_LOCK(queuing);
+    }
+
+    __kmp_destroy_user_lock_ =
+        (void (*)(kmp_user_lock_p))(&__kmp_destroy_queuing_lock);
+
+    __kmp_is_user_lock_initialized_ =
+        (int (*)(kmp_user_lock_p))(&__kmp_is_queuing_lock_initialized);
+
+    __kmp_get_user_lock_location_ =
+        (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_location);
+
+    __kmp_set_user_lock_location_ = (void (*)(
+        kmp_user_lock_p, const ident_t *))(&__kmp_set_queuing_lock_location);
+
+    __kmp_get_user_lock_flags_ =
+        (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_flags);
+
+    __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))(
+        &__kmp_set_queuing_lock_flags);
+  } break;
+
+#if KMP_USE_ADAPTIVE_LOCKS
+  // Adaptive locks bind only the non-nested entry points, and reuse the
+  // queuing lock's owner/initialized/location/flags accessors; only the
+  // sizes and the destroy routine are adaptive-specific here.
+  case lk_adaptive: {
+    __kmp_base_user_lock_size = sizeof(kmp_base_adaptive_lock_t);
+    __kmp_user_lock_size = sizeof(kmp_adaptive_lock_t);
+
+    __kmp_get_user_lock_owner_ =
+        (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_owner);
+
+    if (__kmp_env_consistency_check) {
+      KMP_BIND_USER_LOCK_WITH_CHECKS(adaptive);
+    } else {
+      KMP_BIND_USER_LOCK(adaptive);
+    }
+
+    __kmp_destroy_user_lock_ =
+        (void (*)(kmp_user_lock_p))(&__kmp_destroy_adaptive_lock);
+
+    __kmp_is_user_lock_initialized_ =
+        (int (*)(kmp_user_lock_p))(&__kmp_is_queuing_lock_initialized);
+
+    __kmp_get_user_lock_location_ =
+        (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_location);
+
+    __kmp_set_user_lock_location_ = (void (*)(
+        kmp_user_lock_p, const ident_t *))(&__kmp_set_queuing_lock_location);
+
+    __kmp_get_user_lock_flags_ =
+        (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_flags);
+
+    __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))(
+        &__kmp_set_queuing_lock_flags);
+
+  } break;
+#endif // KMP_USE_ADAPTIVE_LOCKS
+
+  case lk_drdpa: {
+    __kmp_base_user_lock_size = sizeof(kmp_base_drdpa_lock_t);
+    __kmp_user_lock_size = sizeof(kmp_drdpa_lock_t);
+
+    __kmp_get_user_lock_owner_ =
+        (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_drdpa_lock_owner);
+
+    if (__kmp_env_consistency_check) {
+      KMP_BIND_USER_LOCK_WITH_CHECKS(drdpa);
+      KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(drdpa);
+    } else {
+      KMP_BIND_USER_LOCK(drdpa);
+      KMP_BIND_NESTED_USER_LOCK(drdpa);
+    }
+
+    __kmp_destroy_user_lock_ =
+        (void (*)(kmp_user_lock_p))(&__kmp_destroy_drdpa_lock);
+
+    __kmp_is_user_lock_initialized_ =
+        (int (*)(kmp_user_lock_p))(&__kmp_is_drdpa_lock_initialized);
+
+    __kmp_get_user_lock_location_ =
+        (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_drdpa_lock_location);
+
+    __kmp_set_user_lock_location_ = (void (*)(
+        kmp_user_lock_p, const ident_t *))(&__kmp_set_drdpa_lock_location);
+
+    __kmp_get_user_lock_flags_ =
+        (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_drdpa_lock_flags);
+
+    __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))(
+        &__kmp_set_drdpa_lock_flags);
+  } break;
+  }
+}
+
+// ----------------------------------------------------------------------------
+// User lock table & lock allocation
+
+// Global table of user locks, initialized to {1, 0, NULL}: `used` starts at 1
+// because slot 0 is reserved for chaining retired tables, nothing is
+// allocated yet, and there is no table.
+kmp_lock_table_t __kmp_user_lock_table = {1, 0, NULL};
+// Head of the free list of released locks that can be reused.
+kmp_user_lock_p __kmp_lock_pool = NULL;
+
+// Lock block-allocation support.
+// Head of the list of allocated lock blocks (newest first).
+kmp_block_of_locks *__kmp_lock_blocks = NULL;
+// Number of locks carved out of each block; values <= 1 make
+// __kmp_user_lock_allocate allocate each lock individually.
+int __kmp_num_locks_in_block = 1; // FIXME - tune this value
+
+// Appends `lck` to the global user lock table and returns the index of the
+// slot it was stored in. When the table is full it is replaced by one twice
+// as large (1024 entries for the very first table).
+//
+// Slot 0 of every table never holds a lock: it stores the pointer to the
+// previous (smaller) table, so retired tables form a list.
+//
+// Assume that kmp_global_lock is held upon entry/exit.
+static kmp_lock_index_t __kmp_lock_table_insert(kmp_user_lock_p lck) {
+  kmp_lock_index_t index;
+  if (__kmp_user_lock_table.used >= __kmp_user_lock_table.allocated) {
+    kmp_lock_index_t size;
+    kmp_user_lock_p *table;
+    kmp_user_lock_p *old_table = __kmp_user_lock_table.table;
+    // Reallocate lock table.
+    if (__kmp_user_lock_table.allocated == 0) {
+      size = 1024;
+    } else {
+      size = __kmp_user_lock_table.allocated * 2;
+    }
+    table = (kmp_user_lock_p *)__kmp_allocate(sizeof(kmp_user_lock_p) * size);
+    // On the first growth there is no previous table (the pointer is NULL)
+    // and therefore nothing to copy. Skipping the call avoids pointer
+    // arithmetic on a null pointer and handing an invalid source to memcpy,
+    // both of which are undefined even for a zero-byte copy.
+    if (old_table != NULL) {
+      KMP_MEMCPY(table + 1, old_table + 1,
+                 sizeof(kmp_user_lock_p) * (__kmp_user_lock_table.used - 1));
+    }
+    table[0] = (kmp_user_lock_p)old_table;
+    // We cannot free the previous table now, since it may be in use by other
+    // threads. So save the pointer to the previous table in the first
+    // element of the new table. All the tables will be organized into a list,
+    // and could be freed when library shutting down.
+    __kmp_user_lock_table.table = table;
+    __kmp_user_lock_table.allocated = size;
+  }
+  KMP_DEBUG_ASSERT(__kmp_user_lock_table.used <
+                   __kmp_user_lock_table.allocated);
+  index = __kmp_user_lock_table.used;
+  __kmp_user_lock_table.table[index] = lck;
+  ++__kmp_user_lock_table.used;
+  return index;
+}
+
+// Hands out the next lock-sized slot of the newest block of locks. A fresh
+// block is allocated and pushed onto __kmp_lock_blocks when no block exists
+// yet or the current one has handed out __kmp_num_locks_in_block slots.
+// Each block is a single allocation: the lock storage comes first and the
+// kmp_block_of_locks descriptor follows it.
+//
+// Assume that kmp_global_lock is held upon entry/exit.
+static kmp_user_lock_p __kmp_lock_block_allocate() {
+  static int next_slot = 0; // next unused slot within the newest block
+
+  if ((__kmp_lock_blocks == NULL) || (next_slot >= __kmp_num_locks_in_block)) {
+    // Start over at the first slot of a brand new block.
+    next_slot = 0;
+    KMP_DEBUG_ASSERT(__kmp_user_lock_size > 0);
+    size_t lock_bytes = __kmp_user_lock_size * __kmp_num_locks_in_block;
+    char *storage =
+        (char *)__kmp_allocate(lock_bytes + sizeof(kmp_block_of_locks));
+    // The descriptor sits immediately after the lock storage.
+    kmp_block_of_locks *blk = (kmp_block_of_locks *)(storage + lock_bytes);
+    blk->next_block = __kmp_lock_blocks;
+    blk->locks = (void *)storage;
+    // Publish the new block.
+    KMP_MB();
+    __kmp_lock_blocks = blk;
+  }
+
+  char *base = (char *)(__kmp_lock_blocks->locks);
+  kmp_user_lock_p ret =
+      (kmp_user_lock_p)(base + next_slot * __kmp_user_lock_size);
+  next_slot++;
+  return ret;
+}
+
+// Get memory for a lock. It may be freshly allocated memory or reused memory
+// from lock pool.
+//
+// The handle written to `*user_lock` is the lock's table index when
+// OMP_LOCK_T_SIZE is smaller than a pointer, and the lock pointer otherwise.
+// `flags` is stored in the lock via __kmp_set_user_lock_flags. The whole
+// operation runs under __kmp_global_lock, acquired on behalf of `gtid`.
+// Returns the lock.
+kmp_user_lock_p __kmp_user_lock_allocate(void **user_lock, kmp_int32 gtid,
+                                         kmp_lock_flags_t flags) {
+  kmp_user_lock_p lck;
+  kmp_lock_index_t index;
+  KMP_DEBUG_ASSERT(user_lock);
+
+  __kmp_acquire_lock(&__kmp_global_lock, gtid);
+
+  if (__kmp_lock_pool != NULL) {
+    // Reuse the lock at the head of the pool together with the index that
+    // was recorded in it.
+    lck = __kmp_lock_pool;
+    index = lck->pool.index;
+    __kmp_lock_pool = lck->pool.next;
+  } else {
+    // Lock pool is empty. Allocate new memory.
+
+    // ANNOTATION: Found no good way to express the synchronisation
+    // between allocation and usage, so ignore the allocation
+    ANNOTATE_IGNORE_WRITES_BEGIN();
+    if (__kmp_num_locks_in_block > 1) { // Tune this cutoff point.
+      lck = __kmp_lock_block_allocate();
+    } else {
+      lck = (kmp_user_lock_p)__kmp_allocate(__kmp_user_lock_size);
+    }
+    ANNOTATE_IGNORE_WRITES_END();
+
+    // Insert lock in the table so that it can be freed in __kmp_cleanup,
+    // and debugger has info on all allocated locks.
+    index = __kmp_lock_table_insert(lck);
+  }
+
+  // We could potentially differentiate between nested and regular locks
+  // here, and do the lock table lookup for regular locks only.
+  if (OMP_LOCK_T_SIZE < sizeof(void *)) {
+    *((kmp_lock_index_t *)user_lock) = index;
+  } else {
+    *((kmp_user_lock_p *)user_lock) = lck;
+  }
+
+  // mark the lock if it is critical section lock.
+  __kmp_set_user_lock_flags(lck, flags);
+
+  __kmp_release_lock(&__kmp_global_lock, gtid); // AC: TODO move this line upper
+
+  return lck;
+}
+
+// Put lock's memory to pool for reusing.
+//
+// `lck` is pushed onto the head of __kmp_lock_pool while __kmp_global_lock is
+// held on behalf of `gtid`. When lock handles are table indices
+// (OMP_LOCK_T_SIZE < sizeof(void *)), the index stored at `user_lock` is
+// saved in the pooled object so that __kmp_user_lock_allocate can hand the
+// same table slot out again.
+void __kmp_user_lock_free(void **user_lock, kmp_int32 gtid,
+                          kmp_user_lock_p lck) {
+  KMP_DEBUG_ASSERT(user_lock != NULL);
+  KMP_DEBUG_ASSERT(lck != NULL);
+
+  __kmp_acquire_lock(&__kmp_global_lock, gtid);
+
+  lck->pool.next = __kmp_lock_pool;
+  __kmp_lock_pool = lck;
+  if (OMP_LOCK_T_SIZE < sizeof(void *)) {
+    kmp_lock_index_t index = *((kmp_lock_index_t *)user_lock);
+    // Valid indices are 1 .. used-1: slot 0 is reserved and `used` is the
+    // next free slot. This is the same range __kmp_lookup_user_lock accepts.
+    KMP_DEBUG_ASSERT(0 < index && index < __kmp_user_lock_table.used);
+    lck->pool.index = index;
+  }
+
+  __kmp_release_lock(&__kmp_global_lock, gtid);
+}
+
+// Translates the user-visible handle stored at `user_lock` into the runtime's
+// lock pointer. The handle is a lock table index when OMP_LOCK_T_SIZE is
+// smaller than a pointer, and the lock pointer itself otherwise.
+//
+// With __kmp_env_consistency_check enabled, a NULL `user_lock`, an index
+// outside the used part of the table, or a NULL resulting lock is reported
+// through KMP_FATAL(LockIsUninitialized, func); `func` names the calling API.
+kmp_user_lock_p __kmp_lookup_user_lock(void **user_lock, char const *func) {
+  kmp_user_lock_p lck = NULL;
+
+  if (__kmp_env_consistency_check && user_lock == NULL) {
+    KMP_FATAL(LockIsUninitialized, func);
+  }
+
+  if (OMP_LOCK_T_SIZE < sizeof(void *)) {
+    // The handle is an index into the user lock table.
+    kmp_lock_index_t index = *((kmp_lock_index_t *)user_lock);
+    if (__kmp_env_consistency_check &&
+        !(0 < index && index < __kmp_user_lock_table.used)) {
+      KMP_FATAL(LockIsUninitialized, func);
+    }
+    KMP_DEBUG_ASSERT(0 < index && index < __kmp_user_lock_table.used);
+    KMP_DEBUG_ASSERT(__kmp_user_lock_size > 0);
+    lck = __kmp_user_lock_table.table[index];
+  } else {
+    // The handle is the lock pointer itself.
+    lck = *((kmp_user_lock_p *)user_lock);
+  }
+
+  if (__kmp_env_consistency_check && lck == NULL) {
+    KMP_FATAL(LockIsUninitialized, func);
+  }
+
+  return lck;
+}
+
+// Releases all user lock resources: destroys every lock in the lock table
+// that still tests as initialized, frees lock storage (per lock, or per block
+// when block allocation was used), frees the chain of lock tables, and
+// finally clears __kmp_init_user_locks.
+void __kmp_cleanup_user_locks(void) {
+  // Reset lock pool. Don't worry about lock in the pool--we will free them when
+  // iterating through lock table (it includes all the locks, dead or alive).
+  __kmp_lock_pool = NULL;
+
+// True when a flags accessor is bound and the lock carries the
+// kmp_lf_critical_section flag.
+#define IS_CRITICAL(lck)                                                       \
+  ((__kmp_get_user_lock_flags_ != NULL) &&                                     \
+   ((*__kmp_get_user_lock_flags_)(lck)&kmp_lf_critical_section))
+
+  // Loop through lock table, free all locks.
+  // Do not free item [0], it is reserved for lock tables list.
+  //
+  // FIXME - we are iterating through a list of (pointers to) objects of type
+  // union kmp_user_lock, but we have no way of knowing whether the base type is
+  // currently "pool" or whatever the global user lock type is.
+  //
+  // We are relying on the fact that for all of the user lock types
+  // (except "tas"), the first field in the lock struct is the "initialized"
+  // field, which is set to the address of the lock object itself when
+  // the lock is initialized.  When the union is of type "pool", the
+  // first field is a pointer to the next object in the free list, which
+  // will not be the same address as the object itself.
+  //
+  // This means that the check (*__kmp_is_user_lock_initialized_)(lck) will fail
+  // for "pool" objects on the free list.  This must happen as the "location"
+  // field of real user locks overlaps the "index" field of "pool" objects.
+  //
+  // It would be better to run through the free list, and remove all "pool"
+  // objects from the lock table before executing this loop.  However,
+  // "pool" objects do not always have their index field set (only on
+  // lin_32e), and I don't want to search the lock table for the address
+  // of every "pool" object on the free list.
+  while (__kmp_user_lock_table.used > 1) {
+    const ident *loc;
+
+    // reduce __kmp_user_lock_table.used before freeing the lock,
+    // so that state of locks is consistent
+    kmp_user_lock_p lck =
+        __kmp_user_lock_table.table[--__kmp_user_lock_table.used];
+
+    if ((__kmp_is_user_lock_initialized_ != NULL) &&
+        (*__kmp_is_user_lock_initialized_)(lck)) {
+      // Issue a warning if: KMP_CONSISTENCY_CHECK AND lock is initialized AND
+      // it is NOT a critical section (user is not responsible for destroying
+      // criticals) AND we know source location to report.
+      if (__kmp_env_consistency_check && (!IS_CRITICAL(lck)) &&
+          ((loc = __kmp_get_user_lock_location(lck)) != NULL) &&
+          (loc->psource != NULL)) {
+        kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, 0);
+        KMP_WARNING(CnsLockNotDestroyed, str_loc.file, str_loc.line);
+        __kmp_str_loc_free(&str_loc);
+      }
+
+#ifdef KMP_DEBUG
+      if (IS_CRITICAL(lck)) {
+        KA_TRACE(
+            20,
+            ("__kmp_cleanup_user_locks: free critical section lock %p (%p)\n",
+             lck, *(void **)lck));
+      } else {
+        KA_TRACE(20, ("__kmp_cleanup_user_locks: free lock %p (%p)\n", lck,
+                      *(void **)lck));
+      }
+#endif // KMP_DEBUG
+
+      // Cleanup internal lock dynamic resources (for drdpa locks particularly).
+      __kmp_destroy_user_lock(lck);
+    }
+
+    // Free the lock if block allocation of locks is not used.
+    // (Block-allocated locks are released with their block further below.)
+    if (__kmp_lock_blocks == NULL) {
+      __kmp_free(lck);
+    }
+  }
+
+#undef IS_CRITICAL
+
+  // delete lock table(s).
+  kmp_user_lock_p *table_ptr = __kmp_user_lock_table.table;
+  __kmp_user_lock_table.table = NULL;
+  __kmp_user_lock_table.allocated = 0;
+
+  while (table_ptr != NULL) {
+    // In the first element we saved the pointer to the previous
+    // (smaller) lock table.
+    kmp_user_lock_p *next = (kmp_user_lock_p *)(table_ptr[0]);
+    __kmp_free(table_ptr);
+    table_ptr = next;
+  }
+
+  // Free buffers allocated for blocks of locks.
+  kmp_block_of_locks_t *block_ptr = __kmp_lock_blocks;
+  __kmp_lock_blocks = NULL;
+
+  while (block_ptr != NULL) {
+    // Read the link first: the descriptor is part of the buffer freed next.
+    kmp_block_of_locks_t *next = block_ptr->next_block;
+    __kmp_free(block_ptr->locks);
+    // *block_ptr itself was allocated at the end of the locks vector.
+    block_ptr = next;
+  }
+
+  TCW_4(__kmp_init_user_locks, FALSE);
+}
+
+#endif // KMP_USE_DYNAMIC_LOCK
diff --git a/final/runtime/src/kmp_lock.h b/final/runtime/src/kmp_lock.h
new file mode 100644
index 0000000..ccd84eb
--- /dev/null
+++ b/final/runtime/src/kmp_lock.h
@@ -0,0 +1,1275 @@
+/*
+ * kmp_lock.h -- lock header file
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_LOCK_H
+#define KMP_LOCK_H
+
+#include <limits.h> // CHAR_BIT
+#include <stddef.h> // offsetof
+
+#include "kmp_debug.h"
+#include "kmp_os.h"
+
+#ifdef __cplusplus
+#include <atomic>
+
+extern "C" {
+#endif // __cplusplus
+
+// ----------------------------------------------------------------------------
+// Have to copy these definitions from kmp.h because kmp.h cannot be included
+// due to circular dependencies.  Will undef these at end of file.
+
+#define KMP_PAD(type, sz) /* sizeof(type) rounded up to a multiple of sz */    \
+  (sizeof(type) + ((sz) - ((sizeof(type) - 1) % (sz)) - 1))
+#define KMP_GTID_DNE (-2)
+
+// Forward declaration of ident and ident_t
+
+struct ident;
+typedef struct ident ident_t;
+
+// End of copied code.
+// ----------------------------------------------------------------------------
+
+// We need to know the size of the area we can assume that the compiler(s)
+// allocated for objects of type omp_lock_t and omp_nest_lock_t.  The Intel
+// compiler always allocates a pointer-sized area, as does visual studio.
+//
+// gcc however, only allocates 4 bytes for regular locks, even on 64-bit
+// intel archs.  It allocates at least 8 bytes for nested lock (more on
+// recent versions), but we are bounded by the pointer-sized chunks that
+// the Intel compiler allocates.
+
+#if KMP_OS_LINUX && defined(KMP_GOMP_COMPAT)
+#define OMP_LOCK_T_SIZE sizeof(int)
+#define OMP_NEST_LOCK_T_SIZE sizeof(void *)
+#else
+#define OMP_LOCK_T_SIZE sizeof(void *)
+#define OMP_NEST_LOCK_T_SIZE sizeof(void *)
+#endif
+
+// The Intel compiler allocates a 32-byte chunk for a critical section.
+// Both gcc and visual studio only allocate enough space for a pointer.
+// Sometimes we know that the space was allocated by the Intel compiler.
+#define OMP_CRITICAL_SIZE sizeof(void *)
+#define INTEL_CRITICAL_SIZE 32
+
+// lock flags
+typedef kmp_uint32 kmp_lock_flags_t;
+
+#define kmp_lf_critical_section 1
+
+// When a lock table is used, the indices are of kmp_lock_index_t
+typedef kmp_uint32 kmp_lock_index_t;
+
+// When the memory allocated for a lock is on the lock pool (free list),
+// it is treated as a struct of this type.
+struct kmp_lock_pool {
+  union kmp_user_lock *next; // link to the next entry on the free list
+  kmp_lock_index_t index; // presumably the lock's lock-table index -- confirm
+};
+
+typedef struct kmp_lock_pool kmp_lock_pool_t;
+
+extern void __kmp_validate_locks(void);
+
+// ----------------------------------------------------------------------------
+//  There are 5 lock implementations:
+//       1. Test and set locks.
+//       2. futex locks (Linux* OS on x86 and
+//          Intel(R) Many Integrated Core Architecture)
+//       3. Ticket (Lamport bakery) locks.
+//       4. Queuing locks (with separate spin fields).
+//       5. DRDPA (Dynamically Reconfigurable Distributed Polling Area) locks
+//
+//   and 3 lock purposes:
+//       1. Bootstrap locks -- Used for a few locks available at library
+//       startup-shutdown time.
+//          These do not require non-negative global thread ID's.
+//       2. Internal RTL locks -- Used everywhere else in the RTL
+//       3. User locks (includes critical sections)
+// ----------------------------------------------------------------------------
+
+// ============================================================================
+// Lock implementations.
+//
+// Test and set locks.
+//
+// Non-nested test and set locks differ from the other lock kinds (except
+// futex) in that we use the memory allocated by the compiler for the lock,
+// rather than a pointer to it.
+//
+// On lin32, lin_32e, and win_32, the space allocated may be as small as 4
+// bytes, so we have to use a lock table for nested locks, and avoid accessing
+// the depth_locked field for non-nested locks.
+//
+// Information normally available to the tools, such as lock location, lock
+// usage (normal lock vs. critical section), etc. is not available with test and
+// set locks.
+// ----------------------------------------------------------------------------
+
+struct kmp_base_tas_lock {
+  // KMP_LOCK_FREE(tas) => unlocked; locked: (gtid+1) of owning thread
+  std::atomic<kmp_int32> poll;
+  kmp_int32 depth_locked; // depth locked, for nested locks only
+};
+
+typedef struct kmp_base_tas_lock kmp_base_tas_lock_t;
+
+union kmp_tas_lock {
+  kmp_base_tas_lock_t lk;
+  kmp_lock_pool_t pool; // make certain struct is large enough
+  double lk_align; // use worst case alignment; no cache line padding
+};
+
+typedef union kmp_tas_lock kmp_tas_lock_t;
+
+// Static initializer for test and set lock variables. Usage:
+//    kmp_tas_lock_t xlock = KMP_TAS_LOCK_INITIALIZER( xlock );
+#define KMP_TAS_LOCK_INITIALIZER(lock)                                         \
+  {                                                                            \
+    { ATOMIC_VAR_INIT(KMP_LOCK_FREE(tas)), 0 }                                 \
+  }
+
+extern int __kmp_acquire_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid);
+extern int __kmp_test_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid);
+extern int __kmp_release_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid);
+extern void __kmp_init_tas_lock(kmp_tas_lock_t *lck);
+extern void __kmp_destroy_tas_lock(kmp_tas_lock_t *lck);
+
+extern int __kmp_acquire_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid);
+extern int __kmp_test_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid);
+extern int __kmp_release_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid);
+extern void __kmp_init_nested_tas_lock(kmp_tas_lock_t *lck);
+extern void __kmp_destroy_nested_tas_lock(kmp_tas_lock_t *lck);
+
+#define KMP_LOCK_RELEASED 1
+#define KMP_LOCK_STILL_HELD 0
+#define KMP_LOCK_ACQUIRED_FIRST 1
+#define KMP_LOCK_ACQUIRED_NEXT 0
+#ifndef KMP_USE_FUTEX
+#define KMP_USE_FUTEX                                                          \
+  (KMP_OS_LINUX && !KMP_OS_CNK &&                                              \
+   (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64))
+#endif
+#if KMP_USE_FUTEX
+
+// ----------------------------------------------------------------------------
+// futex locks.  futex locks are only available on Linux* OS.
+//
+// Like non-nested test and set lock, non-nested futex locks use the memory
+// allocated by the compiler for the lock, rather than a pointer to it.
+//
+// Information normally available to the tools, such as lock location, lock
+// usage (normal lock vs. critical section), etc. is not available with futex
+// locks. With non-nested futex locks, the lock owner is not even available.
+// ----------------------------------------------------------------------------
+
+struct kmp_base_futex_lock {
+  volatile kmp_int32 poll; // KMP_LOCK_FREE(futex) => unlocked
+  // locked: encodes the owning thread -- presumably 2*(gtid+1); an older note
+  // here said (gtid+1). TODO: confirm against the futex code in kmp_lock.cpp.
+  kmp_int32 depth_locked; // depth locked, for nested locks only
+};
+
+typedef struct kmp_base_futex_lock kmp_base_futex_lock_t;
+
+union kmp_futex_lock {
+  kmp_base_futex_lock_t lk;
+  kmp_lock_pool_t pool; // make certain struct is large enough
+  double lk_align; // use worst case alignment
+  // no cache line padding
+};
+
+typedef union kmp_futex_lock kmp_futex_lock_t;
+
+// Static initializer for futex lock variables. Usage:
+//    kmp_futex_lock_t xlock = KMP_FUTEX_LOCK_INITIALIZER( xlock );
+#define KMP_FUTEX_LOCK_INITIALIZER(lock)                                       \
+  {                                                                            \
+    { KMP_LOCK_FREE(futex), 0 }                                                \
+  }
+
+extern int __kmp_acquire_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid);
+extern int __kmp_test_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid);
+extern int __kmp_release_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid);
+extern void __kmp_init_futex_lock(kmp_futex_lock_t *lck);
+extern void __kmp_destroy_futex_lock(kmp_futex_lock_t *lck);
+
+extern int __kmp_acquire_nested_futex_lock(kmp_futex_lock_t *lck,
+                                           kmp_int32 gtid);
+extern int __kmp_test_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid);
+extern int __kmp_release_nested_futex_lock(kmp_futex_lock_t *lck,
+                                           kmp_int32 gtid);
+extern void __kmp_init_nested_futex_lock(kmp_futex_lock_t *lck);
+extern void __kmp_destroy_nested_futex_lock(kmp_futex_lock_t *lck);
+
+#endif // KMP_USE_FUTEX
+
+// ----------------------------------------------------------------------------
+// Ticket locks.
+
+#ifdef __cplusplus
+
+#ifdef _MSC_VER
+// MSVC won't allow use of std::atomic<> in a union since it has non-trivial
+// copy constructor.
+
+struct kmp_base_ticket_lock {
+  // `initialized' must be the first entry in the lock data structure!
+  std::atomic_bool initialized;
+  volatile union kmp_ticket_lock *self; // points to the lock union
+  ident_t const *location; // Source code location of omp_init_lock().
+  std::atomic_uint
+      next_ticket; // ticket number to give to next thread which acquires
+  std::atomic_uint now_serving; // ticket number for thread which holds the lock
+  std::atomic_int owner_id; // (gtid+1) of owning thread, 0 if unlocked
+  std::atomic_int depth_locked; // depth locked, for nested locks only
+  kmp_lock_flags_t flags; // lock specifics, e.g. critical section lock
+};
+#else
+struct kmp_base_ticket_lock {
+  // `initialized' must be the first entry in the lock data structure!
+  std::atomic<bool> initialized;
+  volatile union kmp_ticket_lock *self; // points to the lock union
+  ident_t const *location; // Source code location of omp_init_lock().
+  std::atomic<unsigned>
+      next_ticket; // ticket number to give to next thread which acquires
+  std::atomic<unsigned>
+      now_serving; // ticket number for thread which holds the lock
+  std::atomic<int> owner_id; // (gtid+1) of owning thread, 0 if unlocked
+  std::atomic<int> depth_locked; // depth locked, for nested locks only
+  kmp_lock_flags_t flags; // lock specifics, e.g. critical section lock
+};
+#endif
+
+#else // __cplusplus
+
+struct kmp_base_ticket_lock;
+
+#endif // !__cplusplus
+
+typedef struct kmp_base_ticket_lock kmp_base_ticket_lock_t;
+
+union KMP_ALIGN_CACHE kmp_ticket_lock {
+  kmp_base_ticket_lock_t
+      lk; // This field must be first to allow static initializing.
+  kmp_lock_pool_t pool;
+  double lk_align; // use worst case alignment
+  char lk_pad[KMP_PAD(kmp_base_ticket_lock_t, CACHE_LINE)];
+};
+
+typedef union kmp_ticket_lock kmp_ticket_lock_t;
+
+// Static initializer for simple ticket lock variables. Usage:
+//    kmp_ticket_lock_t xlock = KMP_TICKET_LOCK_INITIALIZER( xlock );
+// Pass the variable being initialized: its address is stored in lk.self.
+#define KMP_TICKET_LOCK_INITIALIZER(lock)                                      \
+  {                                                                            \
+    { /* lk: fields in declaration order; trailing flags is left implicit */   \
+      ATOMIC_VAR_INIT(true) /* initialized */                                  \
+      , &(lock), NULL, ATOMIC_VAR_INIT(0U), ATOMIC_VAR_INIT(0U),               \
+          ATOMIC_VAR_INIT(0), ATOMIC_VAR_INIT(-1) /* owner_id, depth_locked */ \
+    }                                                                          \
+  }
+
+extern int __kmp_acquire_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid);
+extern int __kmp_test_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid);
+extern int __kmp_test_ticket_lock_with_cheks(kmp_ticket_lock_t *lck,
+                                             kmp_int32 gtid);
+extern int __kmp_release_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid);
+extern void __kmp_init_ticket_lock(kmp_ticket_lock_t *lck);
+extern void __kmp_destroy_ticket_lock(kmp_ticket_lock_t *lck);
+
+extern int __kmp_acquire_nested_ticket_lock(kmp_ticket_lock_t *lck,
+                                            kmp_int32 gtid);
+extern int __kmp_test_nested_ticket_lock(kmp_ticket_lock_t *lck,
+                                         kmp_int32 gtid);
+extern int __kmp_release_nested_ticket_lock(kmp_ticket_lock_t *lck,
+                                            kmp_int32 gtid);
+extern void __kmp_init_nested_ticket_lock(kmp_ticket_lock_t *lck);
+extern void __kmp_destroy_nested_ticket_lock(kmp_ticket_lock_t *lck);
+
+// ----------------------------------------------------------------------------
+// Queuing locks.
+
+#if KMP_USE_ADAPTIVE_LOCKS
+
+struct kmp_adaptive_lock_info;
+
+typedef struct kmp_adaptive_lock_info kmp_adaptive_lock_info_t;
+
+#if KMP_DEBUG_ADAPTIVE_LOCKS
+
+struct kmp_adaptive_lock_statistics {
+  /* So we can get stats from locks that haven't been destroyed. */
+  kmp_adaptive_lock_info_t *next;
+  kmp_adaptive_lock_info_t *prev;
+
+  /* Other statistics */
+  kmp_uint32 successfulSpeculations;
+  kmp_uint32 hardFailedSpeculations;
+  kmp_uint32 softFailedSpeculations;
+  kmp_uint32 nonSpeculativeAcquires;
+  kmp_uint32 nonSpeculativeAcquireAttempts;
+  kmp_uint32 lemmingYields;
+};
+
+typedef struct kmp_adaptive_lock_statistics kmp_adaptive_lock_statistics_t;
+
+extern void __kmp_print_speculative_stats();
+extern void __kmp_init_speculative_stats();
+
+#endif // KMP_DEBUG_ADAPTIVE_LOCKS
+
+struct kmp_adaptive_lock_info {
+  /* Values used for adaptivity.
+     Although these are accessed from multiple threads we don't access them
+     atomically, because if we miss updates it probably doesn't matter much. (It
+     just affects our decision about whether to try speculation on the lock). */
+  kmp_uint32 volatile badness;
+  kmp_uint32 volatile acquire_attempts;
+  /* Parameters of the lock. */
+  kmp_uint32 max_badness;
+  kmp_uint32 max_soft_retries;
+
+#if KMP_DEBUG_ADAPTIVE_LOCKS
+  kmp_adaptive_lock_statistics_t volatile stats;
+#endif
+};
+
+#endif // KMP_USE_ADAPTIVE_LOCKS
+
+struct kmp_base_queuing_lock {
+
+  //  `initialized' must be the first entry in the lock data structure!
+  volatile union kmp_queuing_lock
+      *initialized; // Points to the lock union if in initialized state.
+
+  ident_t const *location; // Source code location of omp_init_lock().
+
+  KMP_ALIGN(8) // tail_id  must be 8-byte aligned!
+
+  volatile kmp_int32
+      tail_id; // (gtid+1) of thread at tail of wait queue, 0 if empty
+  // Must be no padding here since head/tail used in 8-byte CAS
+  volatile kmp_int32
+      head_id; // (gtid+1) of thread at head of wait queue, 0 if empty
+  // Decl order assumes little endian
+  // bakery-style lock
+  volatile kmp_uint32
+      next_ticket; // ticket number to give to next thread which acquires
+  volatile kmp_uint32
+      now_serving; // ticket number for thread which holds the lock
+  volatile kmp_int32 owner_id; // (gtid+1) of owning thread, 0 if unlocked
+  kmp_int32 depth_locked; // depth locked, for nested locks only
+
+  kmp_lock_flags_t flags; // lock specifics, e.g. critical section lock
+};
+
+typedef struct kmp_base_queuing_lock kmp_base_queuing_lock_t;
+
+KMP_BUILD_ASSERT(offsetof(kmp_base_queuing_lock_t, tail_id) % 8 == 0);
+
+union KMP_ALIGN_CACHE kmp_queuing_lock {
+  kmp_base_queuing_lock_t
+      lk; // This field must be first to allow static initializing.
+  kmp_lock_pool_t pool;
+  double lk_align; // use worst case alignment
+  char lk_pad[KMP_PAD(kmp_base_queuing_lock_t, CACHE_LINE)];
+};
+
+typedef union kmp_queuing_lock kmp_queuing_lock_t;
+
+extern int __kmp_acquire_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid);
+extern int __kmp_test_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid);
+extern int __kmp_release_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid);
+extern void __kmp_init_queuing_lock(kmp_queuing_lock_t *lck);
+extern void __kmp_destroy_queuing_lock(kmp_queuing_lock_t *lck);
+
+extern int __kmp_acquire_nested_queuing_lock(kmp_queuing_lock_t *lck,
+                                             kmp_int32 gtid);
+extern int __kmp_test_nested_queuing_lock(kmp_queuing_lock_t *lck,
+                                          kmp_int32 gtid);
+extern int __kmp_release_nested_queuing_lock(kmp_queuing_lock_t *lck,
+                                             kmp_int32 gtid);
+extern void __kmp_init_nested_queuing_lock(kmp_queuing_lock_t *lck);
+extern void __kmp_destroy_nested_queuing_lock(kmp_queuing_lock_t *lck);
+
+#if KMP_USE_ADAPTIVE_LOCKS
+
+// ----------------------------------------------------------------------------
+// Adaptive locks.
+struct kmp_base_adaptive_lock {
+  kmp_base_queuing_lock qlk;
+  KMP_ALIGN(CACHE_LINE)
+  kmp_adaptive_lock_info_t
+      adaptive; // Information for the speculative adaptive lock
+};
+
+typedef struct kmp_base_adaptive_lock kmp_base_adaptive_lock_t;
+
+union KMP_ALIGN_CACHE kmp_adaptive_lock {
+  kmp_base_adaptive_lock_t lk;
+  kmp_lock_pool_t pool;
+  double lk_align;
+  char lk_pad[KMP_PAD(kmp_base_adaptive_lock_t, CACHE_LINE)];
+};
+typedef union kmp_adaptive_lock kmp_adaptive_lock_t;
+
+#define GET_QLK_PTR(l) ((kmp_queuing_lock_t *)&(l)->lk.qlk)
+
+#endif // KMP_USE_ADAPTIVE_LOCKS
+
+// ----------------------------------------------------------------------------
+// DRDPA ticket locks.
+struct kmp_base_drdpa_lock {
+  // All of the fields on the first cache line are only written when
+  // initializing or reconfiguring the lock.  These are relatively rare
+  // operations, so data from the first cache line will usually stay resident in
+  // the cache of each thread trying to acquire the lock.
+  //
+  // initialized must be the first entry in the lock data structure!
+  KMP_ALIGN_CACHE
+
+  volatile union kmp_drdpa_lock
+      *initialized; // points to the lock union if in initialized state
+  ident_t const *location; // Source code location of omp_init_lock().
+  std::atomic<std::atomic<kmp_uint64> *> polls; // presumably num_polls slots
+  std::atomic<kmp_uint64> mask; // is 2**num_polls-1 for mod op
+  kmp_uint64 cleanup_ticket; // thread with cleanup ticket
+  std::atomic<kmp_uint64> *old_polls; // will deallocate old_polls
+  kmp_uint32 num_polls; // must be power of 2
+
+  // next_ticket needs to exist in a separate cache line, as it is
+  // invalidated every time a thread takes a new ticket.
+  KMP_ALIGN_CACHE
+
+  std::atomic<kmp_uint64> next_ticket;
+
+  // now_serving is used to store our ticket value while we hold the lock. It
+  // has a slightly different meaning in the DRDPA ticket locks (where it is
+  // written by the acquiring thread) than it does in the simple ticket locks
+  // (where it is written by the releasing thread).
+  //
+  // Since now_serving is only read and written in the critical section,
+  // it is non-volatile, but it needs to exist on a separate cache line,
+  // as it is invalidated at every lock acquire.
+  //
+  // Likewise, the vars used for nested locks (owner_id and depth_locked) are
+  // only written by the thread owning the lock, so they are put in this cache
+  // line.  owner_id is read by other threads, so it must be declared volatile.
+  KMP_ALIGN_CACHE
+  kmp_uint64 now_serving; // doesn't have to be volatile
+  volatile kmp_uint32 owner_id; // (gtid+1) of owning thread, 0 if unlocked
+  kmp_int32 depth_locked; // depth locked
+  kmp_lock_flags_t flags; // lock specifics, e.g. critical section lock
+};
+
+typedef struct kmp_base_drdpa_lock kmp_base_drdpa_lock_t;
+
+union KMP_ALIGN_CACHE kmp_drdpa_lock {
+  kmp_base_drdpa_lock_t
+      lk; // This field must be first to allow static initializing.
+  kmp_lock_pool_t pool; // view used while the memory is on the lock pool
+  double lk_align; // use worst case alignment
+  char lk_pad[KMP_PAD(kmp_base_drdpa_lock_t, CACHE_LINE)]; // pad to CACHE_LINE
+};
+
+typedef union kmp_drdpa_lock kmp_drdpa_lock_t;
+
+extern int __kmp_acquire_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid);
+extern int __kmp_test_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid);
+extern int __kmp_release_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid);
+extern void __kmp_init_drdpa_lock(kmp_drdpa_lock_t *lck);
+extern void __kmp_destroy_drdpa_lock(kmp_drdpa_lock_t *lck);
+
+extern int __kmp_acquire_nested_drdpa_lock(kmp_drdpa_lock_t *lck,
+                                           kmp_int32 gtid);
+extern int __kmp_test_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid);
+extern int __kmp_release_nested_drdpa_lock(kmp_drdpa_lock_t *lck,
+                                           kmp_int32 gtid);
+extern void __kmp_init_nested_drdpa_lock(kmp_drdpa_lock_t *lck);
+extern void __kmp_destroy_nested_drdpa_lock(kmp_drdpa_lock_t *lck);
+
+// ============================================================================
+// Lock purposes.
+// ============================================================================
+
+// Bootstrap locks.
+//
+// Bootstrap locks -- very few locks used at library initialization time.
+// Bootstrap locks are currently implemented as ticket locks.
+// They could also be implemented as test and set lock, but cannot be
+// implemented with other lock kinds as they require gtids which are not
+// available at initialization time.
+
+typedef kmp_ticket_lock_t kmp_bootstrap_lock_t;
+
+#define KMP_BOOTSTRAP_LOCK_INITIALIZER(lock) KMP_TICKET_LOCK_INITIALIZER((lock))
+#define KMP_BOOTSTRAP_LOCK_INIT(lock)                                          \
+  kmp_bootstrap_lock_t lock = KMP_TICKET_LOCK_INITIALIZER(lock)
+
+static inline int __kmp_acquire_bootstrap_lock(kmp_bootstrap_lock_t *lck) {
+  return __kmp_acquire_ticket_lock(lck, KMP_GTID_DNE); // no gtid is passed
+}
+
+static inline int __kmp_test_bootstrap_lock(kmp_bootstrap_lock_t *lck) {
+  return __kmp_test_ticket_lock(lck, KMP_GTID_DNE); // no gtid is passed
+}
+
+static inline void __kmp_release_bootstrap_lock(kmp_bootstrap_lock_t *lck) {
+  __kmp_release_ticket_lock(lck, KMP_GTID_DNE); // int result is discarded
+}
+
+static inline void __kmp_init_bootstrap_lock(kmp_bootstrap_lock_t *lck) {
+  __kmp_init_ticket_lock(lck);
+}
+
+static inline void __kmp_destroy_bootstrap_lock(kmp_bootstrap_lock_t *lck) {
+  __kmp_destroy_ticket_lock(lck);
+}
+
+// Internal RTL locks.
+//
+// Internal RTL locks are also implemented as ticket locks, for now.
+//
+// FIXME - We should go through and figure out which lock kind works best for
+// each internal lock, and use the type declaration and function calls for
+// that explicit lock kind (and get rid of this section).
+
+typedef kmp_ticket_lock_t kmp_lock_t;
+
+#define KMP_LOCK_INIT(lock) kmp_lock_t lock = KMP_TICKET_LOCK_INITIALIZER(lock)
+
+static inline int __kmp_acquire_lock(kmp_lock_t *lck, kmp_int32 gtid) {
+  return __kmp_acquire_ticket_lock(lck, gtid);
+}
+
+static inline int __kmp_test_lock(kmp_lock_t *lck, kmp_int32 gtid) {
+  return __kmp_test_ticket_lock(lck, gtid);
+}
+
+static inline void __kmp_release_lock(kmp_lock_t *lck, kmp_int32 gtid) {
+  __kmp_release_ticket_lock(lck, gtid);
+}
+
+static inline void __kmp_init_lock(kmp_lock_t *lck) {
+  __kmp_init_ticket_lock(lck);
+}
+
+static inline void __kmp_destroy_lock(kmp_lock_t *lck) {
+  __kmp_destroy_ticket_lock(lck);
+}
+
+// User locks.
+//
+// Do not allocate objects of type union kmp_user_lock!!! This will waste space
+// unless __kmp_user_lock_kind == lk_drdpa. Instead, check the value of
+// __kmp_user_lock_kind and allocate objects of the type of the appropriate
+// union member, and cast their addresses to kmp_user_lock_p.
+
+enum kmp_lock_kind {
+  lk_default = 0,
+  lk_tas,
+#if KMP_USE_FUTEX
+  lk_futex,
+#endif
+#if KMP_USE_DYNAMIC_LOCK && KMP_USE_TSX
+  lk_hle,
+  lk_rtm,
+#endif
+  lk_ticket,
+  lk_queuing,
+  lk_drdpa,
+#if KMP_USE_ADAPTIVE_LOCKS
+  lk_adaptive
+#endif // KMP_USE_ADAPTIVE_LOCKS
+};
+
+typedef enum kmp_lock_kind kmp_lock_kind_t;
+
+extern kmp_lock_kind_t __kmp_user_lock_kind;
+
+union kmp_user_lock {
+  kmp_tas_lock_t tas;
+#if KMP_USE_FUTEX
+  kmp_futex_lock_t futex;
+#endif
+  kmp_ticket_lock_t ticket;
+  kmp_queuing_lock_t queuing;
+  kmp_drdpa_lock_t drdpa;
+#if KMP_USE_ADAPTIVE_LOCKS
+  kmp_adaptive_lock_t adaptive;
+#endif // KMP_USE_ADAPTIVE_LOCKS
+  kmp_lock_pool_t pool;
+};
+
+typedef union kmp_user_lock *kmp_user_lock_p;
+
+#if !KMP_USE_DYNAMIC_LOCK
+
+extern size_t __kmp_base_user_lock_size;
+extern size_t __kmp_user_lock_size;
+
+extern kmp_int32 (*__kmp_get_user_lock_owner_)(kmp_user_lock_p lck);
+
+static inline kmp_int32 __kmp_get_user_lock_owner(kmp_user_lock_p lck) {
+  KMP_DEBUG_ASSERT(__kmp_get_user_lock_owner_ != NULL);
+  return __kmp_get_user_lock_owner_(lck); // forward through the pointer
+}
+
+extern int (*__kmp_acquire_user_lock_with_checks_)(kmp_user_lock_p lck,
+                                                   kmp_int32 gtid);
+
+#if KMP_OS_LINUX &&                                                            \
+    (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+
+#define __kmp_acquire_user_lock_with_checks(lck, gtid)                         \
+  if (__kmp_user_lock_kind == lk_tas) { /* inlined test-and-set path */        \
+    if (__kmp_env_consistency_check) {                                         \
+      char const *const func = "omp_set_lock";                                 \
+      if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) &&                       \
+          lck->tas.lk.depth_locked != -1) {                                    \
+        KMP_FATAL(LockNestableUsedAsSimple, func);                             \
+      }                                                                        \
+      if ((gtid >= 0) && (lck->tas.lk.poll - 1 == gtid)) {                     \
+        KMP_FATAL(LockIsAlreadyOwned, func); /* poll already == gtid+1 */      \
+      }                                                                        \
+    }                                                                          \
+    if (lck->tas.lk.poll != 0 || /* plain read first, then compare-store */    \
+        !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)) {     \
+      kmp_uint32 spins;                                                        \
+      KMP_FSYNC_PREPARE(lck);                                                  \
+      KMP_INIT_YIELD(spins);                                                   \
+      do { /* repeat until poll reads 0 and the compare-store succeeds */      \
+        KMP_YIELD_OVERSUB_ELSE_SPIN(spins);                                    \
+      } while (                                                                \
+          lck->tas.lk.poll != 0 ||                                             \
+          !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1));    \
+    }                                                                          \
+    KMP_FSYNC_ACQUIRED(lck);                                                   \
+  } else { /* every other lock kind goes through the function pointer */      \
+    KMP_DEBUG_ASSERT(__kmp_acquire_user_lock_with_checks_ != NULL);            \
+    (*__kmp_acquire_user_lock_with_checks_)(lck, gtid);                        \
+  } /* NOTE(review): bare if/else statement, not wrapped in do-while(0) */
+
+#else
+static inline int __kmp_acquire_user_lock_with_checks(kmp_user_lock_p lck,
+                                                      kmp_int32 gtid) {
+  KMP_DEBUG_ASSERT(__kmp_acquire_user_lock_with_checks_ != NULL);
+  return __kmp_acquire_user_lock_with_checks_(lck, gtid); // via the pointer
+}
+#endif
+
+extern int (*__kmp_test_user_lock_with_checks_)(kmp_user_lock_p lck,
+                                                kmp_int32 gtid);
+
+#if KMP_OS_LINUX &&                                                            \
+    (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+
+#include "kmp_i18n.h" /* AC: KMP_FATAL definition */
+extern int __kmp_env_consistency_check; /* AC: copy from kmp.h here */
+static inline int __kmp_test_user_lock_with_checks(kmp_user_lock_p lck,
+                                                   kmp_int32 gtid) {
+  // Any lock kind other than test-and-set goes through the function pointer.
+  if (__kmp_user_lock_kind != lk_tas) {
+    KMP_DEBUG_ASSERT(__kmp_test_user_lock_with_checks_ != NULL);
+    return (*__kmp_test_user_lock_with_checks_)(lck, gtid);
+  }
+  // Inlined TAS path; optionally reject a nestable lock used as a simple one.
+  if (__kmp_env_consistency_check &&
+      (sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) &&
+      lck->tas.lk.depth_locked != -1) {
+    char const *const func = "omp_test_lock";
+    KMP_FATAL(LockNestableUsedAsSimple, func);
+  }
+  return ((lck->tas.lk.poll == 0) &&
+          __kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1));
+}
+#else
+static inline int __kmp_test_user_lock_with_checks(kmp_user_lock_p lck,
+                                                   kmp_int32 gtid) {
+  KMP_DEBUG_ASSERT(__kmp_test_user_lock_with_checks_ != NULL);
+  return __kmp_test_user_lock_with_checks_(lck, gtid); // via the pointer
+}
+#endif
+
+extern int (*__kmp_release_user_lock_with_checks_)(kmp_user_lock_p lck,
+                                                   kmp_int32 gtid);
+
+static inline void __kmp_release_user_lock_with_checks(kmp_user_lock_p lck,
+                                                       kmp_int32 gtid) {
+  KMP_DEBUG_ASSERT(__kmp_release_user_lock_with_checks_ != NULL);
+  __kmp_release_user_lock_with_checks_(lck, gtid); // int result is discarded
+}
+
+extern void (*__kmp_init_user_lock_with_checks_)(kmp_user_lock_p lck);
+
+static inline void __kmp_init_user_lock_with_checks(kmp_user_lock_p lck) {
+  KMP_DEBUG_ASSERT(__kmp_init_user_lock_with_checks_ != NULL);
+  __kmp_init_user_lock_with_checks_(lck); // forward through the pointer
+}
+
+// We need a non-checking version of destroy lock for when the RTL is
+// doing the cleanup as it can't always tell if the lock is nested or not.
+extern void (*__kmp_destroy_user_lock_)(kmp_user_lock_p lck);
+
+static inline void __kmp_destroy_user_lock(kmp_user_lock_p lck) {
+  KMP_DEBUG_ASSERT(__kmp_destroy_user_lock_ != NULL);
+  __kmp_destroy_user_lock_(lck); // non-checking variant, used by RTL cleanup
+}
+
+extern void (*__kmp_destroy_user_lock_with_checks_)(kmp_user_lock_p lck);
+
+static inline void __kmp_destroy_user_lock_with_checks(kmp_user_lock_p lck) {
+  KMP_DEBUG_ASSERT(__kmp_destroy_user_lock_with_checks_ != NULL);
+  __kmp_destroy_user_lock_with_checks_(lck); // forward through the pointer
+}
+
+extern int (*__kmp_acquire_nested_user_lock_with_checks_)(kmp_user_lock_p lck,
+                                                          kmp_int32 gtid);
+
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
+
+#define __kmp_acquire_nested_user_lock_with_checks(lck, gtid, depth)           \
+  if (__kmp_user_lock_kind == lk_tas) { /* inlined test-and-set path */        \
+    if (__kmp_env_consistency_check) {                                         \
+      char const *const func = "omp_set_nest_lock";                            \
+      if ((sizeof(kmp_tas_lock_t) <= OMP_NEST_LOCK_T_SIZE) &&                  \
+          lck->tas.lk.depth_locked == -1) {                                    \
+        KMP_FATAL(LockSimpleUsedAsNestable, func);                             \
+      }                                                                        \
+    }                                                                          \
+    if (lck->tas.lk.poll - 1 == gtid) { /* caller already owns the lock */     \
+      lck->tas.lk.depth_locked += 1;                                           \
+      *depth = KMP_LOCK_ACQUIRED_NEXT;                                         \
+    } else {                                                                   \
+      if ((lck->tas.lk.poll != 0) || /* plain read, then compare-store */      \
+          !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)) {   \
+        kmp_uint32 spins;                                                      \
+        KMP_FSYNC_PREPARE(lck);                                                \
+        KMP_INIT_YIELD(spins);                                                 \
+        do { /* repeat until poll reads 0 and the compare-store succeeds */    \
+          KMP_YIELD_OVERSUB_ELSE_SPIN(spins);                                  \
+        } while (                                                              \
+            (lck->tas.lk.poll != 0) ||                                         \
+            !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1));  \
+      }                                                                        \
+      lck->tas.lk.depth_locked = 1; /* first acquisition by this owner */      \
+      *depth = KMP_LOCK_ACQUIRED_FIRST;                                        \
+    }                                                                          \
+    KMP_FSYNC_ACQUIRED(lck);                                                   \
+  } else { /* every other lock kind goes through the function pointer */      \
+    KMP_DEBUG_ASSERT(__kmp_acquire_nested_user_lock_with_checks_ != NULL);     \
+    *depth = (*__kmp_acquire_nested_user_lock_with_checks_)(lck, gtid);        \
+  } /* NOTE(review): bare if/else statement, not wrapped in do-while(0) */
+
+#else
+static inline void
+__kmp_acquire_nested_user_lock_with_checks(kmp_user_lock_p lck, kmp_int32 gtid,
+                                           int *depth) {
+  KMP_DEBUG_ASSERT(__kmp_acquire_nested_user_lock_with_checks_ != NULL);
+  *depth = (*__kmp_acquire_nested_user_lock_with_checks_)(lck, gtid);
+}
+#endif
+
+// Checked nested "test" routine for the bound lock kind. The pointer is
+// assigned by KMP_BIND_USER_LOCK_TEMPLATE.
+extern int (*__kmp_test_nested_user_lock_with_checks_)(kmp_user_lock_p lck,
+                                                       kmp_int32 gtid);
+
+// Non-blocking attempt to acquire the nested user lock lck on behalf of gtid.
+// On Linux x86/x86_64 the lk_tas kind is handled inline; every other
+// configuration dispatches through the function pointer above.
+// The inline path returns the new nesting depth when gtid already owns the
+// lock, 1 when the lock was free and has just been taken, and 0 when the lock
+// could not be taken.
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
+static inline int __kmp_test_nested_user_lock_with_checks(kmp_user_lock_p lck,
+                                                          kmp_int32 gtid) {
+  if (__kmp_user_lock_kind == lk_tas) {
+    int retval;
+    if (__kmp_env_consistency_check) {
+      // depth_locked == -1 is reported as "simple lock used as nestable";
+      // presumably -1 is how simple TAS locks are marked -- confirm in the
+      // TAS lock initialization code.
+      char const *const func = "omp_test_nest_lock";
+      if ((sizeof(kmp_tas_lock_t) <= OMP_NEST_LOCK_T_SIZE) &&
+          lck->tas.lk.depth_locked == -1) {
+        KMP_FATAL(LockSimpleUsedAsNestable, func);
+      }
+    }
+    KMP_DEBUG_ASSERT(gtid >= 0);
+    // poll stores gtid + 1 of the owner (0 means free), hence the "- 1".
+    if (lck->tas.lk.poll - 1 ==
+        gtid) { /* __kmp_get_tas_lock_owner( lck ) == gtid */
+      return ++lck->tas.lk.depth_locked; /* same owner, depth increased */
+    }
+    // Cheap read first; only attempt the compare-and-store when the lock
+    // looks free.
+    retval = ((lck->tas.lk.poll == 0) &&
+              __kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1));
+    if (retval) {
+      // NOTE(review): KMP_MB() is presumably a memory barrier ordering the
+      // acquisition before the depth store -- confirm against its definition.
+      KMP_MB();
+      lck->tas.lk.depth_locked = 1;
+    }
+    return retval;
+  } else {
+    KMP_DEBUG_ASSERT(__kmp_test_nested_user_lock_with_checks_ != NULL);
+    return (*__kmp_test_nested_user_lock_with_checks_)(lck, gtid);
+  }
+}
+#else
+static inline int __kmp_test_nested_user_lock_with_checks(kmp_user_lock_p lck,
+                                                          kmp_int32 gtid) {
+  KMP_DEBUG_ASSERT(__kmp_test_nested_user_lock_with_checks_ != NULL);
+  return (*__kmp_test_nested_user_lock_with_checks_)(lck, gtid);
+}
+#endif
+
+// Checked nested "release" routine for the bound lock kind.
+extern int (*__kmp_release_nested_user_lock_with_checks_)(kmp_user_lock_p lck,
+                                                          kmp_int32 gtid);
+
+// Releases one nesting level of lck for gtid by calling the bound routine and
+// hands its status code back to the caller unchanged.
+static inline int
+__kmp_release_nested_user_lock_with_checks(kmp_user_lock_p lck,
+                                           kmp_int32 gtid) {
+  KMP_DEBUG_ASSERT(__kmp_release_nested_user_lock_with_checks_ != NULL);
+  int const status = __kmp_release_nested_user_lock_with_checks_(lck, gtid);
+  return status;
+}
+
+// Checked nested "init" routine for the bound lock kind.
+extern void (*__kmp_init_nested_user_lock_with_checks_)(kmp_user_lock_p lck);
+
+// Initializes lck as a nested lock through the bound routine, which must
+// already be set (verified with KMP_DEBUG_ASSERT).
+static inline void
+__kmp_init_nested_user_lock_with_checks(kmp_user_lock_p lck) {
+  KMP_DEBUG_ASSERT(__kmp_init_nested_user_lock_with_checks_ != NULL);
+  __kmp_init_nested_user_lock_with_checks_(lck);
+}
+
+// Checked nested "destroy" routine for the bound lock kind.
+extern void (*__kmp_destroy_nested_user_lock_with_checks_)(kmp_user_lock_p lck);
+
+// Destroys the nested lock lck through the bound routine, which must already
+// be set (verified with KMP_DEBUG_ASSERT).
+static inline void
+__kmp_destroy_nested_user_lock_with_checks(kmp_user_lock_p lck) {
+  KMP_DEBUG_ASSERT(__kmp_destroy_nested_user_lock_with_checks_ != NULL);
+  __kmp_destroy_nested_user_lock_with_checks_(lck);
+}
+
+// user lock functions which do not necessarily exist for all lock kinds.
+//
+// The "set" functions usually have wrapper routines that check for a NULL set
+// function pointer and call it if non-NULL.
+//
+// In some cases, it makes sense to have a "get" wrapper function check for a
+// NULL get function pointer and return NULL / invalid value / error code if
+// the function pointer is NULL.
+//
+// In other cases, the calling code really should differentiate between an
+// unimplemented function and one that is implemented but returning NULL /
+// invalid value.  If this is the case, no get function wrapper exists.
+
+extern int (*__kmp_is_user_lock_initialized_)(kmp_user_lock_p lck);
+
+// no set function; fields set during local allocation
+
+// Optional "get location" routine; NULL when the bound lock kind has none.
+extern const ident_t *(*__kmp_get_user_lock_location_)(kmp_user_lock_p lck);
+
+// Returns the source location recorded for lck, or NULL when no getter is
+// bound for the current lock kind.
+static inline const ident_t *__kmp_get_user_lock_location(kmp_user_lock_p lck) {
+  return (__kmp_get_user_lock_location_ != NULL)
+             ? __kmp_get_user_lock_location_(lck)
+             : NULL;
+}
+
+// Optional "set location" routine; NULL when the bound lock kind has none.
+extern void (*__kmp_set_user_lock_location_)(kmp_user_lock_p lck,
+                                             const ident_t *loc);
+
+// Records loc as the source location of lck. Silently does nothing when no
+// setter is bound for the current lock kind.
+static inline void __kmp_set_user_lock_location(kmp_user_lock_p lck,
+                                                const ident_t *loc) {
+  if (__kmp_set_user_lock_location_ == NULL)
+    return;
+  __kmp_set_user_lock_location_(lck, loc);
+}
+
+// Optional "get flags" routine. No wrapper is provided; callers use the
+// pointer directly.
+extern kmp_lock_flags_t (*__kmp_get_user_lock_flags_)(kmp_user_lock_p lck);
+
+// Optional "set flags" routine; NULL when the bound lock kind has none.
+extern void (*__kmp_set_user_lock_flags_)(kmp_user_lock_p lck,
+                                          kmp_lock_flags_t flags);
+
+// Stores flags on lck. Silently does nothing when no setter is bound for the
+// current lock kind.
+static inline void __kmp_set_user_lock_flags(kmp_user_lock_p lck,
+                                             kmp_lock_flags_t flags) {
+  if (__kmp_set_user_lock_flags_ == NULL)
+    return;
+  __kmp_set_user_lock_flags_(lck, flags);
+}
+
+// The function which sets up all of the vtbl pointers for kmp_user_lock_t.
+extern void __kmp_set_user_lock_vptrs(kmp_lock_kind_t user_lock_kind);
+
+// Macros for binding user lock functions.
+//
+// KMP_BIND_USER_LOCK_TEMPLATE(nest, kind, suffix) assigns the five
+// "*_user_lock_with_checks_" function pointers (acquire, release, test, init,
+// destroy) from the concrete routines named
+//   __kmp_<op><nest><kind>_<suffix>
+// built by token pasting. For example nest = _, kind = tas, suffix = lock
+// selects __kmp_acquire_tas_lock, while nest = _nested_ selects
+// __kmp_acquire_nested_tas_lock and assigns the "*_nested_user_lock_*" pointers.
+// The concrete routines are cast to the generic kmp_user_lock_p signatures.
+// The macro expands to a braced block of assignments.
+#define KMP_BIND_USER_LOCK_TEMPLATE(nest, kind, suffix)                        \
+  {                                                                            \
+    __kmp_acquire##nest##user_lock_with_checks_ = (int (*)(                    \
+        kmp_user_lock_p, kmp_int32))__kmp_acquire##nest##kind##_##suffix;      \
+    __kmp_release##nest##user_lock_with_checks_ = (int (*)(                    \
+        kmp_user_lock_p, kmp_int32))__kmp_release##nest##kind##_##suffix;      \
+    __kmp_test##nest##user_lock_with_checks_ = (int (*)(                       \
+        kmp_user_lock_p, kmp_int32))__kmp_test##nest##kind##_##suffix;         \
+    __kmp_init##nest##user_lock_with_checks_ =                                 \
+        (void (*)(kmp_user_lock_p))__kmp_init##nest##kind##_##suffix;          \
+    __kmp_destroy##nest##user_lock_with_checks_ =                              \
+        (void (*)(kmp_user_lock_p))__kmp_destroy##nest##kind##_##suffix;       \
+  }
+
+// Convenience forms: simple vs. nested locks, each bound either to the plain
+// "<kind>_lock" routines or to the "<kind>_lock_with_checks" routines.
+#define KMP_BIND_USER_LOCK(kind) KMP_BIND_USER_LOCK_TEMPLATE(_, kind, lock)
+#define KMP_BIND_USER_LOCK_WITH_CHECKS(kind)                                   \
+  KMP_BIND_USER_LOCK_TEMPLATE(_, kind, lock_with_checks)
+#define KMP_BIND_NESTED_USER_LOCK(kind)                                        \
+  KMP_BIND_USER_LOCK_TEMPLATE(_nested_, kind, lock)
+#define KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(kind)                            \
+  KMP_BIND_USER_LOCK_TEMPLATE(_nested_, kind, lock_with_checks)
+
+// User lock table & lock allocation
+/* On 64-bit Linux* OS (and OS X*) GNU compiler allocates only 4 bytes of
+   memory for lock variable, which is not enough to store a pointer, so we have
+   to use lock indexes instead of pointers and maintain lock table to map
+   indexes to pointers.
+
+
+   Note: The first element of the table is not a pointer to lock! It is a
+   pointer to previously allocated table (or NULL if it is the first table).
+
+   Usage:
+
+   if ( OMP_LOCK_T_SIZE < sizeof( <lock> ) ) { // or OMP_NEST_LOCK_T_SIZE
+     Lock table is fully utilized. User locks are indexes, so table is used on
+     user lock operation.
+     Note: it may be the case (lin_32) that we don't need to use a lock
+     table for regular locks, but do need the table for nested locks.
+   }
+   else {
+     Lock table initialized but not actually used.
+   }
+*/
+
+struct kmp_lock_table {
+  kmp_lock_index_t used; // Number of used elements
+  kmp_lock_index_t allocated; // Number of allocated elements
+  kmp_user_lock_p *table; // Lock table.
+};
+
+typedef struct kmp_lock_table kmp_lock_table_t;
+
+extern kmp_lock_table_t __kmp_user_lock_table;
+extern kmp_user_lock_p __kmp_lock_pool;
+
+struct kmp_block_of_locks {
+  struct kmp_block_of_locks *next_block;
+  void *locks;
+};
+
+typedef struct kmp_block_of_locks kmp_block_of_locks_t;
+
+extern kmp_block_of_locks_t *__kmp_lock_blocks;
+extern int __kmp_num_locks_in_block;
+
+extern kmp_user_lock_p __kmp_user_lock_allocate(void **user_lock,
+                                                kmp_int32 gtid,
+                                                kmp_lock_flags_t flags);
+extern void __kmp_user_lock_free(void **user_lock, kmp_int32 gtid,
+                                 kmp_user_lock_p lck);
+extern kmp_user_lock_p __kmp_lookup_user_lock(void **user_lock,
+                                              char const *func);
+extern void __kmp_cleanup_user_locks();
+
+// Ensures the __kmp_init_user_locks flag is set. The flag is read first
+// without the lock; only when it is still clear is __kmp_initz_lock taken, the
+// flag re-read under the lock, and then written to TRUE if it is still clear.
+// Expands to a braced block, so avoid using it as the unbraced body of an if
+// that has an else.
+#define KMP_CHECK_USER_LOCK_INIT()                                             \
+  {                                                                            \
+    if (!TCR_4(__kmp_init_user_locks)) {                                       \
+      __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);                         \
+      if (!TCR_4(__kmp_init_user_locks)) {                                     \
+        TCW_4(__kmp_init_user_locks, TRUE);                                    \
+      }                                                                        \
+      __kmp_release_bootstrap_lock(&__kmp_initz_lock);                         \
+    }                                                                          \
+  }
+
+#endif // KMP_USE_DYNAMIC_LOCK
+
+#undef KMP_PAD
+#undef KMP_GTID_DNE
+
+#if KMP_USE_DYNAMIC_LOCK
+// KMP_USE_DYNAMIC_LOCK enables dynamic dispatch of lock functions without
+// breaking the current compatibility. Essential functionality of this new code
+// is dynamic dispatch, but it also implements (or enables implementation of)
+// hinted user lock and critical section which will be part of OMP 4.5 soon.
+//
+// Lock type can be decided at creation time (i.e., lock initialization), and
+// subsequent lock function call on the created lock object requires type
+// extraction and call through jump table using the extracted type. This type
+// information is stored in two different ways depending on the size of the lock
+// object, and we differentiate lock types by this size requirement - direct and
+// indirect locks.
+//
+// Direct locks:
+// A direct lock object fits into the space created by the compiler for an
+// omp_lock_t object, and TAS/Futex lock falls into this category. We use low
+// one byte of the lock object as the storage for the lock type, and appropriate
+// bit operation is required to access the data meaningful to the lock
+// algorithms. Also, to differentiate direct lock from indirect lock, 1 is
+// written to LSB of the lock object. The newly introduced "hle" lock is also a
+// direct lock.
+//
+// Indirect locks:
+// An indirect lock object requires more space than the compiler-generated
+// space, and it should be allocated from heap. Depending on the size of the
+// compiler-generated space for the lock (i.e., size of omp_lock_t), this
+// omp_lock_t object stores either the address of the heap-allocated indirect
+// lock (void * fits in the object) or an index to the indirect lock table entry
+// that holds the address. Ticket/Queuing/DRDPA/Adaptive lock falls into this
+// category, and the newly introduced "rtm" lock is also an indirect lock which
+// was implemented on top of the Queuing lock. When the omp_lock_t object holds
+// an index (not lock address), 0 is written to LSB to differentiate the lock
+// from a direct lock, and the remaining part is the actual index to the
+// indirect lock table.
+
+#include <stdint.h> // for uintptr_t
+
+// Shortcuts: whether the TAS / futex fast paths are inlined at the call site.
+// The trailing "&& 1" / "&& 0" look like manual on/off switches (presumably
+// for experimentation -- confirm before relying on it); as written the futex
+// variant is always disabled. Both replacement lists are fully parenthesized
+// so that uses such as !KMP_USE_INLINED_FUTEX negate the whole condition
+// rather than only its first operand.
+#define KMP_USE_INLINED_TAS                                                    \
+  ((KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM)) && 1)
+#define KMP_USE_INLINED_FUTEX (KMP_USE_FUTEX && 0)
+
+// List of lock definitions, expressed as X-macros: KMP_FOREACH_D_LOCK(m, a)
+// applies m(<name>, a) to every direct lock kind and KMP_FOREACH_I_LOCK(m, a)
+// does the same for every indirect lock kind, in a fixed order.
+// All nested locks are indirect lock types.
+// hle lock is xchg lock prefixed with XACQUIRE/XRELEASE.
+// Four variants exist, selected by KMP_USE_TSX (adds hle, adaptive and rtm)
+// and KMP_USE_FUTEX (adds futex and nested_futex). KMP_LAST_D_LOCK names the
+// last entry of the direct list for the selected variant.
+#if KMP_USE_TSX
+#if KMP_USE_FUTEX
+#define KMP_FOREACH_D_LOCK(m, a) m(tas, a) m(futex, a) m(hle, a)
+#define KMP_FOREACH_I_LOCK(m, a)                                               \
+  m(ticket, a) m(queuing, a) m(adaptive, a) m(drdpa, a) m(rtm, a)              \
+      m(nested_tas, a) m(nested_futex, a) m(nested_ticket, a)                  \
+          m(nested_queuing, a) m(nested_drdpa, a)
+#else
+#define KMP_FOREACH_D_LOCK(m, a) m(tas, a) m(hle, a)
+#define KMP_FOREACH_I_LOCK(m, a)                                               \
+  m(ticket, a) m(queuing, a) m(adaptive, a) m(drdpa, a) m(rtm, a)              \
+      m(nested_tas, a) m(nested_ticket, a) m(nested_queuing, a)                \
+          m(nested_drdpa, a)
+#endif // KMP_USE_FUTEX
+#define KMP_LAST_D_LOCK lockseq_hle
+#else
+#if KMP_USE_FUTEX
+#define KMP_FOREACH_D_LOCK(m, a) m(tas, a) m(futex, a)
+#define KMP_FOREACH_I_LOCK(m, a)                                               \
+  m(ticket, a) m(queuing, a) m(drdpa, a) m(nested_tas, a) m(nested_futex, a)   \
+      m(nested_ticket, a) m(nested_queuing, a) m(nested_drdpa, a)
+#define KMP_LAST_D_LOCK lockseq_futex
+#else
+#define KMP_FOREACH_D_LOCK(m, a) m(tas, a)
+#define KMP_FOREACH_I_LOCK(m, a)                                               \
+  m(ticket, a) m(queuing, a) m(drdpa, a) m(nested_tas, a) m(nested_ticket, a)  \
+      m(nested_queuing, a) m(nested_drdpa, a)
+#define KMP_LAST_D_LOCK lockseq_tas
+#endif // KMP_USE_FUTEX
+#endif // KMP_USE_TSX
+
+// Information used in dynamic dispatch
+#define KMP_LOCK_SHIFT                                                         \
+  8 // number of low bits to be used as tag for direct locks
+// First direct lock and first/last indirect lock in kmp_dyna_lockseq_t. These
+// must match the first and last entries of the KMP_FOREACH_*_LOCK lists.
+#define KMP_FIRST_D_LOCK lockseq_tas
+#define KMP_FIRST_I_LOCK lockseq_ticket
+#define KMP_LAST_I_LOCK lockseq_nested_drdpa
+// locktag_nested_drdpa is the last indirect tag, so +1 gives the tag count.
+#define KMP_NUM_I_LOCKS                                                        \
+  (locktag_nested_drdpa + 1) // number of indirect lock types
+
+// Base type for dynamic locks: one 32-bit word whose low KMP_LOCK_SHIFT bits
+// carry the tag of a direct lock.
+typedef kmp_uint32 kmp_dyna_lock_t;
+
+// Lock sequence that enumerates all lock kinds. Always make this enumeration
+// consistent with kmp_lockseq_t in the include directory.
+// Value 0 is reserved for lockseq_indirect; the direct kinds follow (so
+// lockseq_tas == 1) and then the indirect kinds, in KMP_FOREACH_*_LOCK order.
+typedef enum {
+  lockseq_indirect = 0,
+#define expand_seq(l, a) lockseq_##l,
+  KMP_FOREACH_D_LOCK(expand_seq, 0) KMP_FOREACH_I_LOCK(expand_seq, 0)
+#undef expand_seq
+} kmp_dyna_lockseq_t;
+
+// Enumerates indirect lock tags: locktag_<kind> for every kind listed by
+// KMP_FOREACH_I_LOCK, numbered from 0 in list order. These values index the
+// __kmp_indirect_* jump tables.
+typedef enum {
+#define expand_tag(l, a) locktag_##l,
+  KMP_FOREACH_I_LOCK(expand_tag, 0)
+#undef expand_tag
+} kmp_indirect_locktag_t;
+
+// Utility macros that extract information from lock sequences.
+// KMP_IS_D_LOCK / KMP_IS_I_LOCK: range tests on a kmp_dyna_lockseq_t value
+// (seq is evaluated twice, so it must be free of side effects).
+#define KMP_IS_D_LOCK(seq)                                                     \
+  ((seq) >= KMP_FIRST_D_LOCK && (seq) <= KMP_LAST_D_LOCK)
+#define KMP_IS_I_LOCK(seq)                                                     \
+  ((seq) >= KMP_FIRST_I_LOCK && (seq) <= KMP_LAST_I_LOCK)
+// Indirect tag = zero-based position within the indirect part of the
+// sequence. The whole cast expression is parenthesized so the macro behaves
+// as a single primary expression wherever it is used (e.g. under sizeof).
+#define KMP_GET_I_TAG(seq) ((kmp_indirect_locktag_t)((seq)-KMP_FIRST_I_LOCK))
+// Direct tag = sequence number shifted left by one with the low bit set, so
+// every direct tag is odd.
+#define KMP_GET_D_TAG(seq) ((seq) << 1 | 1)
+
+// Enumerates direct lock tags: locktag_<kind> = KMP_GET_D_TAG(lockseq_<kind>)
+// for every kind listed by KMP_FOREACH_D_LOCK. Because lockseq_tas is 1, the
+// first direct tag is 3 and all direct tags are odd; tag 0 is left for
+// indirect locks (see KMP_INIT_I_LOCK, which uses __kmp_direct_init[0]).
+typedef enum {
+#define expand_tag(l, a) locktag_##l = KMP_GET_D_TAG(lockseq_##l),
+  KMP_FOREACH_D_LOCK(expand_tag, 0)
+#undef expand_tag
+} kmp_direct_locktag_t;
+
+// Indirect lock type: pairs the underlying lock object with the tag that
+// selects its entry in the __kmp_indirect_* jump tables.
+typedef struct {
+  kmp_user_lock_p lock; // the actual lock object
+  kmp_indirect_locktag_t type; // which indirect lock kind `lock` is
+} kmp_indirect_lock_t;
+
+// Function tables for direct locks. Set/unset/test differentiate functions
+// with/without consistency checking.
+extern void (*__kmp_direct_init[])(kmp_dyna_lock_t *, kmp_dyna_lockseq_t);
+extern void (*(*__kmp_direct_destroy))(kmp_dyna_lock_t *);
+extern int (*(*__kmp_direct_set))(kmp_dyna_lock_t *, kmp_int32);
+extern int (*(*__kmp_direct_unset))(kmp_dyna_lock_t *, kmp_int32);
+extern int (*(*__kmp_direct_test))(kmp_dyna_lock_t *, kmp_int32);
+
+// Function tables for indirect locks. Set/unset/test differentiate functions
+// with/without consistency checking.
+extern void (*__kmp_indirect_init[])(kmp_user_lock_p);
+extern void (*(*__kmp_indirect_destroy))(kmp_user_lock_p);
+extern int (*(*__kmp_indirect_set))(kmp_user_lock_p, kmp_int32);
+extern int (*(*__kmp_indirect_unset))(kmp_user_lock_p, kmp_int32);
+extern int (*(*__kmp_indirect_test))(kmp_user_lock_p, kmp_int32);
+
+// Extracts direct lock tag from a user lock pointer.
+// The lock word is masked to its low KMP_LOCK_SHIFT bits and then ANDed with
+// -(word & 1): when the low bit is 1 (direct lock) that is an all-ones mask
+// and the tag is returned; when it is 0 the result is 0, which is the table
+// slot used for indirect locks.
+#define KMP_EXTRACT_D_TAG(l)                                                   \
+  (*((kmp_dyna_lock_t *)(l)) & ((1 << KMP_LOCK_SHIFT) - 1) &                   \
+   -(*((kmp_dyna_lock_t *)(l)) & 1))
+
+// Extracts indirect lock index from a user lock pointer (the stored value
+// shifted right by one to drop the low marker bit).
+#define KMP_EXTRACT_I_INDEX(l) (*(kmp_lock_index_t *)(l) >> 1)
+
+// Returns function pointer to the direct lock function with l (kmp_dyna_lock_t
+// *) and op (operation type). op is pasted onto "__kmp_direct_", e.g. set,
+// unset, test or destroy.
+#define KMP_D_LOCK_FUNC(l, op) __kmp_direct_##op[KMP_EXTRACT_D_TAG(l)]
+
+// Returns function pointer to the indirect lock function with l
+// (kmp_indirect_lock_t *) and op (operation type). op is pasted onto
+// "__kmp_indirect_" and the table is indexed by the lock's type tag.
+#define KMP_I_LOCK_FUNC(l, op)                                                 \
+  __kmp_indirect_##op[((kmp_indirect_lock_t *)(l))->type]
+
+// Initializes a direct lock with the given lock pointer and lock sequence.
+// Dispatches through __kmp_direct_init at the direct tag of seq. The pointer
+// argument is parenthesized before the cast (as in KMP_INIT_I_LOCK) so that an
+// argument expression such as `base + i` is cast as a whole.
+#define KMP_INIT_D_LOCK(l, seq)                                                \
+  __kmp_direct_init[KMP_GET_D_TAG(seq)]((kmp_dyna_lock_t *)(l), seq)
+
+// Initializes an indirect lock with the given lock pointer and lock sequence.
+// Slot 0 of __kmp_direct_init is the entry used for all indirect locks.
+#define KMP_INIT_I_LOCK(l, seq)                                                \
+  __kmp_direct_init[0]((kmp_dyna_lock_t *)(l), seq)
+
+// Returns "free" lock value for the given lock type: just the type's tag.
+#define KMP_LOCK_FREE(type) (locktag_##type)
+
+// Returns "busy" lock value for the given lock type: v shifted above the tag
+// bits, combined with the type's tag.
+#define KMP_LOCK_BUSY(v, type) ((v) << KMP_LOCK_SHIFT | locktag_##type)
+
+// Returns lock value after removing (shifting) lock tag.
+#define KMP_LOCK_STRIP(v) ((v) >> KMP_LOCK_SHIFT)
+
+// Initializes global states and data structures for managing dynamic user
+// locks.
+extern void __kmp_init_dynamic_user_locks();
+
+// Allocates and returns an indirect lock with the given indirect lock tag.
+extern kmp_indirect_lock_t *
+__kmp_allocate_indirect_lock(void **, kmp_int32, kmp_indirect_locktag_t);
+
+// Cleans up global states and data structures for managing dynamic user locks.
+extern void __kmp_cleanup_indirect_user_locks();
+
+// Default user lock sequence when not using hinted locks.
+extern kmp_dyna_lockseq_t __kmp_user_lock_seq;
+
+// Jump table for "set lock location", available only for indirect locks.
+// Indexed by kmp_indirect_locktag_t; an entry may be NULL.
+extern void (*__kmp_indirect_set_location[KMP_NUM_I_LOCKS])(kmp_user_lock_p,
+                                                            const ident_t *);
+// Calls the setter for lck's type with (lck->lock, loc) when one is present
+// and does nothing otherwise. lck is a kmp_indirect_lock_t pointer and is
+// evaluated more than once. Expands to a braced block, not an expression.
+#define KMP_SET_I_LOCK_LOCATION(lck, loc)                                      \
+  {                                                                            \
+    if (__kmp_indirect_set_location[(lck)->type] != NULL)                      \
+      __kmp_indirect_set_location[(lck)->type]((lck)->lock, loc);              \
+  }
+
+// Jump table for "set lock flags", available only for indirect locks.
+// Indexed by kmp_indirect_locktag_t; an entry may be NULL.
+extern void (*__kmp_indirect_set_flags[KMP_NUM_I_LOCKS])(kmp_user_lock_p,
+                                                         kmp_lock_flags_t);
+// Calls the setter for lck's type with (lck->lock, flag) when one is present
+// and does nothing otherwise. lck is a kmp_indirect_lock_t pointer and is
+// evaluated more than once. Expands to a braced block, not an expression.
+#define KMP_SET_I_LOCK_FLAGS(lck, flag)                                        \
+  {                                                                            \
+    if (__kmp_indirect_set_flags[(lck)->type] != NULL)                         \
+      __kmp_indirect_set_flags[(lck)->type]((lck)->lock, flag);                \
+  }
+
+// Jump table for "get lock location", available only for indirect locks.
+// Indexed by kmp_indirect_locktag_t; an entry may be NULL.
+extern const ident_t *(*__kmp_indirect_get_location[KMP_NUM_I_LOCKS])(
+    kmp_user_lock_p);
+// Evaluates to the location reported by the getter for lck's type, or NULL
+// when that type has no getter. lck is a kmp_indirect_lock_t pointer and is
+// evaluated more than once.
+#define KMP_GET_I_LOCK_LOCATION(lck)                                           \
+  (__kmp_indirect_get_location[(lck)->type] != NULL                            \
+       ? __kmp_indirect_get_location[(lck)->type]((lck)->lock)                 \
+       : NULL)
+
+// Jump table for "get lock flags", available only for indirect locks.
+// Indexed by kmp_indirect_locktag_t; an entry may be NULL.
+extern kmp_lock_flags_t (*__kmp_indirect_get_flags[KMP_NUM_I_LOCKS])(
+    kmp_user_lock_p);
+// Evaluates to the flags reported by the getter for lck's type, or 0 ("no
+// flags") when that type has no getter. lck is a kmp_indirect_lock_t pointer
+// and is evaluated more than once.
+// The fallback is the literal 0 rather than the pointer constant NULL: the
+// result is a kmp_lock_flags_t flags value, and mixing it with NULL in a
+// conditional expression draws "NULL used in arithmetic" style diagnostics
+// (kmp_lock_flags_t is declared outside this excerpt; 0 is a valid operand
+// whichever way it is defined).
+#define KMP_GET_I_LOCK_FLAGS(lck)                                              \
+  (__kmp_indirect_get_flags[(lck)->type] != NULL                               \
+       ? __kmp_indirect_get_flags[(lck)->type]((lck)->lock)                    \
+       : 0)
+
+#define KMP_I_LOCK_CHUNK                                                       \
+  1024 // number of kmp_indirect_lock_t objects to be allocated together
+
+// Lock table for indirect locks. `table` is an array of pointers to blocks;
+// KMP_GET_I_LOCK addresses each block as KMP_I_LOCK_CHUNK consecutive
+// kmp_indirect_lock_t objects.
+typedef struct kmp_indirect_lock_table {
+  kmp_indirect_lock_t **table; // blocks of indirect locks allocated
+  kmp_lock_index_t size; // size of the indirect lock table
+  kmp_lock_index_t next; // index to the next lock to be allocated
+} kmp_indirect_lock_table_t;
+
+extern kmp_indirect_lock_table_t __kmp_i_lock_table;
+
+// Returns the indirect lock associated with the given index, as a
+// kmp_indirect_lock_t pointer: index / KMP_I_LOCK_CHUNK selects the block and
+// index % KMP_I_LOCK_CHUNK the slot inside it. No bounds check is performed.
+#define KMP_GET_I_LOCK(index)                                                  \
+  (*(__kmp_i_lock_table.table + (index) / KMP_I_LOCK_CHUNK) +                  \
+   (index) % KMP_I_LOCK_CHUNK)
+
+// Number of locks in a lock block, which is fixed to "1" now.
+// TODO: No lock block implementation now. If we do support, we need to manage
+// lock block data structure for each indirect lock type.
+extern int __kmp_num_locks_in_block;
+
+// Fast lock table lookup without consistency checking.
+// When the user-visible lock object is too small to hold a pointer
+// (OMP_LOCK_T_SIZE < sizeof(void *)) it stores an index, which is extracted
+// and resolved through the indirect lock table; otherwise the object stores
+// the kmp_indirect_lock_t pointer itself and it is read directly.
+#define KMP_LOOKUP_I_LOCK(l)                                                   \
+  ((OMP_LOCK_T_SIZE < sizeof(void *)) ? KMP_GET_I_LOCK(KMP_EXTRACT_I_INDEX(l)) \
+                                      : *((kmp_indirect_lock_t **)(l)))
+
+// Used once in kmp_error.cpp
+extern kmp_int32 __kmp_get_user_lock_owner(kmp_user_lock_p, kmp_uint32);
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+// Without dynamic locks there is no tag stored in the lock word, so the
+// busy/strip helpers pass the value through and "free" is simply 0.
+#define KMP_LOCK_BUSY(v, type) (v)
+#define KMP_LOCK_FREE(type) 0
+#define KMP_LOCK_STRIP(v) (v)
+
+#endif // KMP_USE_DYNAMIC_LOCK
+
+// Data structure for using backoff within spin locks. An instance is passed
+// by pointer to __kmp_spin_backoff(); the runtime defaults live in
+// __kmp_spin_backoff_params.
+typedef struct {
+  kmp_uint32 step; // current step
+  kmp_uint32 max_backoff; // upper bound of outer delay loop
+  kmp_uint32 min_tick; // size of inner delay loop in ticks (machine-dependent)
+} kmp_backoff_t;
+
+// Runtime's default backoff parameters
+extern kmp_backoff_t __kmp_spin_backoff_params;
+
+// Backoff function
+extern void __kmp_spin_backoff(kmp_backoff_t *);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif /* KMP_LOCK_H */
diff --git a/final/runtime/src/kmp_omp.h b/final/runtime/src/kmp_omp.h
new file mode 100644
index 0000000..27b550d
--- /dev/null
+++ b/final/runtime/src/kmp_omp.h
@@ -0,0 +1,235 @@
+#if USE_DEBUGGER
+/*
+ * kmp_omp.h -- OpenMP definition for kmp_omp_struct_info_t.
+ *              This is for information about runtime library structures.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+/* THIS FILE SHOULD NOT BE MODIFIED IN IDB INTERFACE LIBRARY CODE
+   It should instead be modified in the OpenMP runtime and copied to the
+   interface library code.  This way we can minimize the problems that this is
+   sure to cause having two copies of the same file.
+
+   Files live in libomp and libomp_db/src/include  */
+
+/* CHANGE THIS WHEN STRUCTURES BELOW CHANGE
+   Before we release this to a customer, please don't change this value.  After
+   it is released and stable, then any new updates to the structures or data
+   structure traversal algorithms need to change this value. */
+#define KMP_OMP_VERSION 9
+
+// Describes one field of a runtime structure: where it sits inside the
+// structure and how large it is (units presumably bytes -- confirm against the
+// code that fills kmp_omp_struct_info_t).
+typedef struct {
+  kmp_int32 offset;
+  kmp_int32 size;
+} offset_and_size_t;
+
+// Describes one runtime global: its address, stored as a 64-bit integer
+// rather than a native pointer, and its size. `padding` is an explicit filler
+// field that brings the members to 16 bytes in total.
+typedef struct {
+  kmp_uint64 addr;
+  kmp_int32 size;
+  kmp_int32 padding;
+} addr_and_size_t;
+
+// One record of a per-parallel-region thread count request. Pointers are
+// stored as 64-bit integers, not native pointers.
+typedef struct {
+  kmp_uint64 flags; // Flags for future extensions.
+  kmp_uint64
+      file; // Pointer to name of source file where the parallel region is.
+  kmp_uint64 func; // Pointer to name of routine where the parallel region is.
+  kmp_int32 begin; // Beginning of source line range.
+  kmp_int32 end; // End of source line range.
+  kmp_int32 num_threads; // Specified number of threads.
+} kmp_omp_nthr_item_t;
+
+typedef struct {
+  kmp_int32 num; // Number of items in the array.
+  kmp_uint64 array; // Address of array of kmp_omp_nthr_item_t.
+} kmp_omp_nthr_info_t;
+
+/* This structure is known to the idb interface library.
+   It is a self-description of the runtime: addr_and_size_t members locate
+   runtime globals, offset_and_size_t members locate fields inside runtime
+   structures, and each *_sizeof_struct member gives the size of the structure
+   whose fields follow it. Field order and sizes are part of the interface;
+   see the KMP_OMP_VERSION note above before changing anything. */
+typedef struct {
+
+  /* Change this only if you make a fundamental data structure change here */
+  kmp_int32 lib_version;
+
+  /* sanity check.  Only should be checked if versions are identical
+   * This is also used for backward compatibility to get the runtime
+   * structure size if the runtime is older than the interface */
+  kmp_int32 sizeof_this_structure;
+
+  /* OpenMP RTL version info. */
+  addr_and_size_t major;
+  addr_and_size_t minor;
+  addr_and_size_t build;
+  addr_and_size_t openmp_version;
+  addr_and_size_t banner;
+
+  /* Various globals. */
+  addr_and_size_t threads; // Pointer to __kmp_threads.
+  addr_and_size_t roots; // Pointer to __kmp_root.
+  addr_and_size_t capacity; // Pointer to __kmp_threads_capacity.
+#if KMP_USE_MONITOR
+  addr_and_size_t monitor; // Pointer to __kmp_monitor.
+#endif
+#if !KMP_USE_DYNAMIC_LOCK
+  addr_and_size_t lock_table; // Pointer to __kmp_lock_table.
+#endif
+  addr_and_size_t func_microtask;
+  addr_and_size_t func_fork;
+  addr_and_size_t func_fork_teams;
+  addr_and_size_t team_counter;
+  addr_and_size_t task_counter;
+  addr_and_size_t nthr_info;
+  kmp_int32 address_width;
+  kmp_int32 indexed_locks;
+  kmp_int32 last_barrier; // The end in enum barrier_type
+  kmp_int32 deque_size; // TASK_DEQUE_SIZE
+
+  /* thread structure information. */
+  kmp_int32 th_sizeof_struct;
+  offset_and_size_t th_info; // descriptor for thread
+  offset_and_size_t th_team; // team for this thread
+  offset_and_size_t th_root; // root for this thread
+  offset_and_size_t th_serial_team; // serial team under this thread
+  offset_and_size_t th_ident; // location for this thread (if available)
+  offset_and_size_t th_spin_here; // is thread waiting for lock (if available)
+  offset_and_size_t
+      th_next_waiting; // next thread waiting for lock (if available)
+  offset_and_size_t th_task_team; // task team struct
+  offset_and_size_t th_current_task; // innermost task being executed
+  offset_and_size_t
+      th_task_state; // alternating 0/1 for task team identification
+  offset_and_size_t th_bar;
+  offset_and_size_t th_b_worker_arrived; // the worker increases it by 1 when it
+  // arrives to the barrier
+
+  /* teams information */
+  offset_and_size_t th_teams_microtask; // entry address for teams construct
+  offset_and_size_t th_teams_level; // initial level of teams construct
+  offset_and_size_t th_teams_nteams; // number of teams in a league
+  offset_and_size_t
+      th_teams_nth; // number of threads in each team of the league
+
+  /* kmp_desc structure (for info field above) */
+  kmp_int32 ds_sizeof_struct;
+  offset_and_size_t ds_tid; // team thread id
+  offset_and_size_t ds_gtid; // global thread id
+  offset_and_size_t ds_thread; // native thread id
+
+  /* team structure information */
+  kmp_int32 t_sizeof_struct;
+  offset_and_size_t t_master_tid; // tid of master in parent team
+  offset_and_size_t t_ident; // location of parallel region
+  offset_and_size_t t_parent; // parent team
+  offset_and_size_t t_nproc; // # team threads
+  offset_and_size_t t_threads; // array of threads
+  offset_and_size_t t_serialized; // # levels of serialized teams
+  offset_and_size_t t_id; // unique team id
+  offset_and_size_t t_pkfn;
+  offset_and_size_t t_task_team; // task team structure
+  offset_and_size_t t_implicit_task; // taskdata for the thread's implicit task
+  offset_and_size_t t_cancel_request;
+  offset_and_size_t t_bar;
+  offset_and_size_t
+      t_b_master_arrived; // increased by 1 when master arrives to a barrier
+  offset_and_size_t
+      t_b_team_arrived; // increased by one when all the threads arrived
+
+  /* root structure information */
+  kmp_int32 r_sizeof_struct;
+  offset_and_size_t r_root_team; // team at root
+  offset_and_size_t r_hot_team; // hot team for this root
+  offset_and_size_t r_uber_thread; // root thread
+  offset_and_size_t r_root_id; // unique root id (if available)
+
+  /* ident structure information */
+  kmp_int32 id_sizeof_struct;
+  offset_and_size_t
+      id_psource; /* address of string ";file;func;line1;line2;;". */
+  offset_and_size_t id_flags;
+
+  /* lock structure information */
+  kmp_int32 lk_sizeof_struct;
+  offset_and_size_t lk_initialized;
+  offset_and_size_t lk_location;
+  offset_and_size_t lk_tail_id;
+  offset_and_size_t lk_head_id;
+  offset_and_size_t lk_next_ticket;
+  offset_and_size_t lk_now_serving;
+  offset_and_size_t lk_owner_id;
+  offset_and_size_t lk_depth_locked;
+  offset_and_size_t lk_lock_flags;
+
+#if !KMP_USE_DYNAMIC_LOCK
+  /* lock_table_t */
+  kmp_int32 lt_size_of_struct; /* Size and layout of kmp_lock_table_t. */
+  offset_and_size_t lt_used;
+  offset_and_size_t lt_allocated;
+  offset_and_size_t lt_table;
+#endif
+
+  /* task_team_t */
+  kmp_int32 tt_sizeof_struct;
+  offset_and_size_t tt_threads_data;
+  offset_and_size_t tt_found_tasks;
+  offset_and_size_t tt_nproc;
+  offset_and_size_t tt_unfinished_threads;
+  offset_and_size_t tt_active;
+
+  /* kmp_taskdata_t */
+  kmp_int32 td_sizeof_struct;
+  offset_and_size_t td_task_id; // task id
+  offset_and_size_t td_flags; // task flags
+  offset_and_size_t td_team; // team for this task
+  offset_and_size_t td_parent; // parent task
+  // NOTE(review): "testing" below is presumably a typo for "nesting" -- confirm
+  // against kmp_taskdata_t before changing the wording.
+  offset_and_size_t td_level; // task testing level
+  offset_and_size_t td_ident; // task identifier
+  offset_and_size_t td_allocated_child_tasks; // child tasks (+ current task)
+  // not yet deallocated
+  offset_and_size_t td_incomplete_child_tasks; // child tasks not yet complete
+
+  /* Taskwait */
+  offset_and_size_t td_taskwait_ident;
+  offset_and_size_t td_taskwait_counter;
+  offset_and_size_t
+      td_taskwait_thread; // gtid + 1 of thread encountered taskwait
+
+  /* Taskgroup */
+  offset_and_size_t td_taskgroup; // pointer to the current taskgroup
+  offset_and_size_t
+      td_task_count; // number of allocated and not yet complete tasks
+  offset_and_size_t td_cancel; // request for cancellation of this taskgroup
+
+  /* Task dependency */
+  offset_and_size_t
+      td_depnode; // pointer to graph node if the task has dependencies
+  offset_and_size_t dn_node;
+  offset_and_size_t dn_next;
+  offset_and_size_t dn_successors;
+  offset_and_size_t dn_task;
+  offset_and_size_t dn_npredecessors;
+  offset_and_size_t dn_nrefs;
+  offset_and_size_t dn_routine;
+
+  /* kmp_thread_data_t */
+  kmp_int32 hd_sizeof_struct;
+  offset_and_size_t hd_deque;
+  offset_and_size_t hd_deque_size;
+  offset_and_size_t hd_deque_head;
+  offset_and_size_t hd_deque_tail;
+  offset_and_size_t hd_deque_ntasks;
+  offset_and_size_t hd_deque_last_stolen;
+
+  // The last field of stable version.
+  kmp_uint64 last_field;
+
+} kmp_omp_struct_info_t;
+
+#endif /* USE_DEBUGGER */
+
+/* end of file */
diff --git a/final/runtime/src/kmp_os.h b/final/runtime/src/kmp_os.h
new file mode 100644
index 0000000..c4c7bcf
--- /dev/null
+++ b/final/runtime/src/kmp_os.h
@@ -0,0 +1,1040 @@
+/*
+ * kmp_os.h -- KPTS runtime header file.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_OS_H
+#define KMP_OS_H
+
+#include "kmp_config.h"
+#include <stdlib.h>
+#include <atomic>
+
+#define KMP_FTN_PLAIN 1
+#define KMP_FTN_APPEND 2
+#define KMP_FTN_UPPER 3
+/*
+#define KMP_FTN_PREPEND 4
+#define KMP_FTN_UAPPEND 5
+*/
+
+#define KMP_PTR_SKIP (sizeof(void *))
+
+/* -------------------------- Compiler variations ------------------------ */
+
+#define KMP_OFF 0
+#define KMP_ON 1
+
+#define KMP_MEM_CONS_VOLATILE 0
+#define KMP_MEM_CONS_FENCE 1
+
+#ifndef KMP_MEM_CONS_MODEL
+#define KMP_MEM_CONS_MODEL KMP_MEM_CONS_VOLATILE
+#endif
+
+#ifndef __has_cpp_attribute
+#define __has_cpp_attribute(x) 0
+#endif
+
+#ifndef __has_attribute
+#define __has_attribute(x) 0
+#endif
+
+/* ------------------------- Compiler recognition ---------------------- */
+#define KMP_COMPILER_ICC 0
+#define KMP_COMPILER_GCC 0
+#define KMP_COMPILER_CLANG 0
+#define KMP_COMPILER_MSVC 0
+
+#if defined(__INTEL_COMPILER)
+#undef KMP_COMPILER_ICC
+#define KMP_COMPILER_ICC 1
+#elif defined(__clang__)
+#undef KMP_COMPILER_CLANG
+#define KMP_COMPILER_CLANG 1
+#elif defined(__GNUC__)
+#undef KMP_COMPILER_GCC
+#define KMP_COMPILER_GCC 1
+#elif defined(_MSC_VER)
+#undef KMP_COMPILER_MSVC
+#define KMP_COMPILER_MSVC 1
+#else
+#error Unknown compiler
+#endif
+
+#if (KMP_OS_LINUX || KMP_OS_WINDOWS) && !KMP_OS_CNK
+#define KMP_AFFINITY_SUPPORTED 1
+#if KMP_OS_WINDOWS && KMP_ARCH_X86_64
+#define KMP_GROUP_AFFINITY 1
+#else
+#define KMP_GROUP_AFFINITY 0
+#endif
+#else
+#define KMP_AFFINITY_SUPPORTED 0
+#define KMP_GROUP_AFFINITY 0
+#endif
+
+/* Check for quad-precision extension. */
+#define KMP_HAVE_QUAD 0
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+#if KMP_COMPILER_ICC
+/* _Quad is already defined for icc */
+#undef KMP_HAVE_QUAD
+#define KMP_HAVE_QUAD 1
+#elif KMP_COMPILER_CLANG
+/* Clang doesn't support a software-implemented
+   128-bit extended precision type yet */
+typedef long double _Quad;
+#elif KMP_COMPILER_GCC
+/* GCC on NetBSD lacks __multc3/__divtc3 builtins needed for quad */
+#if !KMP_OS_NETBSD
+typedef __float128 _Quad;
+#undef KMP_HAVE_QUAD
+#define KMP_HAVE_QUAD 1
+#endif
+#elif KMP_COMPILER_MSVC
+typedef long double _Quad;
+#endif
+#else
+#if __LDBL_MAX_EXP__ >= 16384 && KMP_COMPILER_GCC
+typedef long double _Quad;
+#undef KMP_HAVE_QUAD
+#define KMP_HAVE_QUAD 1
+#endif
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+#define KMP_USE_X87CONTROL 0
+#if KMP_OS_WINDOWS
+#define KMP_END_OF_LINE "\r\n"
+typedef char kmp_int8;
+typedef unsigned char kmp_uint8;
+typedef short kmp_int16;
+typedef unsigned short kmp_uint16;
+typedef int kmp_int32;
+typedef unsigned int kmp_uint32;
+#define KMP_INT32_SPEC "d"
+#define KMP_UINT32_SPEC "u"
+#ifndef KMP_STRUCT64
+typedef __int64 kmp_int64;
+typedef unsigned __int64 kmp_uint64;
+#define KMP_INT64_SPEC "I64d"
+#define KMP_UINT64_SPEC "I64u"
+#else
+struct kmp_struct64 {
+  kmp_int32 a, b;
+};
+typedef struct kmp_struct64 kmp_int64;
+typedef struct kmp_struct64 kmp_uint64;
+/* Not sure what to use for KMP_[U]INT64_SPEC here */
+#endif
+#if KMP_ARCH_X86 && KMP_MSVC_COMPAT
+#undef KMP_USE_X87CONTROL
+#define KMP_USE_X87CONTROL 1
+#endif
+#if KMP_ARCH_X86_64
+#define KMP_INTPTR 1
+typedef __int64 kmp_intptr_t;
+typedef unsigned __int64 kmp_uintptr_t;
+#define KMP_INTPTR_SPEC "I64d"
+#define KMP_UINTPTR_SPEC "I64u"
+#endif
+#endif /* KMP_OS_WINDOWS */
+
+#if KMP_OS_UNIX
+#define KMP_END_OF_LINE "\n"
+typedef char kmp_int8;
+typedef unsigned char kmp_uint8;
+typedef short kmp_int16;
+typedef unsigned short kmp_uint16;
+typedef int kmp_int32;
+typedef unsigned int kmp_uint32;
+typedef long long kmp_int64;
+typedef unsigned long long kmp_uint64;
+#define KMP_INT32_SPEC "d"
+#define KMP_UINT32_SPEC "u"
+#define KMP_INT64_SPEC "lld"
+#define KMP_UINT64_SPEC "llu"
+#endif /* KMP_OS_UNIX */
+
+#if KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS
+#define KMP_SIZE_T_SPEC KMP_UINT32_SPEC
+#elif KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
+#define KMP_SIZE_T_SPEC KMP_UINT64_SPEC
+#else
+#error "Can't determine size_t printf format specifier."
+#endif
+
+#if KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS /* 32-bit size_t targets */
+#define KMP_SIZE_T_MAX (0xFFFFFFFF)
+#else /* 64-bit size_t targets */
+#define KMP_SIZE_T_MAX (0xFFFFFFFFFFFFFFFF)
+#endif
+
+typedef size_t kmp_size_t;
+typedef float kmp_real32;
+typedef double kmp_real64;
+
+#ifndef KMP_INTPTR
+#define KMP_INTPTR 1
+typedef long kmp_intptr_t;
+typedef unsigned long kmp_uintptr_t;
+#define KMP_INTPTR_SPEC "ld"
+#define KMP_UINTPTR_SPEC "lu"
+#endif
+
+#ifdef BUILD_I8
+typedef kmp_int64 kmp_int;
+typedef kmp_uint64 kmp_uint;
+#else
+typedef kmp_int32 kmp_int;
+typedef kmp_uint32 kmp_uint;
+#endif /* BUILD_I8 */
+#define KMP_INT_MAX ((kmp_int32)0x7FFFFFFF)
+#define KMP_INT_MIN ((kmp_int32)0x80000000)
+
+#ifdef __cplusplus
+// macros to cast out qualifiers and to re-interpret types
+#define CCAST(type, var) const_cast<type>(var)
+#define RCAST(type, var) reinterpret_cast<type>(var)
+//-------------------------------------------------------------------------
+// template for debug prints specification ( d, u, lld, llu ), and to obtain
+// signed/unsigned flavors of a type
+template <typename T> struct traits_t {};
+// int
+template <> struct traits_t<signed int> {
+  typedef signed int signed_t;
+  typedef unsigned int unsigned_t;
+  typedef double floating_t;
+  static char const *spec;
+  static const signed_t max_value = 0x7fffffff;
+  static const signed_t min_value = 0x80000000;
+  static const int type_size = sizeof(signed_t);
+};
+// unsigned int
+template <> struct traits_t<unsigned int> {
+  typedef signed int signed_t;
+  typedef unsigned int unsigned_t;
+  typedef double floating_t;
+  static char const *spec;
+  static const unsigned_t max_value = 0xffffffff;
+  static const unsigned_t min_value = 0x00000000;
+  static const int type_size = sizeof(unsigned_t);
+};
+// long
+template <> struct traits_t<signed long> {
+  typedef signed long signed_t;
+  typedef unsigned long unsigned_t;
+  typedef long double floating_t;
+  static char const *spec;
+  static const int type_size = sizeof(signed_t);
+};
+// long long
+template <> struct traits_t<signed long long> {
+  typedef signed long long signed_t;
+  typedef unsigned long long unsigned_t;
+  typedef long double floating_t;
+  static char const *spec;
+  static const signed_t max_value = 0x7fffffffffffffffLL;
+  static const signed_t min_value = 0x8000000000000000LL;
+  static const int type_size = sizeof(signed_t);
+};
+// unsigned long long
+template <> struct traits_t<unsigned long long> {
+  typedef signed long long signed_t;
+  typedef unsigned long long unsigned_t;
+  typedef long double floating_t;
+  static char const *spec;
+  static const unsigned_t max_value = 0xffffffffffffffffLL;
+  static const unsigned_t min_value = 0x0000000000000000LL;
+  static const int type_size = sizeof(unsigned_t);
+};
+//-------------------------------------------------------------------------
+#else
+#define CCAST(type, var) (type)(var)
+#define RCAST(type, var) (type)(var)
+#endif // __cplusplus
+
+#define KMP_EXPORT extern /* export declaration in guide libraries */
+
+#if __GNUC__ >= 4 && !defined(__MINGW32__)
+#define __forceinline __inline
+#endif
+
+#if KMP_OS_WINDOWS
+#include <windows.h>
+
+static inline int KMP_GET_PAGE_SIZE(void) {
+  SYSTEM_INFO si;
+  GetSystemInfo(&si);
+  return si.dwPageSize;
+}
+#else
+#define KMP_GET_PAGE_SIZE() getpagesize()
+#endif
+
+#define PAGE_ALIGNED(_addr) /* nonzero iff _addr is a multiple of page size */ \
+  (!((size_t)(_addr) & (size_t)(KMP_GET_PAGE_SIZE() - 1)))
+#define ALIGN_TO_PAGE(x) /* x rounded down to the start of its page */         \
+  (void *)(((size_t)(x)) & ~((size_t)(KMP_GET_PAGE_SIZE() - 1)))
+
+/* ---------- Support for cache alignment, padding, etc. ----------------*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+#define INTERNODE_CACHE_LINE 4096 /* for multi-node systems */
+
+/* Define the default size of the cache line */
+#ifndef CACHE_LINE
+#define CACHE_LINE 128 /* cache line size in bytes */
+#else
+#if (CACHE_LINE < 64) && !defined(KMP_OS_DARWIN)
+// 2006-02-13: This produces too many warnings on OS X*. Disable for now
+#warning CACHE_LINE is too small.
+#endif
+#endif /* CACHE_LINE */
+
+#define KMP_CACHE_PREFETCH(ADDR) /* nothing */
+
+// KMP_FALLTHROUGH() marks an intentional fall through from the previous case
+// label so that the compiler does not diagnose it.
+//   Code from libcxx/include/__config
+// It is a function-like macro so that it must be followed by a semicolon.
+#if __cplusplus > 201402L && __has_cpp_attribute(fallthrough)
+#  define KMP_FALLTHROUGH() [[fallthrough]]
+#elif __has_cpp_attribute(clang::fallthrough)
+#  define KMP_FALLTHROUGH() [[clang::fallthrough]]
+#elif __has_attribute(fallthrough) || __GNUC__ >= 7
+#  define KMP_FALLTHROUGH() __attribute__((__fallthrough__))
+#else
+#  define KMP_FALLTHROUGH() ((void)0)
+#endif
+
+// Define attribute that indicates a function does not return
+#if __cplusplus >= 201103L
+#define KMP_NORETURN [[noreturn]]
+#elif KMP_OS_WINDOWS
+#define KMP_NORETURN __declspec(noreturn)
+#else
+#define KMP_NORETURN __attribute__((noreturn))
+#endif
+
+#if KMP_OS_WINDOWS && KMP_MSVC_COMPAT
+#define KMP_ALIGN(bytes) __declspec(align(bytes))
+#define KMP_THREAD_LOCAL __declspec(thread)
+#define KMP_ALIAS /* Nothing */
+#else
+#define KMP_ALIGN(bytes) __attribute__((aligned(bytes)))
+#define KMP_THREAD_LOCAL __thread
+#define KMP_ALIAS(alias_of) __attribute__((alias(alias_of)))
+#endif
+
+#if KMP_HAVE_WEAK_ATTRIBUTE
+#define KMP_WEAK_ATTRIBUTE __attribute__((weak))
+#else
+#define KMP_WEAK_ATTRIBUTE /* Nothing */
+#endif
+
+// Define KMP_VERSION_SYMBOL and KMP_EXPAND_NAME
+#ifndef KMP_STR
+#define KMP_STR(x) _KMP_STR(x)
+#define _KMP_STR(x) #x
+#endif
+
+#ifdef KMP_USE_VERSION_SYMBOLS
+// If using versioned symbols, KMP_EXPAND_NAME prepends
+// __kmp_api_ to the real API name
+#define KMP_EXPAND_NAME(api_name) _KMP_EXPAND_NAME(api_name)
+#define _KMP_EXPAND_NAME(api_name) __kmp_api_##api_name
+#define KMP_VERSION_SYMBOL(api_name, ver_num, ver_str)                         \
+  _KMP_VERSION_SYMBOL(api_name, ver_num, ver_str, "VERSION")
+#define _KMP_VERSION_SYMBOL(api_name, ver_num, ver_str, default_ver)            \
+  __typeof__(__kmp_api_##api_name) __kmp_api_##api_name##_##ver_num##_alias     \
+      __attribute__((alias(KMP_STR(__kmp_api_##api_name))));                    \
+  __asm__(                                                                      \
+      ".symver " KMP_STR(__kmp_api_##api_name##_##ver_num##_alias) "," KMP_STR( \
+          api_name) "@" ver_str "\n\t");                                        \
+  __asm__(".symver " KMP_STR(__kmp_api_##api_name) "," KMP_STR(                 \
+      api_name) "@@" default_ver "\n\t")
+#else // KMP_USE_VERSION_SYMBOLS
+#define KMP_EXPAND_NAME(api_name) api_name
+#define KMP_VERSION_SYMBOL(api_name, ver_num, ver_str) /* Nothing */
+#endif // KMP_USE_VERSION_SYMBOLS
+
+/* Temporary note: if performance testing of this passes, we can remove
+   all references to KMP_DO_ALIGN and replace with KMP_ALIGN.  */
+#define KMP_DO_ALIGN(bytes) KMP_ALIGN(bytes)
+#define KMP_ALIGN_CACHE KMP_ALIGN(CACHE_LINE)
+#define KMP_ALIGN_CACHE_INTERNODE KMP_ALIGN(INTERNODE_CACHE_LINE)
+
+/* General purpose fence types for memory operations */
+enum kmp_mem_fence_type {
+  kmp_no_fence, /* No memory fence */
+  kmp_acquire_fence, /* Acquire (read) memory fence */
+  kmp_release_fence, /* Release (write) memory fence */
+  kmp_full_fence /* Full (read+write) memory fence */
+};
+
+// Synchronization primitives
+
+#if KMP_ASM_INTRINS && KMP_OS_WINDOWS
+
+#if KMP_MSVC_COMPAT && !KMP_COMPILER_CLANG
+#pragma intrinsic(InterlockedExchangeAdd)
+#pragma intrinsic(InterlockedCompareExchange)
+#pragma intrinsic(InterlockedExchange)
+#pragma intrinsic(InterlockedExchange64)
+#endif
+
+// Using InterlockedIncrement / InterlockedDecrement causes a library loading
+// ordering problem, so we use InterlockedExchangeAdd instead.
+#define KMP_TEST_THEN_INC32(p) InterlockedExchangeAdd((volatile long *)(p), 1)
+#define KMP_TEST_THEN_INC_ACQ32(p)                                             \
+  InterlockedExchangeAdd((volatile long *)(p), 1)
+#define KMP_TEST_THEN_ADD4_32(p) InterlockedExchangeAdd((volatile long *)(p), 4)
+#define KMP_TEST_THEN_ADD4_ACQ32(p)                                            \
+  InterlockedExchangeAdd((volatile long *)(p), 4)
+#define KMP_TEST_THEN_DEC32(p) InterlockedExchangeAdd((volatile long *)(p), -1)
+#define KMP_TEST_THEN_DEC_ACQ32(p)                                             \
+  InterlockedExchangeAdd((volatile long *)(p), -1)
+#define KMP_TEST_THEN_ADD32(p, v)                                              \
+  InterlockedExchangeAdd((volatile long *)(p), (v))
+
+#define KMP_COMPARE_AND_STORE_RET32(p, cv, sv)                                 \
+  InterlockedCompareExchange((volatile long *)(p), (long)(sv), (long)(cv))
+
+#define KMP_XCHG_FIXED32(p, v)                                                 \
+  InterlockedExchange((volatile long *)(p), (long)(v))
+#define KMP_XCHG_FIXED64(p, v)                                                 \
+  InterlockedExchange64((volatile kmp_int64 *)(p), (kmp_int64)(v))
+
+inline kmp_real32 KMP_XCHG_REAL32(volatile kmp_real32 *p, kmp_real32 v) {
+  kmp_int32 tmp = InterlockedExchange((volatile long *)p, *(long *)&v);
+  return *(kmp_real32 *)&tmp;
+}
+
+// Routines that we still need to implement in assembly.
+extern kmp_int8 __kmp_test_then_add8(volatile kmp_int8 *p, kmp_int8 v);
+extern kmp_int8 __kmp_test_then_or8(volatile kmp_int8 *p, kmp_int8 v);
+extern kmp_int8 __kmp_test_then_and8(volatile kmp_int8 *p, kmp_int8 v);
+extern kmp_int32 __kmp_test_then_add32(volatile kmp_int32 *p, kmp_int32 v);
+extern kmp_uint32 __kmp_test_then_or32(volatile kmp_uint32 *p, kmp_uint32 v);
+extern kmp_uint32 __kmp_test_then_and32(volatile kmp_uint32 *p, kmp_uint32 v);
+extern kmp_int64 __kmp_test_then_add64(volatile kmp_int64 *p, kmp_int64 v);
+extern kmp_uint64 __kmp_test_then_or64(volatile kmp_uint64 *p, kmp_uint64 v);
+extern kmp_uint64 __kmp_test_then_and64(volatile kmp_uint64 *p, kmp_uint64 v);
+
+extern kmp_int8 __kmp_compare_and_store8(volatile kmp_int8 *p, kmp_int8 cv,
+                                         kmp_int8 sv);
+extern kmp_int16 __kmp_compare_and_store16(volatile kmp_int16 *p, kmp_int16 cv,
+                                           kmp_int16 sv);
+extern kmp_int32 __kmp_compare_and_store32(volatile kmp_int32 *p, kmp_int32 cv,
+                                           kmp_int32 sv);
+extern kmp_int32 __kmp_compare_and_store64(volatile kmp_int64 *p, kmp_int64 cv,
+                                           kmp_int64 sv);
+extern kmp_int8 __kmp_compare_and_store_ret8(volatile kmp_int8 *p, kmp_int8 cv,
+                                             kmp_int8 sv);
+extern kmp_int16 __kmp_compare_and_store_ret16(volatile kmp_int16 *p,
+                                               kmp_int16 cv, kmp_int16 sv);
+extern kmp_int32 __kmp_compare_and_store_ret32(volatile kmp_int32 *p,
+                                               kmp_int32 cv, kmp_int32 sv);
+extern kmp_int64 __kmp_compare_and_store_ret64(volatile kmp_int64 *p,
+                                               kmp_int64 cv, kmp_int64 sv);
+
+extern kmp_int8 __kmp_xchg_fixed8(volatile kmp_int8 *p, kmp_int8 v);
+extern kmp_int16 __kmp_xchg_fixed16(volatile kmp_int16 *p, kmp_int16 v);
+extern kmp_int32 __kmp_xchg_fixed32(volatile kmp_int32 *p, kmp_int32 v);
+extern kmp_int64 __kmp_xchg_fixed64(volatile kmp_int64 *p, kmp_int64 v);
+extern kmp_real32 __kmp_xchg_real32(volatile kmp_real32 *p, kmp_real32 v);
+extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v);
+
+//#define KMP_TEST_THEN_INC32(p) __kmp_test_then_add32((p), 1)
+//#define KMP_TEST_THEN_INC_ACQ32(p) __kmp_test_then_add32((p), 1)
+#define KMP_TEST_THEN_INC64(p) __kmp_test_then_add64((p), 1LL)
+#define KMP_TEST_THEN_INC_ACQ64(p) __kmp_test_then_add64((p), 1LL)
+//#define KMP_TEST_THEN_ADD4_32(p) __kmp_test_then_add32((p), 4)
+//#define KMP_TEST_THEN_ADD4_ACQ32(p) __kmp_test_then_add32((p), 4)
+#define KMP_TEST_THEN_ADD4_64(p) __kmp_test_then_add64((p), 4LL)
+#define KMP_TEST_THEN_ADD4_ACQ64(p) __kmp_test_then_add64((p), 4LL)
+//#define KMP_TEST_THEN_DEC32(p) __kmp_test_then_add32((p), -1)
+//#define KMP_TEST_THEN_DEC_ACQ32(p) __kmp_test_then_add32((p), -1)
+#define KMP_TEST_THEN_DEC64(p) __kmp_test_then_add64((p), -1LL)
+#define KMP_TEST_THEN_DEC_ACQ64(p) __kmp_test_then_add64((p), -1LL)
+//#define KMP_TEST_THEN_ADD32(p, v) __kmp_test_then_add32((p), (v))
+#define KMP_TEST_THEN_ADD8(p, v) __kmp_test_then_add8((p), (v))
+#define KMP_TEST_THEN_ADD64(p, v) __kmp_test_then_add64((p), (v))
+
+#define KMP_TEST_THEN_OR8(p, v) __kmp_test_then_or8((p), (v))
+#define KMP_TEST_THEN_AND8(p, v) __kmp_test_then_and8((p), (v))
+#define KMP_TEST_THEN_OR32(p, v) __kmp_test_then_or32((p), (v))
+#define KMP_TEST_THEN_AND32(p, v) __kmp_test_then_and32((p), (v))
+#define KMP_TEST_THEN_OR64(p, v) __kmp_test_then_or64((p), (v))
+#define KMP_TEST_THEN_AND64(p, v) __kmp_test_then_and64((p), (v))
+
+#define KMP_COMPARE_AND_STORE_ACQ8(p, cv, sv)                                  \
+  __kmp_compare_and_store8((p), (cv), (sv))
+#define KMP_COMPARE_AND_STORE_REL8(p, cv, sv)                                  \
+  __kmp_compare_and_store8((p), (cv), (sv))
+#define KMP_COMPARE_AND_STORE_ACQ16(p, cv, sv)                                 \
+  __kmp_compare_and_store16((p), (cv), (sv))
+#define KMP_COMPARE_AND_STORE_REL16(p, cv, sv)                                 \
+  __kmp_compare_and_store16((p), (cv), (sv))
+#define KMP_COMPARE_AND_STORE_ACQ32(p, cv, sv)                                 \
+  __kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv),        \
+                            (kmp_int32)(sv))
+#define KMP_COMPARE_AND_STORE_REL32(p, cv, sv)                                 \
+  __kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv),        \
+                            (kmp_int32)(sv))
+#define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv)                                 \
+  __kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv),        \
+                            (kmp_int64)(sv))
+#define KMP_COMPARE_AND_STORE_REL64(p, cv, sv)                                 \
+  __kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv),        \
+                            (kmp_int64)(sv))
+
+#if KMP_ARCH_X86
+#define KMP_COMPARE_AND_STORE_PTR(p, cv, sv)                                   \
+  __kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv),        \
+                            (kmp_int32)(sv))
+#else /* 64 bit pointers */
+#define KMP_COMPARE_AND_STORE_PTR(p, cv, sv)                                   \
+  __kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv),        \
+                            (kmp_int64)(sv))
+#endif /* KMP_ARCH_X86 */
+
+#define KMP_COMPARE_AND_STORE_RET8(p, cv, sv)                                  \
+  __kmp_compare_and_store_ret8((p), (cv), (sv))
+#define KMP_COMPARE_AND_STORE_RET16(p, cv, sv)                                 \
+  __kmp_compare_and_store_ret16((p), (cv), (sv))
+#define KMP_COMPARE_AND_STORE_RET64(p, cv, sv)                                 \
+  __kmp_compare_and_store_ret64((volatile kmp_int64 *)(p), (kmp_int64)(cv),    \
+                                (kmp_int64)(sv))
+
+// KMP_XCHG_FIXED32, KMP_XCHG_FIXED64 and KMP_XCHG_REAL32 are defined above in
+// terms of InterlockedExchange; the rest map to the assembly routines. The
+// macros are plain expressions (no trailing ';') so the result can be used.
+#define KMP_XCHG_FIXED8(p, v)                                                  \
+  __kmp_xchg_fixed8((volatile kmp_int8 *)(p), (kmp_int8)(v))
+#define KMP_XCHG_FIXED16(p, v) __kmp_xchg_fixed16((p), (v))
+#define KMP_XCHG_REAL64(p, v) __kmp_xchg_real64((p), (v))
+
+#elif (KMP_ASM_INTRINS && KMP_OS_UNIX) || !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
+
+/* cast p to correct type so that proper intrinsic will be used */
+#define KMP_TEST_THEN_INC32(p)                                                 \
+  __sync_fetch_and_add((volatile kmp_int32 *)(p), 1)
+#define KMP_TEST_THEN_INC_ACQ32(p)                                             \
+  __sync_fetch_and_add((volatile kmp_int32 *)(p), 1)
+#if KMP_ARCH_MIPS
+#define KMP_TEST_THEN_INC64(p)                                                 \
+  __atomic_fetch_add((volatile kmp_int64 *)(p), 1LL, __ATOMIC_SEQ_CST)
+#define KMP_TEST_THEN_INC_ACQ64(p)                                             \
+  __atomic_fetch_add((volatile kmp_int64 *)(p), 1LL, __ATOMIC_SEQ_CST)
+#else
+#define KMP_TEST_THEN_INC64(p)                                                 \
+  __sync_fetch_and_add((volatile kmp_int64 *)(p), 1LL)
+#define KMP_TEST_THEN_INC_ACQ64(p)                                             \
+  __sync_fetch_and_add((volatile kmp_int64 *)(p), 1LL)
+#endif
+#define KMP_TEST_THEN_ADD4_32(p)                                               \
+  __sync_fetch_and_add((volatile kmp_int32 *)(p), 4)
+#define KMP_TEST_THEN_ADD4_ACQ32(p)                                            \
+  __sync_fetch_and_add((volatile kmp_int32 *)(p), 4)
+#if KMP_ARCH_MIPS
+#define KMP_TEST_THEN_ADD4_64(p)                                               \
+  __atomic_fetch_add((volatile kmp_int64 *)(p), 4LL, __ATOMIC_SEQ_CST)
+#define KMP_TEST_THEN_ADD4_ACQ64(p)                                            \
+  __atomic_fetch_add((volatile kmp_int64 *)(p), 4LL, __ATOMIC_SEQ_CST)
+#define KMP_TEST_THEN_DEC64(p)                                                 \
+  __atomic_fetch_sub((volatile kmp_int64 *)(p), 1LL, __ATOMIC_SEQ_CST)
+#define KMP_TEST_THEN_DEC_ACQ64(p)                                             \
+  __atomic_fetch_sub((volatile kmp_int64 *)(p), 1LL, __ATOMIC_SEQ_CST)
+#else
+#define KMP_TEST_THEN_ADD4_64(p)                                               \
+  __sync_fetch_and_add((volatile kmp_int64 *)(p), 4LL)
+#define KMP_TEST_THEN_ADD4_ACQ64(p)                                            \
+  __sync_fetch_and_add((volatile kmp_int64 *)(p), 4LL)
+#define KMP_TEST_THEN_DEC64(p)                                                 \
+  __sync_fetch_and_sub((volatile kmp_int64 *)(p), 1LL)
+#define KMP_TEST_THEN_DEC_ACQ64(p)                                             \
+  __sync_fetch_and_sub((volatile kmp_int64 *)(p), 1LL)
+#endif
+#define KMP_TEST_THEN_DEC32(p)                                                 \
+  __sync_fetch_and_sub((volatile kmp_int32 *)(p), 1)
+#define KMP_TEST_THEN_DEC_ACQ32(p)                                             \
+  __sync_fetch_and_sub((volatile kmp_int32 *)(p), 1)
+#define KMP_TEST_THEN_ADD8(p, v)                                               \
+  __sync_fetch_and_add((volatile kmp_int8 *)(p), (kmp_int8)(v))
+#define KMP_TEST_THEN_ADD32(p, v)                                              \
+  __sync_fetch_and_add((volatile kmp_int32 *)(p), (kmp_int32)(v))
+#if KMP_ARCH_MIPS
+#define KMP_TEST_THEN_ADD64(p, v)                                              \
+  __atomic_fetch_add((volatile kmp_uint64 *)(p), (kmp_uint64)(v),              \
+                     __ATOMIC_SEQ_CST)
+#else
+#define KMP_TEST_THEN_ADD64(p, v)                                              \
+  __sync_fetch_and_add((volatile kmp_int64 *)(p), (kmp_int64)(v))
+#endif
+
+#define KMP_TEST_THEN_OR8(p, v)                                                \
+  __sync_fetch_and_or((volatile kmp_int8 *)(p), (kmp_int8)(v))
+#define KMP_TEST_THEN_AND8(p, v)                                               \
+  __sync_fetch_and_and((volatile kmp_int8 *)(p), (kmp_int8)(v))
+#define KMP_TEST_THEN_OR32(p, v)                                               \
+  __sync_fetch_and_or((volatile kmp_uint32 *)(p), (kmp_uint32)(v))
+#define KMP_TEST_THEN_AND32(p, v)                                              \
+  __sync_fetch_and_and((volatile kmp_uint32 *)(p), (kmp_uint32)(v))
+#if KMP_ARCH_MIPS
+#define KMP_TEST_THEN_OR64(p, v)                                               \
+  __atomic_fetch_or((volatile kmp_uint64 *)(p), (kmp_uint64)(v),               \
+                    __ATOMIC_SEQ_CST)
+#define KMP_TEST_THEN_AND64(p, v)                                              \
+  __atomic_fetch_and((volatile kmp_uint64 *)(p), (kmp_uint64)(v),              \
+                     __ATOMIC_SEQ_CST)
+#else
+#define KMP_TEST_THEN_OR64(p, v)                                               \
+  __sync_fetch_and_or((volatile kmp_uint64 *)(p), (kmp_uint64)(v))
+#define KMP_TEST_THEN_AND64(p, v)                                              \
+  __sync_fetch_and_and((volatile kmp_uint64 *)(p), (kmp_uint64)(v))
+#endif
+
+#define KMP_COMPARE_AND_STORE_ACQ8(p, cv, sv)                                  \
+  __sync_bool_compare_and_swap((volatile kmp_uint8 *)(p), (kmp_uint8)(cv),     \
+                               (kmp_uint8)(sv))
+#define KMP_COMPARE_AND_STORE_REL8(p, cv, sv)                                  \
+  __sync_bool_compare_and_swap((volatile kmp_uint8 *)(p), (kmp_uint8)(cv),     \
+                               (kmp_uint8)(sv))
+#define KMP_COMPARE_AND_STORE_ACQ16(p, cv, sv)                                 \
+  __sync_bool_compare_and_swap((volatile kmp_uint16 *)(p), (kmp_uint16)(cv),   \
+                               (kmp_uint16)(sv))
+#define KMP_COMPARE_AND_STORE_REL16(p, cv, sv)                                 \
+  __sync_bool_compare_and_swap((volatile kmp_uint16 *)(p), (kmp_uint16)(cv),   \
+                               (kmp_uint16)(sv))
+#define KMP_COMPARE_AND_STORE_ACQ32(p, cv, sv)                                 \
+  __sync_bool_compare_and_swap((volatile kmp_uint32 *)(p), (kmp_uint32)(cv),   \
+                               (kmp_uint32)(sv))
+#define KMP_COMPARE_AND_STORE_REL32(p, cv, sv)                                 \
+  __sync_bool_compare_and_swap((volatile kmp_uint32 *)(p), (kmp_uint32)(cv),   \
+                               (kmp_uint32)(sv))
+#define KMP_COMPARE_AND_STORE_PTR(p, cv, sv)                                   \
+  __sync_bool_compare_and_swap((void *volatile *)(p), (void *)(cv),            \
+                               (void *)(sv))
+
+#define KMP_COMPARE_AND_STORE_RET8(p, cv, sv)                                  \
+  __sync_val_compare_and_swap((volatile kmp_uint8 *)(p), (kmp_uint8)(cv),      \
+                              (kmp_uint8)(sv))
+#define KMP_COMPARE_AND_STORE_RET16(p, cv, sv)                                 \
+  __sync_val_compare_and_swap((volatile kmp_uint16 *)(p), (kmp_uint16)(cv),    \
+                              (kmp_uint16)(sv))
+#define KMP_COMPARE_AND_STORE_RET32(p, cv, sv)                                 \
+  __sync_val_compare_and_swap((volatile kmp_uint32 *)(p), (kmp_uint32)(cv),    \
+                              (kmp_uint32)(sv))
+#if KMP_ARCH_MIPS
+static inline bool mips_sync_bool_compare_and_swap( // true iff sv was stored
+  volatile kmp_uint64 *p, kmp_uint64 cv, kmp_uint64 sv) {
+  return __atomic_compare_exchange(p, &cv, &sv, false, __ATOMIC_SEQ_CST,
+                                                       __ATOMIC_SEQ_CST);
+}
+static inline kmp_uint64 mips_sync_val_compare_and_swap( // returns old *p
+  volatile kmp_uint64 *p, kmp_uint64 cv, kmp_uint64 sv) {
+  __atomic_compare_exchange(p, &cv, &sv, false, __ATOMIC_SEQ_CST,
+                                                __ATOMIC_SEQ_CST);
+  return cv; // value found in *p: the builtin rewrites cv when the CAS fails
+}
+#define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv)                                 \
+  mips_sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv),\
+                               (kmp_uint64)(sv))
+#define KMP_COMPARE_AND_STORE_REL64(p, cv, sv)                                 \
+  mips_sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv),\
+                               (kmp_uint64)(sv))
+#define KMP_COMPARE_AND_STORE_RET64(p, cv, sv)                                 \
+  mips_sync_val_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv), \
+                              (kmp_uint64)(sv))
+#else
+#define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv)                                 \
+  __sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv),   \
+                               (kmp_uint64)(sv))
+#define KMP_COMPARE_AND_STORE_REL64(p, cv, sv)                                 \
+  __sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv),   \
+                               (kmp_uint64)(sv))
+#define KMP_COMPARE_AND_STORE_RET64(p, cv, sv)                                 \
+  __sync_val_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv),    \
+                              (kmp_uint64)(sv))
+#endif
+
+#define KMP_XCHG_FIXED8(p, v)                                                  \
+  __sync_lock_test_and_set((volatile kmp_uint8 *)(p), (kmp_uint8)(v))
+#define KMP_XCHG_FIXED16(p, v)                                                 \
+  __sync_lock_test_and_set((volatile kmp_uint16 *)(p), (kmp_uint16)(v))
+#define KMP_XCHG_FIXED32(p, v)                                                 \
+  __sync_lock_test_and_set((volatile kmp_uint32 *)(p), (kmp_uint32)(v))
+#define KMP_XCHG_FIXED64(p, v)                                                 \
+  __sync_lock_test_and_set((volatile kmp_uint64 *)(p), (kmp_uint64)(v))
+
+inline kmp_real32 KMP_XCHG_REAL32(volatile kmp_real32 *p, kmp_real32 v) {
+  kmp_int32 tmp =
+      __sync_lock_test_and_set((volatile kmp_uint32 *)(p), *(kmp_uint32 *)&v);
+  return *(kmp_real32 *)&tmp;
+}
+
+inline kmp_real64 KMP_XCHG_REAL64(volatile kmp_real64 *p, kmp_real64 v) {
+  kmp_int64 tmp =
+      __sync_lock_test_and_set((volatile kmp_uint64 *)(p), *(kmp_uint64 *)&v);
+  return *(kmp_real64 *)&tmp;
+}
+
+#else
+
+extern kmp_int8 __kmp_test_then_add8(volatile kmp_int8 *p, kmp_int8 v);
+extern kmp_int8 __kmp_test_then_or8(volatile kmp_int8 *p, kmp_int8 v);
+extern kmp_int8 __kmp_test_then_and8(volatile kmp_int8 *p, kmp_int8 v);
+extern kmp_int32 __kmp_test_then_add32(volatile kmp_int32 *p, kmp_int32 v);
+extern kmp_uint32 __kmp_test_then_or32(volatile kmp_uint32 *p, kmp_uint32 v);
+extern kmp_uint32 __kmp_test_then_and32(volatile kmp_uint32 *p, kmp_uint32 v);
+extern kmp_int64 __kmp_test_then_add64(volatile kmp_int64 *p, kmp_int64 v);
+extern kmp_uint64 __kmp_test_then_or64(volatile kmp_uint64 *p, kmp_uint64 v);
+extern kmp_uint64 __kmp_test_then_and64(volatile kmp_uint64 *p, kmp_uint64 v);
+
+extern kmp_int8 __kmp_compare_and_store8(volatile kmp_int8 *p, kmp_int8 cv,
+                                         kmp_int8 sv);
+extern kmp_int16 __kmp_compare_and_store16(volatile kmp_int16 *p, kmp_int16 cv,
+                                           kmp_int16 sv);
+extern kmp_int32 __kmp_compare_and_store32(volatile kmp_int32 *p, kmp_int32 cv,
+                                           kmp_int32 sv);
+extern kmp_int32 __kmp_compare_and_store64(volatile kmp_int64 *p, kmp_int64 cv,
+                                           kmp_int64 sv);
+extern kmp_int8 __kmp_compare_and_store_ret8(volatile kmp_int8 *p, kmp_int8 cv,
+                                             kmp_int8 sv);
+extern kmp_int16 __kmp_compare_and_store_ret16(volatile kmp_int16 *p,
+                                               kmp_int16 cv, kmp_int16 sv);
+extern kmp_int32 __kmp_compare_and_store_ret32(volatile kmp_int32 *p,
+                                               kmp_int32 cv, kmp_int32 sv);
+extern kmp_int64 __kmp_compare_and_store_ret64(volatile kmp_int64 *p,
+                                               kmp_int64 cv, kmp_int64 sv);
+
+extern kmp_int8 __kmp_xchg_fixed8(volatile kmp_int8 *p, kmp_int8 v);
+extern kmp_int16 __kmp_xchg_fixed16(volatile kmp_int16 *p, kmp_int16 v);
+extern kmp_int32 __kmp_xchg_fixed32(volatile kmp_int32 *p, kmp_int32 v);
+extern kmp_int64 __kmp_xchg_fixed64(volatile kmp_int64 *p, kmp_int64 v);
+extern kmp_real32 __kmp_xchg_real32(volatile kmp_real32 *p, kmp_real32 v);
+extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v);
+
+#define KMP_TEST_THEN_INC32(p)                                                 \
+  __kmp_test_then_add32((volatile kmp_int32 *)(p), 1)
+#define KMP_TEST_THEN_INC_ACQ32(p)                                             \
+  __kmp_test_then_add32((volatile kmp_int32 *)(p), 1)
+#define KMP_TEST_THEN_INC64(p)                                                 \
+  __kmp_test_then_add64((volatile kmp_int64 *)(p), 1LL)
+#define KMP_TEST_THEN_INC_ACQ64(p)                                             \
+  __kmp_test_then_add64((volatile kmp_int64 *)(p), 1LL)
+#define KMP_TEST_THEN_ADD4_32(p)                                               \
+  __kmp_test_then_add32((volatile kmp_int32 *)(p), 4)
+#define KMP_TEST_THEN_ADD4_ACQ32(p)                                            \
+  __kmp_test_then_add32((volatile kmp_int32 *)(p), 4)
+#define KMP_TEST_THEN_ADD4_64(p)                                               \
+  __kmp_test_then_add64((volatile kmp_int64 *)(p), 4LL)
+#define KMP_TEST_THEN_ADD4_ACQ64(p)                                            \
+  __kmp_test_then_add64((volatile kmp_int64 *)(p), 4LL)
+#define KMP_TEST_THEN_DEC32(p)                                                 \
+  __kmp_test_then_add32((volatile kmp_int32 *)(p), -1)
+#define KMP_TEST_THEN_DEC_ACQ32(p)                                             \
+  __kmp_test_then_add32((volatile kmp_int32 *)(p), -1)
+#define KMP_TEST_THEN_DEC64(p)                                                 \
+  __kmp_test_then_add64((volatile kmp_int64 *)(p), -1LL)
+#define KMP_TEST_THEN_DEC_ACQ64(p)                                             \
+  __kmp_test_then_add64((volatile kmp_int64 *)(p), -1LL)
+#define KMP_TEST_THEN_ADD8(p, v)                                               \
+  __kmp_test_then_add8((volatile kmp_int8 *)(p), (kmp_int8)(v))
+#define KMP_TEST_THEN_ADD32(p, v)                                              \
+  __kmp_test_then_add32((volatile kmp_int32 *)(p), (kmp_int32)(v))
+#define KMP_TEST_THEN_ADD64(p, v)                                              \
+  __kmp_test_then_add64((volatile kmp_int64 *)(p), (kmp_int64)(v))
+
+#define KMP_TEST_THEN_OR8(p, v)                                                \
+  __kmp_test_then_or8((volatile kmp_int8 *)(p), (kmp_int8)(v))
+#define KMP_TEST_THEN_AND8(p, v)                                               \
+  __kmp_test_then_and8((volatile kmp_int8 *)(p), (kmp_int8)(v))
+#define KMP_TEST_THEN_OR32(p, v)                                               \
+  __kmp_test_then_or32((volatile kmp_uint32 *)(p), (kmp_uint32)(v))
+#define KMP_TEST_THEN_AND32(p, v)                                              \
+  __kmp_test_then_and32((volatile kmp_uint32 *)(p), (kmp_uint32)(v))
+#define KMP_TEST_THEN_OR64(p, v)                                               \
+  __kmp_test_then_or64((volatile kmp_uint64 *)(p), (kmp_uint64)(v))
+#define KMP_TEST_THEN_AND64(p, v)                                              \
+  __kmp_test_then_and64((volatile kmp_uint64 *)(p), (kmp_uint64)(v))
+
+#define KMP_COMPARE_AND_STORE_ACQ8(p, cv, sv)                                  \
+  __kmp_compare_and_store8((volatile kmp_int8 *)(p), (kmp_int8)(cv),           \
+                           (kmp_int8)(sv))
+#define KMP_COMPARE_AND_STORE_REL8(p, cv, sv)                                  \
+  __kmp_compare_and_store8((volatile kmp_int8 *)(p), (kmp_int8)(cv),           \
+                           (kmp_int8)(sv))
+#define KMP_COMPARE_AND_STORE_ACQ16(p, cv, sv)                                 \
+  __kmp_compare_and_store16((volatile kmp_int16 *)(p), (kmp_int16)(cv),        \
+                            (kmp_int16)(sv))
+#define KMP_COMPARE_AND_STORE_REL16(p, cv, sv)                                 \
+  __kmp_compare_and_store16((volatile kmp_int16 *)(p), (kmp_int16)(cv),        \
+                            (kmp_int16)(sv))
+#define KMP_COMPARE_AND_STORE_ACQ32(p, cv, sv)                                 \
+  __kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv),        \
+                            (kmp_int32)(sv))
+#define KMP_COMPARE_AND_STORE_REL32(p, cv, sv)                                 \
+  __kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv),        \
+                            (kmp_int32)(sv))
+#define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv)                                 \
+  __kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv),        \
+                            (kmp_int64)(sv))
+#define KMP_COMPARE_AND_STORE_REL64(p, cv, sv)                                 \
+  __kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv),        \
+                            (kmp_int64)(sv))
+
+#if KMP_ARCH_X86
+#define KMP_COMPARE_AND_STORE_PTR(p, cv, sv)                                   \
+  __kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv),        \
+                            (kmp_int32)(sv))
+#else /* 64 bit pointers */
+#define KMP_COMPARE_AND_STORE_PTR(p, cv, sv)                                   \
+  __kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv),        \
+                            (kmp_int64)(sv))
+#endif /* KMP_ARCH_X86 */
+
+#define KMP_COMPARE_AND_STORE_RET8(p, cv, sv)                                  \
+  __kmp_compare_and_store_ret8((p), (cv), (sv))
+#define KMP_COMPARE_AND_STORE_RET16(p, cv, sv)                                 \
+  __kmp_compare_and_store_ret16((p), (cv), (sv))
+#define KMP_COMPARE_AND_STORE_RET32(p, cv, sv)                                 \
+  __kmp_compare_and_store_ret32((volatile kmp_int32 *)(p), (kmp_int32)(cv),    \
+                                (kmp_int32)(sv))
+#define KMP_COMPARE_AND_STORE_RET64(p, cv, sv)                                 \
+  __kmp_compare_and_store_ret64((volatile kmp_int64 *)(p), (kmp_int64)(cv),    \
+                                (kmp_int64)(sv))
+// NB: the exchange wrappers expand to plain expressions; callers add the ';'.
+#define KMP_XCHG_FIXED8(p, v)                                                  \
+  __kmp_xchg_fixed8((volatile kmp_int8 *)(p), (kmp_int8)(v))
+#define KMP_XCHG_FIXED16(p, v) __kmp_xchg_fixed16((p), (v))
+#define KMP_XCHG_FIXED32(p, v) __kmp_xchg_fixed32((p), (v))
+#define KMP_XCHG_FIXED64(p, v) __kmp_xchg_fixed64((p), (v))
+#define KMP_XCHG_REAL32(p, v) __kmp_xchg_real32((p), (v))
+#define KMP_XCHG_REAL64(p, v) __kmp_xchg_real64((p), (v))
+
+#endif /* KMP_ASM_INTRINS */
+
+/* ------------- relaxed consistency memory model stuff ------------------ */
+// KMP_MB() / KMP_IMB() stay empty unless a branch below gives them a body.
+#if KMP_OS_WINDOWS
+#ifdef __ABSOFT_WIN
+#define KMP_MB() asm("nop")
+#define KMP_IMB() asm("nop")
+#else /* !__ABSOFT_WIN: defined here, but with an empty expansion */
+#define KMP_MB() /* _asm{ nop } */
+#define KMP_IMB() /* _asm{ nop } */
+#endif /* __ABSOFT_WIN */
+#endif /* KMP_OS_WINDOWS */
+
+#if KMP_ARCH_PPC64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS ||     \
+    KMP_ARCH_MIPS64
+#define KMP_MB() __sync_synchronize()
+#endif
+
+#ifndef KMP_MB
+#define KMP_MB() /* fallback when no branch above defined it: nothing to do */
+#endif
+
+#ifndef KMP_IMB
+#define KMP_IMB() /* fallback when no branch above defined it: nothing to do */
+#endif
+
+#ifndef KMP_ST_REL32
+#define KMP_ST_REL32(A, D) (*(A) = (D))
+#endif
+
+#ifndef KMP_ST_REL64
+#define KMP_ST_REL64(A, D) (*(A) = (D))
+#endif
+
+#ifndef KMP_LD_ACQ32
+#define KMP_LD_ACQ32(A) (*(A))
+#endif
+
+#ifndef KMP_LD_ACQ64
+#define KMP_LD_ACQ64(A) (*(A))
+#endif
+
+/* ------------------------------------------------------------------------ */
+// FIXME - maybe this should be
+//
+// #define TCR_4(a)    (*(volatile kmp_int32 *)(&a))
+// #define TCW_4(a,b)  (a) = (*(volatile kmp_int32 *)&(b))
+//
+// #define TCR_8(a)    (*(volatile kmp_int64 *)(a))
+// #define TCW_8(a,b)  (a) = (*(volatile kmp_int64 *)(&b))
+//
+// I'm fairly certain this is the correct thing to do, but I'm afraid
+// of performance regressions.
+
+#define TCR_1(a) (a) // TCR_*: plain read of a
+#define TCW_1(a, b) ((a) = (b)) // TCW_*: plain store, one parenthesized expr
+#define TCR_4(a) (a)
+#define TCW_4(a, b) ((a) = (b))
+#define TCI_4(a) (++(a)) // TCI_*: pre-increment
+#define TCD_4(a) (--(a)) // TCD_*: pre-decrement
+#define TCR_8(a) (a)
+#define TCW_8(a, b) ((a) = (b))
+#define TCI_8(a) (++(a))
+#define TCD_8(a) (--(a))
+#define TCR_SYNC_4(a) (a)
+#define TCW_SYNC_4(a, b) ((a) = (b))
+#define TCX_SYNC_4(a, b, c)                                                    \
+  KMP_COMPARE_AND_STORE_REL32((volatile kmp_int32 *)(volatile void *)&(a),     \
+                              (kmp_int32)(b), (kmp_int32)(c))
+#define TCR_SYNC_8(a) (a)
+#define TCW_SYNC_8(a, b) ((a) = (b))
+#define TCX_SYNC_8(a, b, c)                                                    \
+  KMP_COMPARE_AND_STORE_REL64((volatile kmp_int64 *)(volatile void *)&(a),     \
+                              (kmp_int64)(b), (kmp_int64)(c))
+
+#if KMP_ARCH_X86 || KMP_ARCH_MIPS
+// Pointer accessors map onto the 4-byte macros here. What about ARM?
+#define TCR_PTR(a) ((void *)TCR_4(a))
+#define TCW_PTR(a, b) TCW_4((a), (b))
+#define TCR_SYNC_PTR(a) ((void *)TCR_SYNC_4(a))
+#define TCW_SYNC_PTR(a, b) TCW_SYNC_4((a), (b))
+#define TCX_SYNC_PTR(a, b, c) ((void *)TCX_SYNC_4((a), (b), (c)))
+
+#else /* 64 bit pointers: map onto the 8-byte macros */
+
+#define TCR_PTR(a) ((void *)TCR_8(a))
+#define TCW_PTR(a, b) TCW_8((a), (b))
+#define TCR_SYNC_PTR(a) ((void *)TCR_SYNC_8(a))
+#define TCW_SYNC_PTR(a, b) TCW_SYNC_8((a), (b))
+#define TCX_SYNC_PTR(a, b, c) ((void *)TCX_SYNC_8((a), (b), (c)))
+
+#endif /* KMP_ARCH_X86 || KMP_ARCH_MIPS */
+
+/* If these FTN_{TRUE,FALSE} values change, may need to change several places
+   where they are used to check that language is Fortran, not C. */
+
+#ifndef FTN_TRUE
+#define FTN_TRUE TRUE
+#endif
+
+#ifndef FTN_FALSE
+#define FTN_FALSE FALSE
+#endif
+
+typedef void (*microtask_t)(int *gtid, int *npr, ...);
+
+#ifdef USE_VOLATILE_CAST
+#define VOLATILE_CAST(x) (volatile x)
+#else
+#define VOLATILE_CAST(x) (x)
+#endif
+
+#define KMP_WAIT __kmp_wait_4
+#define KMP_WAIT_PTR __kmp_wait_4_ptr
+#define KMP_EQ __kmp_eq_4
+#define KMP_NEQ __kmp_neq_4
+#define KMP_LT __kmp_lt_4
+#define KMP_GE __kmp_ge_4
+#define KMP_LE __kmp_le_4
+
+/* Workaround for Intel(R) 64 code gen bug when taking address of static array
+ * (Intel(R) 64 Tracker #138) */
+#if (KMP_ARCH_X86_64 || KMP_ARCH_PPC64) && KMP_OS_LINUX
+#define STATIC_EFI2_WORKAROUND
+#else
+#define STATIC_EFI2_WORKAROUND static
+#endif
+
+// Support of BGET usage
+#ifndef KMP_USE_BGET
+#define KMP_USE_BGET 1
+#endif
+
+// Switches for OSS builds
+#ifndef USE_CMPXCHG_FIX
+#define USE_CMPXCHG_FIX 1
+#endif
+
+// Enable dynamic user lock
+#define KMP_USE_DYNAMIC_LOCK 1
+
+// Enable Intel(R) Transactional Synchronization Extensions (Intel(R) TSX) if
+// dynamic user lock is turned on (x86 / x86_64 only, and not with MSVC)
+#if KMP_USE_DYNAMIC_LOCK
+// Visual studio can't handle the asm sections; keep the outer parentheses
+#define KMP_USE_TSX ((KMP_ARCH_X86 || KMP_ARCH_X86_64) && !KMP_COMPILER_MSVC)
+#ifdef KMP_USE_ADAPTIVE_LOCKS
+#undef KMP_USE_ADAPTIVE_LOCKS
+#endif
+#define KMP_USE_ADAPTIVE_LOCKS KMP_USE_TSX
+#endif
+
+// Enable conversion of tick counts to seconds
+#if KMP_STATS_ENABLED
+#define KMP_HAVE_TICK_TIME                                                     \
+  (KMP_OS_LINUX && (KMP_MIC || KMP_ARCH_X86 || KMP_ARCH_X86_64))
+#endif
+
+// Warning levels (the enumerators evaluate to 0, 1, 6 and 7 respectively)
+enum kmp_warnings_level {
+  kmp_warnings_off = 0, /* No warnings */
+  kmp_warnings_low, /* = 1: minimal warnings (default) */
+  kmp_warnings_explicit = 6, /* Explicitly set to ON - more warnings */
+  kmp_warnings_verbose /* = 7: reserved */
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+// Macros for C++11 atomic functions
+#define KMP_ATOMIC_LD(p, order) (p)->load(std::memory_order_##order)
+#define KMP_ATOMIC_OP(op, p, v, order) (p)->op(v, std::memory_order_##order)
+
+// For non-default load/store
+#define KMP_ATOMIC_LD_ACQ(p) KMP_ATOMIC_LD(p, acquire)
+#define KMP_ATOMIC_LD_RLX(p) KMP_ATOMIC_LD(p, relaxed)
+#define KMP_ATOMIC_ST_REL(p, v) KMP_ATOMIC_OP(store, p, v, release)
+#define KMP_ATOMIC_ST_RLX(p, v) KMP_ATOMIC_OP(store, p, v, relaxed)
+
+// For non-default fetch_<op>
+#define KMP_ATOMIC_ADD(p, v) KMP_ATOMIC_OP(fetch_add, p, v, acq_rel)
+#define KMP_ATOMIC_SUB(p, v) KMP_ATOMIC_OP(fetch_sub, p, v, acq_rel)
+#define KMP_ATOMIC_AND(p, v) KMP_ATOMIC_OP(fetch_and, p, v, acq_rel)
+#define KMP_ATOMIC_OR(p, v) KMP_ATOMIC_OP(fetch_or, p, v, acq_rel)
+#define KMP_ATOMIC_INC(p) KMP_ATOMIC_OP(fetch_add, p, 1, acq_rel)
+#define KMP_ATOMIC_DEC(p) KMP_ATOMIC_OP(fetch_sub, p, 1, acq_rel)
+#define KMP_ATOMIC_ADD_RLX(p, v) KMP_ATOMIC_OP(fetch_add, p, v, relaxed)
+#define KMP_ATOMIC_INC_RLX(p) KMP_ATOMIC_OP(fetch_add, p, 1, relaxed)
+
+// Callers of the following functions cannot see the side effect on "expected".
+template <typename T> // strong CAS: acq_rel on success, relaxed on failure
+bool __kmp_atomic_compare_store(std::atomic<T> *p, T expected, T desired) {
+  const auto ok = std::memory_order_acq_rel, fail = std::memory_order_relaxed;
+  return p->compare_exchange_strong(expected, desired, ok, fail);
+}
+
+template <typename T> // strong CAS: acquire on success, relaxed on failure
+bool __kmp_atomic_compare_store_acq(std::atomic<T> *p, T expected, T desired) {
+  const auto ok = std::memory_order_acquire, fail = std::memory_order_relaxed;
+  return p->compare_exchange_strong(expected, desired, ok, fail);
+}
+
+template <typename T> // strong CAS: release on success, relaxed on failure
+bool __kmp_atomic_compare_store_rel(std::atomic<T> *p, T expected, T desired) {
+  const auto ok = std::memory_order_release, fail = std::memory_order_relaxed;
+  return p->compare_exchange_strong(expected, desired, ok, fail);
+}
+
+#endif /* KMP_OS_H */
+// Safe C API
+#include "kmp_safe_c_api.h"
diff --git a/final/runtime/src/kmp_platform.h b/final/runtime/src/kmp_platform.h
new file mode 100644
index 0000000..e4f2e06
--- /dev/null
+++ b/final/runtime/src/kmp_platform.h
@@ -0,0 +1,206 @@
+/*
+ * kmp_platform.h -- header for determining operating system and architecture
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_PLATFORM_H
+#define KMP_PLATFORM_H
+
+/* ---------------------- Operating system recognition ------------------- */
+
+#define KMP_OS_LINUX 0
+#define KMP_OS_DRAGONFLY 0
+#define KMP_OS_FREEBSD 0
+#define KMP_OS_NETBSD 0
+#define KMP_OS_OPENBSD 0
+#define KMP_OS_DARWIN 0
+#define KMP_OS_WINDOWS 0
+#define KMP_OS_CNK 0
+#define KMP_OS_HURD 0
+#define KMP_OS_UNIX 0 /* disjunction of KMP_OS_LINUX, KMP_OS_DARWIN etc. */
+
+#ifdef _WIN32
+#undef KMP_OS_WINDOWS
+#define KMP_OS_WINDOWS 1
+#endif
+
+#if (defined __APPLE__ && defined __MACH__)
+#undef KMP_OS_DARWIN
+#define KMP_OS_DARWIN 1
+#endif
+
+// some ppc64 Linux installations define only __linux__ (the second test below)
+#if (defined __linux)
+#undef KMP_OS_LINUX
+#define KMP_OS_LINUX 1
+#elif (defined __linux__)
+#undef KMP_OS_LINUX
+#define KMP_OS_LINUX 1
+#else
+#endif
+
+#if (defined __DragonFly__)
+#undef KMP_OS_DRAGONFLY
+#define KMP_OS_DRAGONFLY 1
+#endif
+
+#if (defined __FreeBSD__)
+#undef KMP_OS_FREEBSD
+#define KMP_OS_FREEBSD 1
+#endif
+
+#if (defined __NetBSD__)
+#undef KMP_OS_NETBSD
+#define KMP_OS_NETBSD 1
+#endif
+
+#if (defined __OpenBSD__)
+#undef KMP_OS_OPENBSD
+#define KMP_OS_OPENBSD 1
+#endif
+
+#if (defined __bgq__)
+#undef KMP_OS_CNK
+#define KMP_OS_CNK 1
+#endif
+
+#if (defined __GNU__)
+#undef KMP_OS_HURD
+#define KMP_OS_HURD 1
+#endif
+
+#if (1 !=                                                                      \
+     KMP_OS_LINUX + KMP_OS_DRAGONFLY + KMP_OS_FREEBSD + KMP_OS_NETBSD +        \
+         KMP_OS_OPENBSD + KMP_OS_DARWIN + KMP_OS_WINDOWS + KMP_OS_HURD)
+#error Unknown OS
+#endif
+
+#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
+        KMP_OS_OPENBSD || KMP_OS_DARWIN || KMP_OS_HURD
+#undef KMP_OS_UNIX
+#define KMP_OS_UNIX 1
+#endif
+
+/* ---------------------- Architecture recognition ------------------- */
+
+#define KMP_ARCH_X86 0
+#define KMP_ARCH_X86_64 0
+#define KMP_ARCH_AARCH64 0
+#define KMP_ARCH_PPC64_BE 0
+#define KMP_ARCH_PPC64_LE 0
+#define KMP_ARCH_PPC64 (KMP_ARCH_PPC64_LE || KMP_ARCH_PPC64_BE)
+#define KMP_ARCH_MIPS 0
+#define KMP_ARCH_MIPS64 0
+
+#if KMP_OS_WINDOWS
+#if defined(_M_AMD64) || defined(__x86_64)
+#undef KMP_ARCH_X86_64
+#define KMP_ARCH_X86_64 1
+#else
+#undef KMP_ARCH_X86
+#define KMP_ARCH_X86 1
+#endif
+#endif
+
+#if KMP_OS_UNIX
+#if defined __x86_64
+#undef KMP_ARCH_X86_64
+#define KMP_ARCH_X86_64 1
+#elif defined __i386
+#undef KMP_ARCH_X86
+#define KMP_ARCH_X86 1
+#elif defined __powerpc64__
+#if defined __LITTLE_ENDIAN__
+#undef KMP_ARCH_PPC64_LE
+#define KMP_ARCH_PPC64_LE 1
+#else
+#undef KMP_ARCH_PPC64_BE
+#define KMP_ARCH_PPC64_BE 1
+#endif
+#elif defined __aarch64__
+#undef KMP_ARCH_AARCH64
+#define KMP_ARCH_AARCH64 1
+#elif defined __mips__
+#if defined __mips64
+#undef KMP_ARCH_MIPS64
+#define KMP_ARCH_MIPS64 1
+#else
+#undef KMP_ARCH_MIPS
+#define KMP_ARCH_MIPS 1
+#endif
+#endif
+#endif
+
+#if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7R__) ||                     \
+    defined(__ARM_ARCH_7A__)
+#define KMP_ARCH_ARMV7 1
+#endif
+
+#if defined(KMP_ARCH_ARMV7) || defined(__ARM_ARCH_6__) ||                      \
+    defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) ||                    \
+    defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6T2__) ||                   \
+    defined(__ARM_ARCH_6ZK__)
+#define KMP_ARCH_ARMV6 1
+#endif
+
+#if defined(KMP_ARCH_ARMV6) || defined(__ARM_ARCH_5T__) ||                     \
+    defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) ||                   \
+    defined(__ARM_ARCH_5TEJ__)
+#define KMP_ARCH_ARMV5 1
+#endif
+
+#if defined(KMP_ARCH_ARMV5) || defined(__ARM_ARCH_4__) ||                      \
+    defined(__ARM_ARCH_4T__)
+#define KMP_ARCH_ARMV4 1
+#endif
+
+#if defined(KMP_ARCH_ARMV4) || defined(__ARM_ARCH_3__) ||                      \
+    defined(__ARM_ARCH_3M__)
+#define KMP_ARCH_ARMV3 1
+#endif
+
+#if defined(KMP_ARCH_ARMV3) || defined(__ARM_ARCH_2__)
+#define KMP_ARCH_ARMV2 1
+#endif
+
+#if defined(KMP_ARCH_ARMV2)
+#define KMP_ARCH_ARM 1
+#endif
+
+#if defined(__MIC__) || defined(__MIC2__)
+#define KMP_MIC 1
+#if __MIC2__ || __KNC__
+#define KMP_MIC1 0
+#define KMP_MIC2 1
+#else
+#define KMP_MIC1 1
+#define KMP_MIC2 0
+#endif
+#else
+#define KMP_MIC 0
+#define KMP_MIC1 0
+#define KMP_MIC2 0
+#endif
+
+/* Specify 32 bit architectures here */
+#define KMP_32_BIT_ARCH (KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS)
+
+// Platforms which support Intel(R) Many Integrated Core Architecture
+#define KMP_MIC_SUPPORTED                                                      \
+  ((KMP_ARCH_X86 || KMP_ARCH_X86_64) && (KMP_OS_LINUX || KMP_OS_WINDOWS))
+
+// TODO: Fixme - This is clever, but really fugly
+#if (1 !=                                                                      \
+     KMP_ARCH_X86 + KMP_ARCH_X86_64 + KMP_ARCH_ARM + KMP_ARCH_PPC64 +          \
+         KMP_ARCH_AARCH64 + KMP_ARCH_MIPS + KMP_ARCH_MIPS64)
+#error Unknown or unsupported architecture
+#endif
+
+#endif // KMP_PLATFORM_H
diff --git a/final/runtime/src/kmp_runtime.cpp b/final/runtime/src/kmp_runtime.cpp
new file mode 100644
index 0000000..7f6c149
--- /dev/null
+++ b/final/runtime/src/kmp_runtime.cpp
@@ -0,0 +1,8217 @@
+/*
+ * kmp_runtime.cpp -- KPTS runtime support library
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+#include "kmp_affinity.h"
+#include "kmp_atomic.h"
+#include "kmp_environment.h"
+#include "kmp_error.h"
+#include "kmp_i18n.h"
+#include "kmp_io.h"
+#include "kmp_itt.h"
+#include "kmp_settings.h"
+#include "kmp_stats.h"
+#include "kmp_str.h"
+#include "kmp_wait_release.h"
+#include "kmp_wrapper_getpid.h"
+#include "kmp_dispatch.h"
+#if KMP_USE_HIER_SCHED
+#include "kmp_dispatch_hier.h"
+#endif
+
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
+/* these are temporary issues to be dealt with */
+#define KMP_USE_PRCTL 0
+
+#if KMP_OS_WINDOWS
+#include <process.h>
+#endif
+
+#include "tsan_annotations.h"
+
+#if defined(KMP_GOMP_COMPAT)
+char const __kmp_version_alt_comp[] =
+    KMP_VERSION_PREFIX "alternative compiler support: yes";
+#endif /* defined(KMP_GOMP_COMPAT) */
+
+char const __kmp_version_omp_api[] =
+    KMP_VERSION_PREFIX "API version: 5.0 (201611)";
+
+#ifdef KMP_DEBUG
+char const __kmp_version_lock[] =
+    KMP_VERSION_PREFIX "lock type: run time selectable";
+#endif /* KMP_DEBUG */
+
+#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
+
+/* ------------------------------------------------------------------------ */
+
+#if KMP_USE_MONITOR
+kmp_info_t __kmp_monitor;
+#endif
+
+/* Forward declarations */
+
+void __kmp_cleanup(void);
+
+static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
+                                  int gtid);
+static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
+                                  kmp_internal_control_t *new_icvs,
+                                  ident_t *loc);
+#if KMP_AFFINITY_SUPPORTED
+static void __kmp_partition_places(kmp_team_t *team,
+                                   int update_master_only = 0);
+#endif
+static void __kmp_do_serial_initialize(void);
+void __kmp_fork_barrier(int gtid, int tid);
+void __kmp_join_barrier(int gtid);
+void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
+                          kmp_internal_control_t *new_icvs, ident_t *loc);
+
+#ifdef USE_LOAD_BALANCE
+static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
+#endif
+
+static int __kmp_expand_threads(int nNeed);
+#if KMP_OS_WINDOWS
+static int __kmp_unregister_root_other_thread(int gtid);
+#endif
+static void __kmp_unregister_library(void); // called by __kmp_internal_end()
+static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
+kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
+
+/* Return the calling thread's global thread id (gtid): __kmp_gtid or
+   __kmp_gtid_get_specific() if __kmp_gtid_mode allows, else a stack-address
+   search with a TLS fallback. Returns KMP_GTID_DNE if !__kmp_init_gtid. */
+int __kmp_get_global_thread_id() {
+  int i;
+  kmp_info_t **other_threads;
+  size_t stack_data; // only its address is used (see stack_addr below)
+  char *stack_addr;
+  size_t stack_size;
+  char *stack_base;
+
+  KA_TRACE(
+      1000,
+      ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
+       __kmp_nth, __kmp_all_nth));
+
+  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
+     a parallel region, made it return KMP_GTID_DNE to force serial_initialize
+     by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
+     __kmp_init_gtid for this to work. */
+
+  if (!TCR_4(__kmp_init_gtid))
+    return KMP_GTID_DNE;
+
+#ifdef KMP_TDATA_GTID
+  if (TCR_4(__kmp_gtid_mode) >= 3) {
+    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
+    return __kmp_gtid;
+  }
+#endif
+  if (TCR_4(__kmp_gtid_mode) >= 2) {
+    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
+    return __kmp_gtid_get_specific();
+  }
+  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
+
+  stack_addr = (char *)&stack_data; // address of a local of this very call
+  other_threads = __kmp_threads;
+
+  /* ATT: The code below is a source of potential bugs due to unsynchronized
+     access to __kmp_threads array. For example:
+     1. Current thread loads other_threads[i] to thr and checks it, it is
+        non-NULL.
+     2. Current thread is suspended by OS.
+     3. Another thread unregisters and finishes (debug versions of free()
+        may fill memory with something like 0xEF).
+     4. Current thread is resumed.
+     5. Current thread reads junk from *thr.
+     TODO: Fix it.  --ln  */
+
+  for (i = 0; i < __kmp_threads_capacity; i++) {
+
+    kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
+    if (!thr)
+      continue;
+
+    stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
+    stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
+
+    /* stack grows down -- search through all of the active threads */
+
+    if (stack_addr <= stack_base) {
+      size_t stack_diff = stack_base - stack_addr;
+
+      if (stack_diff <= stack_size) {
+        /* The only way we can be closer than the allocated */
+        /* stack size is if we are running on this thread. */
+        KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
+        return i;
+      }
+    }
+  }
+
+  /* get specific to try and determine our gtid */
+  KA_TRACE(1000,
+           ("*** __kmp_get_global_thread_id: internal alg. failed to find "
+            "thread, using TLS\n"));
+  i = __kmp_gtid_get_specific();
+
+  /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
+
+  /* if we haven't been assigned a gtid, return that negative code as is */
+  if (i < 0)
+    return i;
+
+  /* dynamically updated stack window for uber threads to avoid get_specific
+     call */
+  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
+    KMP_FATAL(StackOverflow, i);
+  }
+
+  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
+  if (stack_addr > stack_base) { // above recorded base: raise base, add the gap
+    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
+    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
+            other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
+                stack_base);
+  } else { // at or below the base: size becomes the distance down to here
+    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
+            stack_base - stack_addr);
+  }
+
+  /* Reprint stack bounds for ubermaster since they have been refined */
+  if (__kmp_storage_map) {
+    char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
+    char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
+    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
+                                 other_threads[i]->th.th_info.ds.ds_stacksize,
+                                 "th_%d stack (refinement)", i);
+  }
+  return i;
+}
+
+int __kmp_get_global_thread_id_reg() {
+  int gtid;
+  // Find the caller's gtid; if it has none (KMP_GTID_DNE), obtain one below.
+  if (!__kmp_init_serial) { // serial init not done: treat gtid as unassigned
+    gtid = KMP_GTID_DNE;
+  } else
+#ifdef KMP_TDATA_GTID
+      if (TCR_4(__kmp_gtid_mode) >= 3) {
+    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
+    gtid = __kmp_gtid;
+  } else
+#endif
+      if (TCR_4(__kmp_gtid_mode) >= 2) {
+    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
+    gtid = __kmp_gtid_get_specific();
+  } else {
+    KA_TRACE(1000,
+             ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
+    gtid = __kmp_get_global_thread_id();
+  }
+
+  /* we must be a new uber master sibling thread */
+  if (gtid == KMP_GTID_DNE) {
+    KA_TRACE(10,
+             ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
+              "Registering a new gtid.\n"));
+    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); // re-check under the lock
+    if (!__kmp_init_serial) {
+      __kmp_do_serial_initialize();
+      gtid = __kmp_gtid_get_specific(); // NOTE(review): presumably set by init
+    } else {
+      gtid = __kmp_register_root(FALSE);
+    }
+    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
+    /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
+  }
+
+  KMP_DEBUG_ASSERT(gtid >= 0);
+
+  return gtid;
+}
+
+/* Checks th's stack against the other threads'; caller holds forkjoin_lock */
+void __kmp_check_stack_overlap(kmp_info_t *th) {
+  int f;
+  char *stack_beg = NULL;
+  char *stack_end = NULL;
+  int gtid;
+
+  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
+  if (__kmp_storage_map) { // optional report of th's own stack range
+    stack_end = (char *)th->th.th_info.ds.ds_stackbase;
+    stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
+
+    gtid = __kmp_gtid_from_thread(th);
+
+    if (gtid == KMP_GTID_MONITOR) {
+      __kmp_print_storage_map_gtid(
+          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
+          "th_%s stack (%s)", "mon",
+          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
+    } else {
+      __kmp_print_storage_map_gtid(
+          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
+          "th_%d stack (%s)", gtid,
+          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
+    }
+  }
+
+  /* No point in checking ubermaster threads since they use refinement and
+   * cannot overlap */
+  gtid = __kmp_gtid_from_thread(th);
+  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
+    KA_TRACE(10,
+             ("__kmp_check_stack_overlap: performing extensive checking\n"));
+    if (stack_beg == NULL) { // bounds were not computed by the block above
+      stack_end = (char *)th->th.th_info.ds.ds_stackbase;
+      stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
+    }
+
+    for (f = 0; f < __kmp_threads_capacity; f++) {
+      kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
+
+      if (f_th && f_th != th) {
+        char *other_stack_end =
+            (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
+        char *other_stack_beg =
+            other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
+        if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
+            (stack_end > other_stack_beg && stack_end < other_stack_end)) {
+          // One end of th's stack lies strictly inside f_th's stack range.
+          /* Print the other stack values before the abort */
+          if (__kmp_storage_map)
+            __kmp_print_storage_map_gtid(
+                -1, other_stack_beg, other_stack_end,
+                (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
+                "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
+
+          __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
+                      __kmp_msg_null);
+        }
+      }
+    }
+  }
+  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
+}
+
+/* ------------------------------------------------------------------------ */
+
+void __kmp_infinite_loop(void) {
+  static int finished = FALSE; // nothing in this function ever sets it
+
+  for (; !finished;) {
+    KMP_YIELD(TRUE);
+  }
+}
+
+#define MAX_MESSAGE 512
+// Prints "OMP storage map: <p1> <p2> <size> <caller's formatted text>".
+void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
+                                  char const *format, ...) {
+  char buffer[MAX_MESSAGE]; // fixed prefix followed by the caller's format
+  va_list ap;
+  va_start(ap, format);
+  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
+               p2, (unsigned long)size, format);
+  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
+  __kmp_vprintf(kmp_err, buffer, ap);
+#if KMP_PRINT_DATA_PLACEMENT
+  int node;
+  if (gtid >= 0) {
+    if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
+      if (__kmp_storage_map_verbose) {
+        node = __kmp_get_host_node(p1);
+        if (node < 0) /* doesn't work, so don't try this next time */
+          __kmp_storage_map_verbose = FALSE;
+        else {
+          char *last;
+          int lastNode;
+          int localProc = __kmp_get_cpu_from_gtid(gtid);
+
+          const int page_size = KMP_GET_PAGE_SIZE();
+
+          p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
+          p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
+          if (localProc >= 0)
+            __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
+                                 localProc >> 1);
+          else
+            __kmp_printf_no_lock("  GTID %d\n", gtid);
+#if KMP_USE_PRCTL
+          /* The more elaborate format is disabled for now because of the prctl
+           * hanging bug. */
+          do {
+            last = p1;
+            lastNode = node;
+            /* This loop collates adjacent pages with the same host node. */
+            do {
+              (char *)p1 += page_size;
+            } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
+            __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
+                                 lastNode);
+          } while (p1 <= p2);
+#else
+          __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
+                               (char *)p1 + (page_size - 1),
+                               __kmp_get_host_node(p1));
+          if (p1 < p2) {
+            __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
+                                 (char *)p2 + (page_size - 1),
+                                 __kmp_get_host_node(p2));
+          }
+#endif
+        }
+      }
+    } else
+      __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
+  }
+#endif /* KMP_PRINT_DATA_PLACEMENT */
+  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
+  va_end(ap); // every va_start must be paired with va_end before returning
+}
+
+// Print a printf-style warning to kmp_err unless warnings are switched off.
+void __kmp_warn(char const *format, ...) {
+  if (__kmp_generate_warnings == kmp_warnings_off)
+    return;
+
+  // Frame the caller's format string as "OMP warning: <format>\n".
+  char wrapped_format[MAX_MESSAGE];
+  KMP_SNPRINTF(wrapped_format, sizeof(wrapped_format), "OMP warning: %s\n",
+               format);
+
+  va_list args;
+  va_start(args, format);
+  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
+  __kmp_vprintf(kmp_err, wrapped_format, args);
+  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
+  va_end(args);
+}
+
+void __kmp_abort_process() { // abnormal termination of the whole process
+  // Later threads may stall here, but that's ok because abort() will kill them.
+  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
+  // Dump the debug buffer, if one is in use, before the process goes away.
+  if (__kmp_debug_buf) {
+    __kmp_dump_debug_buffer();
+  }
+
+  if (KMP_OS_WINDOWS) {
+    // Let other threads know of abnormal termination and prevent deadlock
+    // if abort happened during library initialization or shutdown
+    __kmp_global.g.g_abort = SIGABRT;
+
+    /* On Windows* OS, abort() by default raises a pop-up error box, which
+       stalls nightly testing. Unfortunately, we cannot reliably suppress such
+       pop-ups: _set_abort_behavior() works well, but it is not available in
+       VS7 (not a problem for the DLL, but a problem for the static OpenMP
+       RTL), and SetErrorMode (and so the timelimit utility) does not help, at
+       least in some versions of the MS C RTL.
+
+       The following sequence seems to be the only way to simulate abort()
+       while avoiding the pop-up error box. */
+    raise(SIGABRT);
+    _exit(3); // Just in case, if signal ignored, exit anyway.
+  } else {
+    abort();
+  }
+  // NOTE(review): presumably unreachable, since both branches above terminate.
+  __kmp_infinite_loop();
+  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
+
+} // __kmp_abort_process
+
+void __kmp_abort_thread(void) {
+  // TODO: Eliminate the g_abort global variable and this function; in case of
+  // abort just call abort(), it will kill all the threads. For now, park here.
+  __kmp_infinite_loop();
+} // __kmp_abort_thread
+
+/* Dump storage-map lines for a thread descriptor (kmp_info_t) and for the
+   sub-objects that are allocated inside it. */
+
+static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
+  kmp_balign_t *bars = thr->th.th_bar; // first element of the barrier array
+
+  // The descriptor as a whole, then its th_info and th_local members.
+  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
+                               gtid);
+  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
+                               sizeof(kmp_desc_t), "th_%d.th_info", gtid);
+  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
+                               sizeof(kmp_local_t), "th_%d.th_local", gtid);
+
+  // The whole barrier array, followed by each individual barrier slot.
+  __kmp_print_storage_map_gtid(gtid, bars, bars + bs_last_barrier,
+                               sizeof(kmp_balign_t) * bs_last_barrier,
+                               "th_%d.th_bar", gtid);
+  __kmp_print_storage_map_gtid(gtid, bars + bs_plain_barrier,
+                               bars + bs_plain_barrier + 1,
+                               sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
+                               gtid);
+  __kmp_print_storage_map_gtid(gtid, bars + bs_forkjoin_barrier,
+                               bars + bs_forkjoin_barrier + 1,
+                               sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
+                               gtid);
+
+#if KMP_FAST_REDUCTION_BARRIER
+  __kmp_print_storage_map_gtid(gtid, bars + bs_reduction_barrier,
+                               bars + bs_reduction_barrier + 1,
+                               sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
+                               gtid);
+#endif // KMP_FAST_REDUCTION_BARRIER
+}
+
+/* Print storage-map lines for a kmp_team_t and its per-team arrays; every line
+   is labelled "<header>_<team_id>..." and num_thr sizes the per-thread ones. */
+
+static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
+                                         int team_id, int num_thr) {
+  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
+  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
+                               header, team_id);
+  // Team barrier state: the whole t_bar array, then each barrier on its own.
+  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
+                               &team->t.t_bar[bs_last_barrier],
+                               sizeof(kmp_balign_team_t) * bs_last_barrier,
+                               "%s_%d.t_bar", header, team_id);
+
+  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
+                               &team->t.t_bar[bs_plain_barrier + 1],
+                               sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
+                               header, team_id);
+
+  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
+                               &team->t.t_bar[bs_forkjoin_barrier + 1],
+                               sizeof(kmp_balign_team_t),
+                               "%s_%d.t_bar[forkjoin]", header, team_id);
+
+#if KMP_FAST_REDUCTION_BARRIER
+  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
+                               &team->t.t_bar[bs_reduction_barrier + 1],
+                               sizeof(kmp_balign_team_t),
+                               "%s_%d.t_bar[reduction]", header, team_id);
+#endif // KMP_FAST_REDUCTION_BARRIER
+  // Arrays sized by num_thr, then the num_disp_buff dispatch buffers.
+  __kmp_print_storage_map_gtid(
+      -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
+      sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
+
+  __kmp_print_storage_map_gtid(
+      -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
+      sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
+
+  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
+                               &team->t.t_disp_buffer[num_disp_buff],
+                               sizeof(dispatch_shared_info_t) * num_disp_buff,
+                               "%s_%d.t_disp_buffer", header, team_id);
+}
+
+static void __kmp_init_allocator() { __kmp_init_memkind(); } // memkind init
+static void __kmp_fini_allocator() { __kmp_fini_memkind(); } // memkind fini
+
+/* ------------------------------------------------------------------------ */
+
+#if KMP_DYNAMIC_LIB
+#if KMP_OS_WINDOWS
+
+static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
+  // TODO: Change to __kmp_break_bootstrap_lock().
+  __kmp_init_bootstrap_lock(lck); // re-initializing makes the lock released
+}
+
+static void __kmp_reset_locks_on_process_detach(int gtid_req) {
+  int i;
+  int thread_count;
+  // gtid_req is the gtid skipped by the scan below (the caller's own gtid).
+  // PROCESS_DETACH is expected to be called by a thread that executes
+  // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one
+  // calling ProcessExit or FreeLibrary). So, it might be safe to access the
+  // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some
+  // threads can be still alive here, although being about to be terminated. The
+  // threads in the array with ds_thread==0 are most suspicious. So accessing
+  // __kmp_threads[] may in fact be unsafe.
+
+  // TODO: does it make sense to check __kmp_roots[] ?
+
+  // Let's check that there are no other alive threads registered with the OMP
+  // lib.
+  while (1) { // re-scan until no other registered thread is reported alive
+    thread_count = 0;
+    for (i = 0; i < __kmp_threads_capacity; ++i) {
+      if (!__kmp_threads)
+        continue;
+      kmp_info_t *th = __kmp_threads[i];
+      if (th == NULL)
+        continue;
+      int gtid = th->th.th_info.ds.ds_gtid;
+      if (gtid == gtid_req)
+        continue;
+      if (gtid < 0)
+        continue;
+      DWORD exit_val;
+      int alive = __kmp_is_thread_alive(th, &exit_val);
+      if (alive) {
+        ++thread_count;
+      }
+    }
+    if (thread_count == 0)
+      break; // success
+  }
+
+  // Assume that I'm alone. Now it might be safe to check and reset locks.
+  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
+  __kmp_reset_lock(&__kmp_forkjoin_lock);
+#ifdef KMP_DEBUG
+  __kmp_reset_lock(&__kmp_stdio_lock);
+#endif // KMP_DEBUG
+}
+
+BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
+  // DLL entry point: dispatch on fdwReason; every path returns TRUE.
+
+  switch (fdwReason) {
+
+  case DLL_PROCESS_ATTACH:
+    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
+
+    return TRUE;
+
+  case DLL_PROCESS_DETACH:
+    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
+
+    if (lpReserved != NULL) {
+      // lpReserved is used for telling the difference:
+      //   lpReserved == NULL when FreeLibrary() was called,
+      //   lpReserved != NULL when the process terminates.
+      // When FreeLibrary() is called, worker threads remain alive. So they will
+      // release the forkjoin lock by themselves. When the process terminates,
+      // worker threads disappear triggering the problem of unreleased forkjoin
+      // lock as described below.
+
+      // A worker thread can take the forkjoin lock. The problem comes up if
+      // that worker thread becomes dead before it releases the forkjoin lock.
+      // The forkjoin lock remains taken, while the thread executing
+      // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
+      // to take the forkjoin lock and will always fail, so that the application
+      // will never finish [normally]. This scenario is possible if
+      // __kmpc_end() has not been executed. It looks like it's not a corner
+      // case, but common cases:
+      // - the main function was compiled by an alternative compiler;
+      // - the main function was compiled by icl but without /Qopenmp
+      //   (application with plugins);
+      // - application terminates by calling C exit(), Fortran CALL EXIT() or
+      //   Fortran STOP.
+      // - alive foreign thread prevented __kmpc_end from doing cleanup.
+      //
+      // This is a hack to work around the problem.
+      // TODO: !!! figure out something better.
+      __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
+    }
+
+    __kmp_internal_end_library(__kmp_gtid_get_specific());
+
+    return TRUE;
+
+  case DLL_THREAD_ATTACH:
+    KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
+
+    /* if we want to register new siblings all the time here call
+     * __kmp_get_gtid(); */
+    return TRUE;
+
+  case DLL_THREAD_DETACH:
+    KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
+
+    __kmp_internal_end_thread(__kmp_gtid_get_specific());
+    return TRUE;
+  }
+
+  return TRUE;
+}
+
+#endif /* KMP_OS_WINDOWS */
+#endif /* KMP_DYNAMIC_LIB */
+
+/* __kmp_parallel_deo -- Wait until it's our turn (cid_ref is not used). */
+void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
+  int gtid = *gtid_ref;
+#ifdef BUILD_PARALLEL_ORDERED
+  kmp_team_t *team = __kmp_team_from_gtid(gtid);
+#endif /* BUILD_PARALLEL_ORDERED */
+  // Consistency checking: push an ordered-in-parallel record if root is active.
+  if (__kmp_env_consistency_check) {
+    if (__kmp_threads[gtid]->th.th_root->r.r_active)
+#if KMP_USE_DYNAMIC_LOCK
+      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
+#else
+      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
+#endif
+  }
+#ifdef BUILD_PARALLEL_ORDERED
+  if (!team->t.t_serialized) { // no waiting when the team is serialized
+    KMP_MB();
+    KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
+             NULL); // presumably spins until t_value equals our tid
+    KMP_MB();
+  }
+#endif /* BUILD_PARALLEL_ORDERED */
+}
+
+/* __kmp_parallel_dxo -- Signal the next task (cid_ref is not used). */
+void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
+  int gtid = *gtid_ref;
+#ifdef BUILD_PARALLEL_ORDERED
+  int tid = __kmp_tid_from_gtid(gtid);
+  kmp_team_t *team = __kmp_team_from_gtid(gtid);
+#endif /* BUILD_PARALLEL_ORDERED */
+  // Consistency checking: pop the ordered-in-parallel record if root is active.
+  if (__kmp_env_consistency_check) {
+    if (__kmp_threads[gtid]->th.th_root->r.r_active)
+      __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
+  }
+#ifdef BUILD_PARALLEL_ORDERED
+  if (!team->t.t_serialized) { // nothing to hand over in a serialized team
+    KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+    /* use the tid of the next thread in this team */
+    /* TODO replace with general release procedure */
+    team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
+
+    KMP_MB(); /* Flush all pending memory write invalidates.  */
+  }
+#endif /* BUILD_PARALLEL_ORDERED */
+}
+
+/* ------------------------------------------------------------------------ */
+/* The BARRIER for a SINGLE process section is always explicit   */
+
+int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
+  int status;
+  kmp_info_t *th;
+  kmp_team_t *team;
+  // Returns nonzero iff the calling thread got the single block.
+  if (!TCR_4(__kmp_init_parallel))
+    __kmp_parallel_initialize();
+  __kmp_resume_if_soft_paused();
+
+  th = __kmp_threads[gtid];
+  team = th->th.th_team;
+  status = 0;
+
+  th->th.th_ident = id_ref;
+  // In a serialized team the caller always takes the block.
+  if (team->t.t_serialized) {
+    status = 1;
+  } else {
+    kmp_int32 old_this = th->th.th_local.this_construct;
+
+    ++th->th.th_local.this_construct;
+    /* try to set team count to thread count--success means thread got the
+       single block */
+    /* TODO: Should this be acquire or release? */
+    if (team->t.t_construct == old_this) {
+      status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
+                                              th->th.th_local.this_construct);
+    }
+#if USE_ITT_BUILD
+    if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
+        KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
+        team->t.t_active_level ==
+            1) { // Only report metadata by master of active team at level 1
+      __kmp_itt_metadata_single(id_ref);
+    }
+#endif /* USE_ITT_BUILD */
+  }
+  // Consistency checking: push only for the winner when push_ws is set.
+  if (__kmp_env_consistency_check) {
+    if (status && push_ws) {
+      __kmp_push_workshare(gtid, ct_psingle, id_ref);
+    } else {
+      __kmp_check_workshare(gtid, ct_psingle, id_ref);
+    }
+  }
+#if USE_ITT_BUILD
+  if (status) {
+    __kmp_itt_single_start(gtid);
+  }
+#endif /* USE_ITT_BUILD */
+  return status;
+}
+
+void __kmp_exit_single(int gtid) { // leave a single construct
+#if USE_ITT_BUILD
+  __kmp_itt_single_end(gtid);
+#endif /* USE_ITT_BUILD */
+  if (__kmp_env_consistency_check) // pop the ct_psingle workshare record
+    __kmp_pop_workshare(gtid, ct_psingle, NULL);
+}
+
+/* Determine if we can go parallel or must use a serialized parallel region and
+ * how many threads we can use.
+ * set_nthreads is the number of threads requested for the team.
+ * Returns 1 (not 0) on every early-out that serializes the region or limits it
+ * to a single thread; otherwise returns the number of threads to use.
+ * The forkjoin lock is held by the caller. */
+static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
+                                 int master_tid, int set_nthreads,
+                                 int enter_teams) {
+  int capacity;
+  int new_nthreads;
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+  KMP_DEBUG_ASSERT(root && parent_team);
+  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
+  // NOTE(review): enter_teams is not referenced anywhere in this function.
+  // If dyn-var is set, dynamically adjust the number of desired threads,
+  // according to the method specified by dynamic_mode.
+  new_nthreads = set_nthreads;
+  if (!get__dynamic_2(parent_team, master_tid)) {
+    ;
+  }
+#ifdef USE_LOAD_BALANCE
+  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
+    new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
+    if (new_nthreads == 1) {
+      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
+                    "reservation to 1 thread\n",
+                    master_tid));
+      return 1;
+    }
+    if (new_nthreads < set_nthreads) {
+      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
+                    "reservation to %d threads\n",
+                    master_tid, new_nthreads));
+    }
+  }
+#endif /* USE_LOAD_BALANCE */
+  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
+    new_nthreads = __kmp_avail_proc - __kmp_nth +
+                   (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
+    if (new_nthreads <= 1) {
+      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
+                    "reservation to 1 thread\n",
+                    master_tid));
+      return 1;
+    }
+    if (new_nthreads < set_nthreads) {
+      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
+                    "reservation to %d threads\n",
+                    master_tid, new_nthreads));
+    } else {
+      new_nthreads = set_nthreads;
+    }
+  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
+    if (set_nthreads > 2) {
+      new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
+      new_nthreads = (new_nthreads % set_nthreads) + 1;
+      if (new_nthreads == 1) {
+        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
+                      "reservation to 1 thread\n",
+                      master_tid));
+        return 1;
+      }
+      if (new_nthreads < set_nthreads) {
+        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
+                      "reservation to %d threads\n",
+                      master_tid, new_nthreads));
+      }
+    }
+  } else {
+    KMP_ASSERT(0);
+  }
+
+  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
+  if (__kmp_nth + new_nthreads -
+          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
+      __kmp_max_nth) {
+    int tl_nthreads = __kmp_max_nth - __kmp_nth +
+                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
+    if (tl_nthreads <= 0) {
+      tl_nthreads = 1;
+    }
+
+    // If dyn-var is false, emit a 1-time warning.
+    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
+      __kmp_reserve_warn = 1;
+      __kmp_msg(kmp_ms_warning,
+                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
+                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
+    }
+    if (tl_nthreads == 1) {
+      KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
+                    "reduced reservation to 1 thread\n",
+                    master_tid));
+      return 1;
+    }
+    KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
+                  "reservation to %d threads\n",
+                  master_tid, tl_nthreads));
+    new_nthreads = tl_nthreads;
+  }
+
+  // Respect OMP_THREAD_LIMIT
+  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
+  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
+  if (cg_nthreads + new_nthreads -
+          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
+      max_cg_threads) {
+    int tl_nthreads = max_cg_threads - cg_nthreads +
+                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
+    if (tl_nthreads <= 0) {
+      tl_nthreads = 1;
+    }
+
+    // If dyn-var is false, emit a 1-time warning.
+    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
+      __kmp_reserve_warn = 1;
+      __kmp_msg(kmp_ms_warning,
+                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
+                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
+    }
+    if (tl_nthreads == 1) {
+      KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
+                    "reduced reservation to 1 thread\n",
+                    master_tid));
+      return 1;
+    }
+    KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
+                  "reservation to %d threads\n",
+                  master_tid, tl_nthreads));
+    new_nthreads = tl_nthreads;
+  }
+
+  // Check if the threads array is large enough, or needs expanding.
+  // See comment in __kmp_register_root() about the adjustment if
+  // __kmp_threads[0] == NULL.
+  capacity = __kmp_threads_capacity;
+  if (TCR_PTR(__kmp_threads[0]) == NULL) {
+    --capacity;
+  }
+  if (__kmp_nth + new_nthreads -
+          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
+      capacity) {
+    // Expand the threads array.
+    int slotsRequired = __kmp_nth + new_nthreads -
+                        (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
+                        capacity;
+    int slotsAdded = __kmp_expand_threads(slotsRequired);
+    if (slotsAdded < slotsRequired) {
+      // The threads array was not expanded enough.
+      new_nthreads -= (slotsRequired - slotsAdded);
+      KMP_ASSERT(new_nthreads >= 1);
+
+      // If dyn-var is false, emit a 1-time warning.
+      if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
+        __kmp_reserve_warn = 1;
+        if (__kmp_tp_cached) {
+          __kmp_msg(kmp_ms_warning,
+                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
+                    KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
+                    KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
+        } else {
+          __kmp_msg(kmp_ms_warning,
+                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
+                    KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
+        }
+      }
+    }
+  }
+
+#ifdef KMP_DEBUG
+  if (new_nthreads == 1) {
+    KC_TRACE(10,
+             ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
+              "dead roots and rechecking; requested %d threads\n",
+              __kmp_get_gtid(), set_nthreads));
+  } else {
+    KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
+                  " %d threads\n",
+                  __kmp_get_gtid(), new_nthreads, set_nthreads));
+  }
+#endif // KMP_DEBUG
+  return new_nthreads;
+}
+
+/* Allocate threads from the thread pool and assign them to the new team. We are
+   assured that there are enough threads available, because that was checked
+   earlier within the forkjoin critical section. */
+static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
+                                    kmp_info_t *master_th, int master_gtid) {
+  int i;
+  int use_hot_team;
+
+  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
+  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
+  KMP_MB();
+
+  /* first, let's setup the master thread */
+  master_th->th.th_info.ds.ds_tid = 0;
+  master_th->th.th_team = team;
+  master_th->th.th_team_nproc = team->t.t_nproc;
+  master_th->th.th_team_master = master_th;
+  master_th->th.th_team_serialized = FALSE;
+  master_th->th.th_dispatch = &team->t.t_dispatch[0];
+
+/* make sure we are not the optimized hot team */
+#if KMP_NESTED_HOT_TEAMS
+  use_hot_team = 0;
+  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
+  if (hot_teams) { // hot teams array is not allocated if
+    // KMP_HOT_TEAMS_MAX_LEVEL=0
+    int level = team->t.t_active_level - 1; // index in array of hot teams
+    if (master_th->th.th_teams_microtask) { // are we inside the teams?
+      if (master_th->th.th_teams_size.nteams > 1) {
+        ++level; // level was not increased in teams construct for
+        // team_of_masters
+      }
+      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
+          master_th->th.th_teams_level == team->t.t_level) {
+        ++level; // level was not increased in teams construct for
+        // team_of_workers before the parallel
+      } // team->t.t_level will be increased inside parallel
+    }
+    if (level < __kmp_hot_teams_max_level) {
+      if (hot_teams[level].hot_team) {
+        // hot team has already been allocated for given level
+        KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
+        use_hot_team = 1; // the team is ready to use
+      } else {
+        use_hot_team = 0; // AC: threads are not allocated yet
+        hot_teams[level].hot_team = team; // remember new hot team
+        hot_teams[level].hot_team_nth = team->t.t_nproc;
+      }
+    } else {
+      use_hot_team = 0;
+    }
+  }
+#else
+  use_hot_team = team == root->r.r_hot_team;
+#endif
+  if (!use_hot_team) {
+    // Not a ready-to-use hot team: fill in every thread slot of the team.
+    /* install the master thread */
+    team->t.t_threads[0] = master_th;
+    __kmp_initialize_info(master_th, team, 0, master_gtid);
+
+    /* now, install the worker threads */
+    for (i = 1; i < team->t.t_nproc; i++) {
+
+      /* fork or reallocate a new thread and install it in team */
+      kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
+      team->t.t_threads[i] = thr;
+      KMP_DEBUG_ASSERT(thr);
+      KMP_DEBUG_ASSERT(thr->th.th_team == team);
+      /* align team and thread arrived states */
+      KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
+                    "T#%d(%d:%d) join =%llu, plain=%llu\n",
+                    __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
+                    __kmp_gtid_from_tid(i, team), team->t.t_id, i,
+                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
+                    team->t.t_bar[bs_plain_barrier].b_arrived));
+      thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
+      thr->th.th_teams_level = master_th->th.th_teams_level;
+      thr->th.th_teams_size = master_th->th.th_teams_size;
+      { // Initialize threads' barrier data.
+        int b;
+        kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
+        for (b = 0; b < bs_last_barrier; ++b) {
+          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
+          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
+#if USE_DEBUGGER
+          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
+#endif
+        }
+      }
+    }
+
+#if KMP_AFFINITY_SUPPORTED
+    __kmp_partition_places(team);
+#endif
+  }
+  // Request affinity display when any thread's previous nproc/level changed.
+  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
+    for (i = 0; i < team->t.t_nproc; i++) {
+      kmp_info_t *thr = team->t.t_threads[i];
+      if (thr->th.th_prev_num_threads != team->t.t_nproc ||
+          thr->th.th_prev_level != team->t.t_level) {
+        team->t.t_display_affinity = 1;
+        break;
+      }
+    }
+  }
+
+  KMP_MB();
+}
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+// Propagate the current floating point control register values out to the
+// team. Team fields are written only when their value actually changes, to
+// avoid unnecessary writes to the relevant cache line in the team structure.
+// Nothing is read from the registers when __kmp_inherit_fp_control is off.
+inline static void propagateFPControl(kmp_team_t *team) {
+  if (!__kmp_inherit_fp_control) {
+    // Nothing is saved in this mode. As below, don't write to this cache-line
+    // in the team structure unless we have to.
+    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
+    return;
+  }
+
+  // Get master values of FPU control flags (both X87 and vector); only the
+  // MXCSR bits selected by KMP_X86_MXCSR_MASK are kept.
+  kmp_int16 fpu_cw;
+  kmp_uint32 sse_csr;
+  __kmp_store_x87_fpu_control_word(&fpu_cw);
+  __kmp_store_mxcsr(&sse_csr);
+  sse_csr &= KMP_X86_MXCSR_MASK;
+
+  // t_fp_control_saved is deliberately not examined here. If it is TRUE, the
+  // team values must still be refreshed when they differ from the ones just
+  // read; if it is FALSE, nothing was saved yet, but the goal is the same: the
+  // team must hold the values we have now. Checking whether a value needs
+  // updating avoids unnecessary writes that would put the cache-line into a
+  // written state, causing all threads in the team to have to read it again.
+  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, fpu_cw);
+  KMP_CHECK_UPDATE(team->t.t_mxcsr, sse_csr);
+
+  // This value is not used here, but other code in the runtime wants to know
+  // whether it should restore the registers, so it must be kept correct.
+  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
+}
+
+// Do the opposite: load the hardware FP control registers from the values
+// saved in the team, touching a register only when its value differs.
+inline static void updateHWFPControl(kmp_team_t *team) {
+  // Nothing to restore unless inheritance is on and the team saved values.
+  if (!__kmp_inherit_fp_control || !team->t.t_fp_control_saved)
+    return;
+
+  // Sample the current x87 control word and the masked MXCSR value.
+  kmp_int16 cur_x87_cw;
+  kmp_uint32 cur_mxcsr;
+  __kmp_store_x87_fpu_control_word(&cur_x87_cw);
+  __kmp_store_mxcsr(&cur_mxcsr);
+  cur_mxcsr &= KMP_X86_MXCSR_MASK;
+
+  if (cur_x87_cw != team->t.t_x87_fpu_control_word) {
+    __kmp_clear_x87_fpu_status_word();
+    __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
+  }
+  if (cur_mxcsr != team->t.t_mxcsr) {
+    __kmp_load_mxcsr(&team->t.t_mxcsr);
+  }
+}
+#else
+#define propagateFPControl(x) ((void)0) // no-op unless KMP_ARCH_X86/X86_64
+#define updateHWFPControl(x) ((void)0) // no-op unless KMP_ARCH_X86/X86_64
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
+                                     int realloc); // forward declaration
+
+/* Run a parallel region that has been serialized, so runs only in a team of the
+   single master thread. */
+void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
+  kmp_info_t *this_thr;
+  kmp_team_t *serial_team;
+
+  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
+
+  /* Skip all this code for autopar serialized loops since it results in
+     unacceptable overhead */
+  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
+    return;
+
+  if (!TCR_4(__kmp_init_parallel))
+    __kmp_parallel_initialize();
+  __kmp_resume_if_soft_paused();
+
+  this_thr = __kmp_threads[global_tid];
+  serial_team = this_thr->th.th_serial_team;
+
+  /* utilize the serialized team held by this thread */
+  KMP_DEBUG_ASSERT(serial_team);
+  KMP_MB();
+
+  if (__kmp_tasking_mode != tskm_immediate_exec) {
+    KMP_DEBUG_ASSERT(
+        this_thr->th.th_task_team ==
+        this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
+    KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
+                     NULL);
+    KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
+                  "team %p, new task_team = NULL\n",
+                  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
+    this_thr->th.th_task_team = NULL;
+  }
+
+  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
+  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
+    proc_bind = proc_bind_false;
+  } else if (proc_bind == proc_bind_default) {
+    // No proc_bind clause was specified, so use the current value
+    // of proc-bind-var for this parallel region.
+    proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
+  }
+  // Reset for next parallel region
+  this_thr->th.th_set_proc_bind = proc_bind_default;
+
+#if OMPT_SUPPORT
+  ompt_data_t ompt_parallel_data = ompt_data_none;
+  ompt_data_t *implicit_task_data;
+  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
+  if (ompt_enabled.enabled &&
+      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
+
+    ompt_task_info_t *parent_task_info;
+    parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
+
+    parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+    if (ompt_enabled.ompt_callback_parallel_begin) {
+      int team_size = 1;
+
+      ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
+          &(parent_task_info->task_data), &(parent_task_info->frame),
+          &ompt_parallel_data, team_size, ompt_parallel_invoker_program,
+          codeptr);
+    }
+  }
+#endif // OMPT_SUPPORT
+
+  if (this_thr->th.th_team != serial_team) {
+    // Nested level will be an index in the nested nthreads array
+    int level = this_thr->th.th_team->t.t_level;
+
+    if (serial_team->t.t_serialized) {
+      /* this serial team was already used
+         TODO increase performance by making these locks more specific */
+      kmp_team_t *new_team;
+
+      __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
+
+      new_team =
+          __kmp_allocate_team(this_thr->th.th_root, 1, 1,
+#if OMPT_SUPPORT
+                              ompt_parallel_data,
+#endif
+                              proc_bind, &this_thr->th.th_current_task->td_icvs,
+                              0 USE_NESTED_HOT_ARG(NULL));
+      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
+      KMP_ASSERT(new_team);
+
+      /* setup new serialized team and install it */
+      new_team->t.t_threads[0] = this_thr;
+      new_team->t.t_parent = this_thr->th.th_team;
+      serial_team = new_team;
+      this_thr->th.th_serial_team = serial_team;
+
+      KF_TRACE(
+          10,
+          ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
+           global_tid, serial_team));
+
+      /* TODO the above breaks the requirement that if we run out of resources,
+         then we can still guarantee that serialized teams are ok, since we may
+         need to allocate a new one */
+    } else {
+      KF_TRACE(
+          10,
+          ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
+           global_tid, serial_team));
+    }
+
+    /* we have to initialize this serial team */
+    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
+    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
+    KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
+    serial_team->t.t_ident = loc;
+    serial_team->t.t_serialized = 1;
+    serial_team->t.t_nproc = 1;
+    serial_team->t.t_parent = this_thr->th.th_team;
+    serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
+    this_thr->th.th_team = serial_team;
+    serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
+
+    KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
+                  this_thr->th.th_current_task));
+    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
+    this_thr->th.th_current_task->td_flags.executing = 0;
+
+    __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
+
+    /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
+       implicit task for each serialized task represented by
+       team->t.t_serialized? */
+    copy_icvs(&this_thr->th.th_current_task->td_icvs,
+              &this_thr->th.th_current_task->td_parent->td_icvs);
+
+    // Thread value exists in the nested nthreads array for the next nested
+    // level
+    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
+      this_thr->th.th_current_task->td_icvs.nproc =
+          __kmp_nested_nth.nth[level + 1];
+    }
+
+    if (__kmp_nested_proc_bind.used &&
+        (level + 1 < __kmp_nested_proc_bind.used)) {
+      this_thr->th.th_current_task->td_icvs.proc_bind =
+          __kmp_nested_proc_bind.bind_types[level + 1];
+    }
+
+#if USE_DEBUGGER
+    serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
+#endif
+    this_thr->th.th_info.ds.ds_tid = 0;
+
+    /* set thread cache values */
+    this_thr->th.th_team_nproc = 1;
+    this_thr->th.th_team_master = this_thr;
+    this_thr->th.th_team_serialized = 1;
+
+    serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
+    serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
+    serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
+
+    propagateFPControl(serial_team);
+
+    /* check if we need to allocate dispatch buffers stack */
+    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
+    if (!serial_team->t.t_dispatch->th_disp_buffer) {
+      serial_team->t.t_dispatch->th_disp_buffer =
+          (dispatch_private_info_t *)__kmp_allocate(
+              sizeof(dispatch_private_info_t));
+    }
+    this_thr->th.th_dispatch = serial_team->t.t_dispatch;
+
+    KMP_MB();
+
+  } else {
+    /* this serialized team is already being used,
+     * that's fine, just add another nested level */
+    KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
+    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
+    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
+    ++serial_team->t.t_serialized;
+    this_thr->th.th_team_serialized = serial_team->t.t_serialized;
+
+    // Nested level will be an index in the nested nthreads array
+    int level = this_thr->th.th_team->t.t_level;
+    // Thread value exists in the nested nthreads array for the next nested
+    // level
+    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
+      this_thr->th.th_current_task->td_icvs.nproc =
+          __kmp_nested_nth.nth[level + 1];
+    }
+    serial_team->t.t_level++;
+    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
+                  "of serial team %p to %d\n",
+                  global_tid, serial_team, serial_team->t.t_level));
+
+    /* allocate/push dispatch buffers stack */
+    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
+    {
+      dispatch_private_info_t *disp_buffer =
+          (dispatch_private_info_t *)__kmp_allocate(
+              sizeof(dispatch_private_info_t));
+      disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
+      serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
+    }
+    this_thr->th.th_dispatch = serial_team->t.t_dispatch;
+
+    KMP_MB();
+  }
+  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
+
+  // Perform the display affinity functionality for
+  // serialized parallel regions
+  if (__kmp_display_affinity) {
+    if (this_thr->th.th_prev_level != serial_team->t.t_level ||
+        this_thr->th.th_prev_num_threads != 1) {
+      // NULL means use the affinity-format-var ICV
+      __kmp_aux_display_affinity(global_tid, NULL);
+      this_thr->th.th_prev_level = serial_team->t.t_level;
+      this_thr->th.th_prev_num_threads = 1;
+    }
+  }
+
+  if (__kmp_env_consistency_check)
+    __kmp_push_parallel(global_tid, NULL);
+#if OMPT_SUPPORT
+  serial_team->t.ompt_team_info.master_return_address = codeptr;
+  if (ompt_enabled.enabled &&
+      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
+    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+
+    ompt_lw_taskteam_t lw_taskteam;
+    __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
+                            &ompt_parallel_data, codeptr);
+
+    __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
+    // don't use lw_taskteam after linking. content was swapped
+
+    /* OMPT implicit task begin */
+    implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
+    if (ompt_enabled.ompt_callback_implicit_task) {
+      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+          ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
+          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
+      OMPT_CUR_TASK_INFO(this_thr)
+          ->thread_num = __kmp_tid_from_gtid(global_tid);
+    }
+
+    /* OMPT state */
+    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+  }
+#endif
+}
+
+/* most of the work for a fork */
+/* return true if we really went parallel, false if serialized */
+int __kmp_fork_call(ident_t *loc, int gtid,
+                    enum fork_context_e call_context, // Intel, GNU, ...
+                    kmp_int32 argc, microtask_t microtask, launch_t invoker,
+/* TODO: revert workaround for Intel(R) 64 tracker #96 */
+#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
+                    va_list *ap
+#else
+                    va_list ap
+#endif
+                    ) {
+  void **argv;
+  int i;
+  int master_tid;
+  int master_this_cons;
+  kmp_team_t *team;
+  kmp_team_t *parent_team;
+  kmp_info_t *master_th;
+  kmp_root_t *root;
+  int nthreads;
+  int master_active;
+  int master_set_numthreads;
+  int level;
+  int active_level;
+  int teams_level;
+#if KMP_NESTED_HOT_TEAMS
+  kmp_hot_team_ptr_t **p_hot_teams;
+#endif
+  { // KMP_TIME_BLOCK
+    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
+    KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
+
+    KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
+    if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
+      /* Some systems prefer the stack for the root thread(s) to start with */
+      /* some gap from the parent stack to prevent false sharing. */
+      void *dummy = KMP_ALLOCA(__kmp_stkpadding);
+      /* These 2 lines below are so this does not get optimized out */
+      if (__kmp_stkpadding > KMP_MAX_STKPADDING)
+        __kmp_stkpadding += (short)((kmp_int64)dummy);
+    }
+
+    /* initialize if needed */
+    KMP_DEBUG_ASSERT(
+        __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
+    if (!TCR_4(__kmp_init_parallel))
+      __kmp_parallel_initialize();
+    __kmp_resume_if_soft_paused();
+
+    /* setup current data */
+    master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
+    // shutdown
+    parent_team = master_th->th.th_team;
+    master_tid = master_th->th.th_info.ds.ds_tid;
+    master_this_cons = master_th->th.th_local.this_construct;
+    root = master_th->th.th_root;
+    master_active = root->r.r_active;
+    master_set_numthreads = master_th->th.th_set_nproc;
+
+#if OMPT_SUPPORT
+    ompt_data_t ompt_parallel_data = ompt_data_none;
+    ompt_data_t *parent_task_data;
+    ompt_frame_t *ompt_frame;
+    ompt_data_t *implicit_task_data;
+    void *return_address = NULL;
+
+    if (ompt_enabled.enabled) {
+      __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
+                                    NULL, NULL);
+      return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
+    }
+#endif
+
+    // Nested level will be an index in the nested nthreads array
+    level = parent_team->t.t_level;
+    // used to launch non-serial teams even if nested is not allowed
+    active_level = parent_team->t.t_active_level;
+    // needed to check nesting inside the teams
+    teams_level = master_th->th.th_teams_level;
+#if KMP_NESTED_HOT_TEAMS
+    p_hot_teams = &master_th->th.th_hot_teams;
+    if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
+      *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
+          sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
+      (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
+      // it is either actual or not needed (when active_level > 0)
+      (*p_hot_teams)[0].hot_team_nth = 1;
+    }
+#endif
+
+#if OMPT_SUPPORT
+    if (ompt_enabled.enabled) {
+      if (ompt_enabled.ompt_callback_parallel_begin) {
+        int team_size = master_set_numthreads
+                            ? master_set_numthreads
+                            : get__nproc_2(parent_team, master_tid);
+        ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
+            parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
+            OMPT_INVOKER(call_context), return_address);
+      }
+      master_th->th.ompt_thread_info.state = ompt_state_overhead;
+    }
+#endif
+
+    master_th->th.th_ident = loc;
+
+    if (master_th->th.th_teams_microtask && ap &&
+        microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
+      // AC: This is start of parallel that is nested inside teams construct.
+      // The team is actual (hot), all workers are ready at the fork barrier.
+      // No lock needed to initialize the team a bit, then free workers.
+      parent_team->t.t_ident = loc;
+      __kmp_alloc_argv_entries(argc, parent_team, TRUE);
+      parent_team->t.t_argc = argc;
+      argv = (void **)parent_team->t.t_argv;
+      for (i = argc - 1; i >= 0; --i)
+/* TODO: revert workaround for Intel(R) 64 tracker #96 */
+#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
+        *argv++ = va_arg(*ap, void *);
+#else
+        *argv++ = va_arg(ap, void *);
+#endif
+      // Increment our nested depth levels, but not increase the serialization
+      if (parent_team == master_th->th.th_serial_team) {
+        // AC: we are in serialized parallel
+        __kmpc_serialized_parallel(loc, gtid);
+        KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
+        // AC: need this in order for enquiry functions to work
+        // correctly, will restore at join time
+        parent_team->t.t_serialized--;
+#if OMPT_SUPPORT
+        void *dummy;
+        void **exit_runtime_p;
+
+        ompt_lw_taskteam_t lw_taskteam;
+
+        if (ompt_enabled.enabled) {
+          __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
+                                  &ompt_parallel_data, return_address);
+          exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
+
+          __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
+          // don't use lw_taskteam after linking. content was swapped
+
+          /* OMPT implicit task begin */
+          implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
+          if (ompt_enabled.ompt_callback_implicit_task) {
+            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+                ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
+                implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
+            OMPT_CUR_TASK_INFO(master_th)
+                ->thread_num = __kmp_tid_from_gtid(gtid);
+          }
+
+          /* OMPT state */
+          master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
+        } else {
+          exit_runtime_p = &dummy;
+        }
+#endif
+
+        {
+          KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
+          KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
+          __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
+#if OMPT_SUPPORT
+                                 ,
+                                 exit_runtime_p
+#endif
+                                 );
+        }
+
+#if OMPT_SUPPORT
+        *exit_runtime_p = NULL;
+        if (ompt_enabled.enabled) {
+          OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
+          if (ompt_enabled.ompt_callback_implicit_task) {
+            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+                ompt_scope_end, NULL, implicit_task_data, 1,
+                OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
+          }
+          __ompt_lw_taskteam_unlink(master_th);
+
+          if (ompt_enabled.ompt_callback_parallel_end) {
+            ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
+                OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
+                OMPT_INVOKER(call_context), return_address);
+          }
+          master_th->th.ompt_thread_info.state = ompt_state_overhead;
+        }
+#endif
+        return TRUE;
+      }
+
+      parent_team->t.t_pkfn = microtask;
+      parent_team->t.t_invoke = invoker;
+      KMP_ATOMIC_INC(&root->r.r_in_parallel);
+      parent_team->t.t_active_level++;
+      parent_team->t.t_level++;
+      parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
+
+      /* Change number of threads in the team if requested */
+      if (master_set_numthreads) { // The parallel has num_threads clause
+        if (master_set_numthreads < master_th->th.th_teams_size.nth) {
+          // AC: can only reduce number of threads dynamically, can't increase
+          kmp_info_t **other_threads = parent_team->t.t_threads;
+          parent_team->t.t_nproc = master_set_numthreads;
+          for (i = 0; i < master_set_numthreads; ++i) {
+            other_threads[i]->th.th_team_nproc = master_set_numthreads;
+          }
+          // Keep extra threads hot in the team for possible next parallels
+        }
+        master_th->th.th_set_nproc = 0;
+      }
+
+#if USE_DEBUGGER
+      if (__kmp_debugging) { // Let debugger override number of threads.
+        int nth = __kmp_omp_num_threads(loc);
+        if (nth > 0) { // 0 means debugger doesn't want to change num threads
+          master_set_numthreads = nth;
+        }
+      }
+#endif
+
+      KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
+                    "master_th=%p, gtid=%d\n",
+                    root, parent_team, master_th, gtid));
+      __kmp_internal_fork(loc, gtid, parent_team);
+      KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
+                    "master_th=%p, gtid=%d\n",
+                    root, parent_team, master_th, gtid));
+
+      /* Invoke microtask for MASTER thread */
+      KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
+                    parent_team->t.t_id, parent_team->t.t_pkfn));
+
+      if (!parent_team->t.t_invoke(gtid)) {
+        KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
+      }
+      KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
+                    parent_team->t.t_id, parent_team->t.t_pkfn));
+      KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+      KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
+
+      return TRUE;
+    } // Parallel closely nested in teams construct
+
+#if KMP_DEBUG
+    if (__kmp_tasking_mode != tskm_immediate_exec) {
+      KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
+                       parent_team->t.t_task_team[master_th->th.th_task_state]);
+    }
+#endif
+
+    if (parent_team->t.t_active_level >=
+        master_th->th.th_current_task->td_icvs.max_active_levels) {
+      nthreads = 1;
+    } else {
+      int enter_teams = ((ap == NULL && active_level == 0) ||
+                         (ap && teams_level > 0 && teams_level == level));
+      nthreads =
+          master_set_numthreads
+              ? master_set_numthreads
+              : get__nproc_2(
+                    parent_team,
+                    master_tid); // TODO: get nproc directly from current task
+
+      // Check if we need to take forkjoin lock? (no need for serialized
+      // parallel out of teams construct). This code moved here from
+      // __kmp_reserve_threads() to speed up nested serialized parallels.
+      if (nthreads > 1) {
+        if ((get__max_active_levels(master_th) == 1 &&
+             (root->r.r_in_parallel && !enter_teams)) ||
+            (__kmp_library == library_serial)) {
+          KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
+                        " threads\n",
+                        gtid, nthreads));
+          nthreads = 1;
+        }
+      }
+      if (nthreads > 1) {
+        /* determine how many new threads we can use */
+        __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
+        /* AC: If we execute teams from parallel region (on host), then teams
+           should be created but each can only have 1 thread if nesting is
+           disabled. If teams called from serial region, then teams and their
+           threads should be created regardless of the nesting setting. */
+        nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
+                                         nthreads, enter_teams);
+        if (nthreads == 1) {
+          // Free lock for single thread execution here; for multi-thread
+          // execution it will be freed later after team of threads created
+          // and initialized
+          __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
+        }
+      }
+    }
+    KMP_DEBUG_ASSERT(nthreads > 0);
+
+    // If we temporarily changed the set number of threads then restore it now
+    master_th->th.th_set_nproc = 0;
+
+    /* create a serialized parallel region? */
+    if (nthreads == 1) {
+/* josh todo: hypothetical question: what do we do for OS X*? */
+#if KMP_OS_LINUX &&                                                            \
+    (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+      void *args[argc];
+#else
+      void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
+#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
+          KMP_ARCH_AARCH64) */
+
+      KA_TRACE(20,
+               ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
+
+      __kmpc_serialized_parallel(loc, gtid);
+
+      if (call_context == fork_context_intel) {
+        /* TODO this sucks, use the compiler itself to pass args! :) */
+        master_th->th.th_serial_team->t.t_ident = loc;
+        if (!ap) {
+          // revert change made in __kmpc_serialized_parallel()
+          master_th->th.th_serial_team->t.t_level--;
+// Get args from parent team for teams construct
+
+#if OMPT_SUPPORT
+          void *dummy;
+          void **exit_runtime_p;
+          ompt_task_info_t *task_info;
+
+          ompt_lw_taskteam_t lw_taskteam;
+
+          if (ompt_enabled.enabled) {
+            __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
+                                    &ompt_parallel_data, return_address);
+
+            __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
+            // don't use lw_taskteam after linking. content was swapped
+
+            task_info = OMPT_CUR_TASK_INFO(master_th);
+            exit_runtime_p = &(task_info->frame.exit_frame.ptr);
+            if (ompt_enabled.ompt_callback_implicit_task) {
+              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+                  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
+                  &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
+              OMPT_CUR_TASK_INFO(master_th)
+                  ->thread_num = __kmp_tid_from_gtid(gtid);
+            }
+
+            /* OMPT state */
+            master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
+          } else {
+            exit_runtime_p = &dummy;
+          }
+#endif
+
+          {
+            KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
+            KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
+            __kmp_invoke_microtask(microtask, gtid, 0, argc,
+                                   parent_team->t.t_argv
+#if OMPT_SUPPORT
+                                   ,
+                                   exit_runtime_p
+#endif
+                                   );
+          }
+
+#if OMPT_SUPPORT
+          if (ompt_enabled.enabled) {
+            exit_runtime_p = NULL;
+            if (ompt_enabled.ompt_callback_implicit_task) {
+              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+                  ompt_scope_end, NULL, &(task_info->task_data), 1,
+                  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
+            }
+
+            __ompt_lw_taskteam_unlink(master_th);
+            if (ompt_enabled.ompt_callback_parallel_end) {
+              ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
+                  OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
+                  OMPT_INVOKER(call_context), return_address);
+            }
+            master_th->th.ompt_thread_info.state = ompt_state_overhead;
+          }
+#endif
+        } else if (microtask == (microtask_t)__kmp_teams_master) {
+          KMP_DEBUG_ASSERT(master_th->th.th_team ==
+                           master_th->th.th_serial_team);
+          team = master_th->th.th_team;
+          // team->t.t_pkfn = microtask;
+          team->t.t_invoke = invoker;
+          __kmp_alloc_argv_entries(argc, team, TRUE);
+          team->t.t_argc = argc;
+          argv = (void **)team->t.t_argv;
+          if (ap) {
+            for (i = argc - 1; i >= 0; --i)
+// TODO: revert workaround for Intel(R) 64 tracker #96
+#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
+              *argv++ = va_arg(*ap, void *);
+#else
+              *argv++ = va_arg(ap, void *);
+#endif
+          } else {
+            for (i = 0; i < argc; ++i)
+              // Get args from parent team for teams construct
+              argv[i] = parent_team->t.t_argv[i];
+          }
+          // AC: revert change made in __kmpc_serialized_parallel()
+          //     because initial code in teams should have level=0
+          team->t.t_level--;
+          // AC: call special invoker for outer "parallel" of teams construct
+          invoker(gtid);
+        } else {
+          argv = args;
+          for (i = argc - 1; i >= 0; --i)
+// TODO: revert workaround for Intel(R) 64 tracker #96
+#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
+            *argv++ = va_arg(*ap, void *);
+#else
+            *argv++ = va_arg(ap, void *);
+#endif
+          KMP_MB();
+
+#if OMPT_SUPPORT
+          void *dummy;
+          void **exit_runtime_p;
+          ompt_task_info_t *task_info;
+
+          ompt_lw_taskteam_t lw_taskteam;
+
+          if (ompt_enabled.enabled) {
+            __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
+                                    &ompt_parallel_data, return_address);
+            __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
+            // don't use lw_taskteam after linking. content was swapped
+            task_info = OMPT_CUR_TASK_INFO(master_th);
+            exit_runtime_p = &(task_info->frame.exit_frame.ptr);
+
+            /* OMPT implicit task begin */
+            implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
+            if (ompt_enabled.ompt_callback_implicit_task) {
+              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+                  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
+                  implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
+              OMPT_CUR_TASK_INFO(master_th)
+                  ->thread_num = __kmp_tid_from_gtid(gtid);
+            }
+
+            /* OMPT state */
+            master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
+          } else {
+            exit_runtime_p = &dummy;
+          }
+#endif
+
+          {
+            KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
+            KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
+            __kmp_invoke_microtask(microtask, gtid, 0, argc, args
+#if OMPT_SUPPORT
+                                   ,
+                                   exit_runtime_p
+#endif
+                                   );
+          }
+
+#if OMPT_SUPPORT
+          if (ompt_enabled.enabled) {
+            *exit_runtime_p = NULL;
+            if (ompt_enabled.ompt_callback_implicit_task) {
+              ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+                  ompt_scope_end, NULL, &(task_info->task_data), 1,
+                  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
+            }
+
+            ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
+            __ompt_lw_taskteam_unlink(master_th);
+            if (ompt_enabled.ompt_callback_parallel_end) {
+              ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
+                  &ompt_parallel_data, parent_task_data,
+                  OMPT_INVOKER(call_context), return_address);
+            }
+            master_th->th.ompt_thread_info.state = ompt_state_overhead;
+          }
+#endif
+        }
+      } else if (call_context == fork_context_gnu) {
+#if OMPT_SUPPORT
+        ompt_lw_taskteam_t lwt;
+        __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
+                                return_address);
+
+        lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
+        __ompt_lw_taskteam_link(&lwt, master_th, 1);
+// don't use lw_taskteam after linking. content was swapped
+#endif
+
+        // we were called from GNU native code
+        KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
+        return FALSE;
+      } else {
+        KMP_ASSERT2(call_context < fork_context_last,
+                    "__kmp_fork_call: unknown fork_context parameter");
+      }
+
+      KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
+      KMP_MB();
+      return FALSE;
+    } // if (nthreads == 1)
+
+    // GEH: only modify the executing flag in the case when not serialized
+    //      serialized case is handled in kmpc_serialized_parallel
+    KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
+                  "curtask=%p, curtask_max_aclevel=%d\n",
+                  parent_team->t.t_active_level, master_th,
+                  master_th->th.th_current_task,
+                  master_th->th.th_current_task->td_icvs.max_active_levels));
+    // TODO: GEH - cannot do this assertion because root thread not set up as
+    // executing
+    // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
+    master_th->th.th_current_task->td_flags.executing = 0;
+
+    if (!master_th->th.th_teams_microtask || level > teams_level) {
+      /* Increment our nested depth level */
+      KMP_ATOMIC_INC(&root->r.r_in_parallel);
+    }
+
+    // See if we need to make a copy of the ICVs.
+    int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
+    if ((level + 1 < __kmp_nested_nth.used) &&
+        (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
+      nthreads_icv = __kmp_nested_nth.nth[level + 1];
+    } else {
+      nthreads_icv = 0; // don't update
+    }
+
+    // Figure out the proc_bind_policy for the new team.
+    kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
+    kmp_proc_bind_t proc_bind_icv =
+        proc_bind_default; // proc_bind_default means don't update
+    if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
+      proc_bind = proc_bind_false;
+    } else {
+      if (proc_bind == proc_bind_default) {
+        // No proc_bind clause specified; use current proc-bind-var for this
+        // parallel region
+        proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
+      }
+      /* else: The proc_bind policy was specified explicitly on parallel clause.
+         This overrides proc-bind-var for this parallel region, but does not
+         change proc-bind-var. */
+      // Figure the value of proc-bind-var for the child threads.
+      if ((level + 1 < __kmp_nested_proc_bind.used) &&
+          (__kmp_nested_proc_bind.bind_types[level + 1] !=
+           master_th->th.th_current_task->td_icvs.proc_bind)) {
+        proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
+      }
+    }
+
+    // Reset for next parallel region
+    master_th->th.th_set_proc_bind = proc_bind_default;
+
+    if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
+      kmp_internal_control_t new_icvs;
+      copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
+      new_icvs.next = NULL;
+      if (nthreads_icv > 0) {
+        new_icvs.nproc = nthreads_icv;
+      }
+      if (proc_bind_icv != proc_bind_default) {
+        new_icvs.proc_bind = proc_bind_icv;
+      }
+
+      /* allocate a new parallel team */
+      KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
+      team = __kmp_allocate_team(root, nthreads, nthreads,
+#if OMPT_SUPPORT
+                                 ompt_parallel_data,
+#endif
+                                 proc_bind, &new_icvs,
+                                 argc USE_NESTED_HOT_ARG(master_th));
+    } else {
+      /* allocate a new parallel team */
+      KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
+      team = __kmp_allocate_team(root, nthreads, nthreads,
+#if OMPT_SUPPORT
+                                 ompt_parallel_data,
+#endif
+                                 proc_bind,
+                                 &master_th->th.th_current_task->td_icvs,
+                                 argc USE_NESTED_HOT_ARG(master_th));
+    }
+    KF_TRACE(
+        10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
+
+    /* setup the new team */
+    KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
+    KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
+    KMP_CHECK_UPDATE(team->t.t_ident, loc);
+    KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
+    KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
+#if OMPT_SUPPORT
+    KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
+                          return_address);
+#endif
+    KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
+    // TODO: parent_team->t.t_level == INT_MAX ???
+    if (!master_th->th.th_teams_microtask || level > teams_level) {
+      int new_level = parent_team->t.t_level + 1;
+      KMP_CHECK_UPDATE(team->t.t_level, new_level);
+      new_level = parent_team->t.t_active_level + 1;
+      KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
+    } else {
+      // AC: Do not increase parallel level at start of the teams construct
+      int new_level = parent_team->t.t_level;
+      KMP_CHECK_UPDATE(team->t.t_level, new_level);
+      new_level = parent_team->t.t_active_level;
+      KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
+    }
+    kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
+    // set master's schedule as new run-time schedule
+    KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
+
+    KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
+    KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
+
+    // Update the floating point rounding in the team if required.
+    propagateFPControl(team);
+
+    if (__kmp_tasking_mode != tskm_immediate_exec) {
+      // Set master's task team to team's task team. Unless this is hot team, it
+      // should be NULL.
+      KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
+                       parent_team->t.t_task_team[master_th->th.th_task_state]);
+      KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
+                    "%p, new task_team %p / team %p\n",
+                    __kmp_gtid_from_thread(master_th),
+                    master_th->th.th_task_team, parent_team,
+                    team->t.t_task_team[master_th->th.th_task_state], team));
+
+      if (active_level || master_th->th.th_task_team) {
+        // Take a memo of master's task_state
+        KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
+        if (master_th->th.th_task_state_top >=
+            master_th->th.th_task_state_stack_sz) { // increase size
+          kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
+          kmp_uint8 *old_stack, *new_stack;
+          kmp_uint32 i;
+          new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
+          for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
+            new_stack[i] = master_th->th.th_task_state_memo_stack[i];
+          }
+          for (i = master_th->th.th_task_state_stack_sz; i < new_size;
+               ++i) { // zero-init rest of stack
+            new_stack[i] = 0;
+          }
+          old_stack = master_th->th.th_task_state_memo_stack;
+          master_th->th.th_task_state_memo_stack = new_stack;
+          master_th->th.th_task_state_stack_sz = new_size;
+          __kmp_free(old_stack);
+        }
+        // Store master's task_state on stack
+        master_th->th
+            .th_task_state_memo_stack[master_th->th.th_task_state_top] =
+            master_th->th.th_task_state;
+        master_th->th.th_task_state_top++;
+#if KMP_NESTED_HOT_TEAMS
+        if (master_th->th.th_hot_teams &&
+            active_level < __kmp_hot_teams_max_level &&
+            team == master_th->th.th_hot_teams[active_level].hot_team) {
+          // Restore master's nested state if nested hot team
+          master_th->th.th_task_state =
+              master_th->th
+                  .th_task_state_memo_stack[master_th->th.th_task_state_top];
+        } else {
+#endif
+          master_th->th.th_task_state = 0;
+#if KMP_NESTED_HOT_TEAMS
+        }
+#endif
+      }
+#if !KMP_NESTED_HOT_TEAMS
+      KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
+                       (team == root->r.r_hot_team));
+#endif
+    }
+
+    KA_TRACE(
+        20,
+        ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
+         gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
+         team->t.t_nproc));
+    KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
+                     (team->t.t_master_tid == 0 &&
+                      (team->t.t_parent == root->r.r_root_team ||
+                       team->t.t_parent->t.t_serialized)));
+    KMP_MB();
+
+    /* now, setup the arguments */
+    argv = (void **)team->t.t_argv;
+    if (ap) {
+      for (i = argc - 1; i >= 0; --i) {
+// TODO: revert workaround for Intel(R) 64 tracker #96
+#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
+        void *new_argv = va_arg(*ap, void *);
+#else
+        void *new_argv = va_arg(ap, void *);
+#endif
+        KMP_CHECK_UPDATE(*argv, new_argv);
+        argv++;
+      }
+    } else {
+      for (i = 0; i < argc; ++i) {
+        // Get args from parent team for teams construct
+        KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
+      }
+    }
+
+    /* now actually fork the threads */
+    KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
+    if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
+      root->r.r_active = TRUE;
+
+    __kmp_fork_team_threads(root, team, master_th, gtid);
+    __kmp_setup_icv_copy(team, nthreads,
+                         &master_th->th.th_current_task->td_icvs, loc);
+
+#if OMPT_SUPPORT
+    master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
+#endif
+
+    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
+
+#if USE_ITT_BUILD
+    if (team->t.t_active_level == 1 // only report frames at level 1
+        && !master_th->th.th_teams_microtask) { // not in teams construct
+#if USE_ITT_NOTIFY
+      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
+          (__kmp_forkjoin_frames_mode == 3 ||
+           __kmp_forkjoin_frames_mode == 1)) {
+        kmp_uint64 tmp_time = 0;
+        if (__itt_get_timestamp_ptr)
+          tmp_time = __itt_get_timestamp();
+        // Internal fork - report frame begin
+        master_th->th.th_frame_time = tmp_time;
+        if (__kmp_forkjoin_frames_mode == 3)
+          team->t.t_region_time = tmp_time;
+      } else
+// only one notification scheme (either "submit" or "forking/joined", not both)
+#endif /* USE_ITT_NOTIFY */
+          if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
+              __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
+        // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
+        __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
+      }
+    }
+#endif /* USE_ITT_BUILD */
+
+    /* now go on and do the work */
+    KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
+    KMP_MB();
+    KF_TRACE(10,
+             ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
+              root, team, master_th, gtid));
+
+#if USE_ITT_BUILD
+    if (__itt_stack_caller_create_ptr) {
+      team->t.t_stack_id =
+          __kmp_itt_stack_caller_create(); // create new stack stitching id
+      // before entering fork barrier
+    }
+#endif /* USE_ITT_BUILD */
+
+    // AC: skip __kmp_internal_fork at teams construct, let only master
+    // threads execute
+    if (ap) {
+      __kmp_internal_fork(loc, gtid, team);
+      KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
+                    "master_th=%p, gtid=%d\n",
+                    root, team, master_th, gtid));
+    }
+
+    if (call_context == fork_context_gnu) {
+      KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
+      return TRUE;
+    }
+
+    /* Invoke microtask for MASTER thread */
+    KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
+                  team->t.t_id, team->t.t_pkfn));
+  } // END of timer KMP_fork_call block
+
+#if KMP_STATS_ENABLED
+  // If beginning a teams construct, then change thread state
+  stats_state_e previous_state = KMP_GET_THREAD_STATE();
+  if (!ap) {
+    KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
+  }
+#endif
+
+  if (!team->t.t_invoke(gtid)) {
+    KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
+  }
+
+#if KMP_STATS_ENABLED
+  // If was beginning of a teams construct, then reset thread state
+  if (!ap) {
+    KMP_SET_THREAD_STATE(previous_state);
+  }
+#endif
+
+  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
+                team->t.t_id, team->t.t_pkfn));
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
+
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+    master_th->th.ompt_thread_info.state = ompt_state_overhead;
+  }
+#endif
+
+  return TRUE;
+}
+
+#if OMPT_SUPPORT
+// Put the OMPT state of `thread` back to the value that applies outside the
+// region being left: serial work if `team` is serialized, parallel work
+// otherwise.
+static inline void __kmp_join_restore_state(kmp_info_t *thread,
+                                            kmp_team_t *team) {
+  if (team->t.t_serialized) {
+    thread->th.ompt_thread_info.state = ompt_state_work_serial;
+  } else {
+    thread->th.ompt_thread_info.state = ompt_state_work_parallel;
+  }
+}
+
+// OMPT bookkeeping performed when a parallel region is joined: invoke the
+// parallel-end callback if one is registered, clear the enter frame of the
+// task info object returned by __ompt_get_task_info_object(0), and restore
+// the thread's OMPT state for `team`.
+static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
+                                   kmp_team_t *team, ompt_data_t *parallel_data,
+                                   fork_context_e fork_context, void *codeptr) {
+  ompt_task_info_t *cur_task = __ompt_get_task_info_object(0);
+
+  if (ompt_enabled.ompt_callback_parallel_end) {
+    ompt_data_t *task_data = &(cur_task->task_data);
+    ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
+        parallel_data, task_data, OMPT_INVOKER(fork_context), codeptr);
+  }
+
+  cur_task->frame.enter_frame = ompt_data_none;
+  __kmp_join_restore_state(thread, team);
+}
+#endif
+/* Master-thread side of leaving a parallel region.
+   loc is stored as the thread's th_ident and passed on to the callees; gtid
+   selects the master thread in __kmp_threads; fork_context (OMPT builds only)
+   is forwarded to __kmp_join_ompt; a nonzero exit_teams skips
+   __kmp_internal_join and resets the master's th_task_state instead.
+   Three paths are visible below: a serialized team is handed to
+   __kmpc_end_serialized_parallel; a parallel nested directly in a teams
+   construct keeps its team and only has its levels and thread count adjusted;
+   otherwise the team is released with __kmp_free_team and the master thread
+   is pointed back at the parent team while __kmp_forkjoin_lock is held. */
+void __kmp_join_call(ident_t *loc, int gtid
+#if OMPT_SUPPORT
+                     ,
+                     enum fork_context_e fork_context
+#endif
+                     ,
+                     int exit_teams) {
+  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
+  kmp_team_t *team;
+  kmp_team_t *parent_team;
+  kmp_info_t *master_th;
+  kmp_root_t *root;
+  int master_active; // copy of team's t_master_active, written to r_active below
+
+  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
+
+  /* setup current data */
+  master_th = __kmp_threads[gtid];
+  root = master_th->th.th_root;
+  team = master_th->th.th_team;
+  parent_team = team->t.t_parent;
+
+  master_th->th.th_ident = loc;
+
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+    master_th->th.ompt_thread_info.state = ompt_state_overhead;
+  }
+#endif
+
+#if KMP_DEBUG
+  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
+    KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
+                  "th_task_team = %p\n",
+                  __kmp_gtid_from_thread(master_th), team,
+                  team->t.t_task_team[master_th->th.th_task_state],
+                  master_th->th.th_task_team));
+    KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
+                     team->t.t_task_team[master_th->th.th_task_state]);
+  }
+#endif
+
+  if (team->t.t_serialized) { // serialized path: ends with an early return
+    if (master_th->th.th_teams_microtask) {
+      // We are in teams construct
+      int level = team->t.t_level;
+      int tlevel = master_th->th.th_teams_level;
+      if (level == tlevel) {
+        // AC: we haven't incremented it earlier at start of teams construct,
+        //     so do it here - at the end of teams construct
+        team->t.t_level++;
+      } else if (level == tlevel + 1) {
+        // AC: we are exiting parallel inside teams, need to increment
+        // serialization in order to restore it in the next call to
+        // __kmpc_end_serialized_parallel
+        team->t.t_serialized++;
+      }
+    }
+    __kmpc_end_serialized_parallel(loc, gtid);
+
+#if OMPT_SUPPORT
+    if (ompt_enabled.enabled) {
+      __kmp_join_restore_state(master_th, parent_team);
+    }
+#endif
+
+    return;
+  }
+
+  master_active = team->t.t_master_active;
+
+  if (!exit_teams) {
+    // AC: No barrier for internal teams at exit from teams construct.
+    //     But there is barrier for external team (league).
+    __kmp_internal_join(loc, gtid, team);
+  } else {
+    master_th->th.th_task_state =
+        0; // AC: no tasking in teams (out of any parallel)
+  }
+
+  KMP_MB();
+
+#if OMPT_SUPPORT
+  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
+  void *codeptr = team->t.ompt_team_info.master_return_address;
+#endif
+
+#if USE_ITT_BUILD
+  if (__itt_stack_caller_create_ptr) {
+    __kmp_itt_stack_caller_destroy(
+        (__itt_caller)team->t
+            .t_stack_id); // destroy the stack stitching id after join barrier
+  }
+
+  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
+  if (team->t.t_active_level == 1 &&
+      !master_th->th.th_teams_microtask) { /* not in teams construct */
+    master_th->th.th_ident = loc;
+    // only one notification scheme (either "submit" or "forking/joined", not
+    // both)
+    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
+        __kmp_forkjoin_frames_mode == 3)
+      __kmp_itt_frame_submit(gtid, team->t.t_region_time,
+                             master_th->th.th_frame_time, 0, loc,
+                             master_th->th.th_team_nproc, 1);
+    else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
+             !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
+      __kmp_itt_region_joined(gtid);
+  } // active_level == 1
+#endif /* USE_ITT_BUILD */
+
+  if (master_th->th.th_teams_microtask && !exit_teams &&
+      team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
+      team->t.t_level == master_th->th.th_teams_level + 1) {
+    // AC: We need to leave the team structure intact at the end of parallel
+    // inside the teams construct, so that at the next parallel same (hot) team
+    // works, only adjust nesting levels
+
+    /* Decrement our nested depth level */
+    team->t.t_level--;
+    team->t.t_active_level--;
+    KMP_ATOMIC_DEC(&root->r.r_in_parallel);
+
+    // Restore number of threads in the team if needed. This code relies on
+    // the proper adjustment of th_teams_size.nth after the fork in
+    // __kmp_teams_master on each teams master in the case that
+    // __kmp_reserve_threads reduced it.
+    if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
+      int old_num = master_th->th.th_team_nproc;
+      int new_num = master_th->th.th_teams_size.nth;
+      kmp_info_t **other_threads = team->t.t_threads;
+      team->t.t_nproc = new_num;
+      for (int i = 0; i < old_num; ++i) {
+        other_threads[i]->th.th_team_nproc = new_num;
+      }
+      // Adjust states of non-used threads of the team
+      for (int i = old_num; i < new_num; ++i) {
+        // Re-initialize thread's barrier data.
+        KMP_DEBUG_ASSERT(other_threads[i]);
+        kmp_balign_t *balign = other_threads[i]->th.th_bar;
+        for (int b = 0; b < bs_last_barrier; ++b) {
+          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
+          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
+#if USE_DEBUGGER
+          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
+#endif
+        }
+        if (__kmp_tasking_mode != tskm_immediate_exec) {
+          // Synchronize thread's task state
+          other_threads[i]->th.th_task_state = master_th->th.th_task_state;
+        }
+      }
+    }
+
+#if OMPT_SUPPORT
+    if (ompt_enabled.enabled) {
+      __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
+                      codeptr);
+    }
+#endif
+
+    return;
+  }
+
+  /* do cleanup and restore the parent team */
+  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
+  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
+
+  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
+
+  /* jc: The following lock has instructions with REL and ACQ semantics,
+     separating the parallel user code called in this parallel region
+     from the serial user code called after this function returns. */
+  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
+
+  if (!master_th->th.th_teams_microtask ||
+      team->t.t_level > master_th->th.th_teams_level) {
+    /* Decrement our nested depth level */
+    KMP_ATOMIC_DEC(&root->r.r_in_parallel);
+  }
+  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
+
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
+    if (ompt_enabled.ompt_callback_implicit_task) {
+      int ompt_team_size = team->t.t_nproc;
+      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+          ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
+          OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
+    }
+
+    task_info->frame.exit_frame = ompt_data_none;
+    task_info->task_data = ompt_data_none;
+  }
+#endif
+
+  // NOTE(review): the trace below passes a literal 0 for T#%d, not gtid.
+  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
+                master_th, team));
+  __kmp_pop_current_task_from_thread(master_th);
+
+#if KMP_AFFINITY_SUPPORTED
+  // Restore master thread's partition.
+  master_th->th.th_first_place = team->t.t_first_place;
+  master_th->th.th_last_place = team->t.t_last_place;
+#endif // KMP_AFFINITY_SUPPORTED
+  master_th->th.th_def_allocator = team->t.t_def_allocator;
+
+  updateHWFPControl(team);
+
+  if (root->r.r_active != master_active)
+    root->r.r_active = master_active;
+
+  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
+                            master_th)); // this will free worker threads
+
+  /* this race was fun to find. make sure the following is in the critical
+     region otherwise assertions may fail occasionally since the old team may be
+     reallocated and the hierarchy appears inconsistent. it is actually safe to
+     run and won't cause any bugs, but will cause those assertion failures. it's
+     only one deref&assign so might as well put this in the critical region */
+  master_th->th.th_team = parent_team;
+  master_th->th.th_team_nproc = parent_team->t.t_nproc;
+  master_th->th.th_team_master = parent_team->t.t_threads[0];
+  master_th->th.th_team_serialized = parent_team->t.t_serialized;
+
+  /* restore serialized team, if need be */
+  if (parent_team->t.t_serialized &&
+      parent_team != master_th->th.th_serial_team &&
+      parent_team != root->r.r_root_team) {
+    __kmp_free_team(root,
+                    master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
+    master_th->th.th_serial_team = parent_team;
+  }
+
+  if (__kmp_tasking_mode != tskm_immediate_exec) {
+    if (master_th->th.th_task_state_top >
+        0) { // Restore task state from memo stack
+      KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
+      /* Remember master's state if we re-use this nested hot team.
+         NOTE(review): the slot written is at index th_task_state_top, i.e. one
+         above the entry popped below; confirm that index is always inside
+         the allocated memo stack. */
+      master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
+          master_th->th.th_task_state;
+      --master_th->th.th_task_state_top; // pop
+      // Now restore state at this level
+      master_th->th.th_task_state =
+          master_th->th
+              .th_task_state_memo_stack[master_th->th.th_task_state_top];
+    }
+    // Copy the task team from the parent team to the master thread
+    master_th->th.th_task_team =
+        parent_team->t.t_task_team[master_th->th.th_task_state];
+    KA_TRACE(20,
+             ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
+              __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
+              parent_team));
+  }
+
+  // TODO: GEH - cannot do this assertion because root thread not set up as
+  // executing
+  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
+  master_th->th.th_current_task->td_flags.executing = 1;
+
+  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
+
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+    __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
+                    codeptr);
+  }
+#endif
+
+  KMP_MB();
+  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
+}
+
+/* Push a copy of the calling thread's current ICVs onto its serial team's
+   control stack, but only when the thread is running on its serial team at a
+   serialized nesting depth greater than one and the top record (if any) does
+   not already belong to that depth.  */
+void __kmp_save_internal_controls(kmp_info_t *thread) {
+  kmp_team_t *cur_team = thread->th.th_team;
+
+  // Nothing to do unless the thread is currently running on its serial team.
+  if (cur_team != thread->th.th_serial_team)
+    return;
+
+  // Nothing to do at a serialized nesting depth of one or less.
+  if (cur_team->t.t_serialized <= 1)
+    return;
+
+  // A record for the current nesting depth is already on top of the stack.
+  kmp_internal_control_t *top = cur_team->t.t_control_stack_top;
+  if (top != NULL && top->serial_nesting_level == cur_team->t.t_serialized)
+    return;
+
+  /* push a record on the serial team's stack */
+  kmp_internal_control_t *record = (kmp_internal_control_t *)__kmp_allocate(
+      sizeof(kmp_internal_control_t));
+  copy_icvs(record, &thread->th.th_current_task->td_icvs);
+  record->serial_nesting_level = cur_team->t.t_serialized;
+  record->next = top;
+  cur_team->t.t_control_stack_top = record;
+}
+
+/* Changes set_nproc: stores new_nth, clamped below to 1 and above to
+   __kmp_max_nth, as the nproc ICV of the thread identified by gtid. If the
+   root's hot team is larger than the new value and the conditions checked
+   below hold, the hot team is shrunk immediately. */
+void __kmp_set_num_threads(int new_nth, int gtid) {
+  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+
+  // Clamp the request; the lower bound is checked first.
+  new_nth = (new_nth < 1)
+                ? 1
+                : ((new_nth > __kmp_max_nth) ? __kmp_max_nth : new_nth);
+
+  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
+  kmp_info_t *thread = __kmp_threads[gtid];
+  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
+    return; // nothing to do
+
+  __kmp_save_internal_controls(thread);
+
+  set__nproc(thread, new_nth);
+
+  // If this omp_set_num_threads() call will cause the hot team size to be
+  // reduced (in the absence of a num_threads clause), then reduce it now,
+  // rather than waiting for the next parallel region.
+  kmp_root_t *root = thread->th.th_root;
+  bool shrink_hot_team = __kmp_init_parallel && (!root->r.r_active) &&
+                         (root->r.r_hot_team->t.t_nproc > new_nth)
+#if KMP_NESTED_HOT_TEAMS
+                         && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
+#endif
+      ;
+  if (!shrink_hot_team)
+    return;
+
+  kmp_team_t *hot_team = root->r.r_hot_team;
+  int tid;
+
+  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
+
+  // Release the extra threads we don't need any more.
+  for (tid = new_nth; tid < hot_team->t.t_nproc; tid++) {
+    kmp_info_t *extra = hot_team->t.t_threads[tid];
+    KMP_DEBUG_ASSERT(extra != NULL);
+    if (__kmp_tasking_mode != tskm_immediate_exec) {
+      // When decreasing team size, threads no longer in the team should unref
+      // task team.
+      extra->th.th_task_team = NULL;
+    }
+    __kmp_free_thread(extra);
+    hot_team->t.t_threads[tid] = NULL;
+  }
+  hot_team->t.t_nproc = new_nth;
+#if KMP_NESTED_HOT_TEAMS
+  if (thread->th.th_hot_teams) {
+    KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
+    thread->th.th_hot_teams[0].hot_team_nth = new_nth;
+  }
+#endif
+
+  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
+
+  // Update the t_nproc field in the threads that are still active.
+  for (tid = 0; tid < new_nth; tid++) {
+    kmp_info_t *member = hot_team->t.t_threads[tid];
+    KMP_DEBUG_ASSERT(member != NULL);
+    member->th.th_team_nproc = new_nth;
+  }
+  // Special flag in case omp_set_num_threads() call
+  hot_team->t.t_size_changed = -1;
+}
+
+/* Changes max_active_levels of the thread identified by gtid. A negative
+   request is ignored with a warning; a request above
+   KMP_MAX_ACTIVE_LEVELS_LIMIT is replaced by that limit with a warning. */
+void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
+  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
+                "%d = (%d)\n",
+                gtid, max_active_levels));
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+
+  // validate max_active_levels
+  if (max_active_levels < 0) {
+    // A negative value is rejected: the current setting is left untouched and
+    // a warning is issued (if warnings are allowed as controlled by the
+    // KMP_WARNINGS env var).
+    KMP_WARNING(ActiveLevelsNegative, max_active_levels);
+    KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
+                  "max_active_levels for thread %d = (%d)\n",
+                  gtid, max_active_levels));
+    return;
+  }
+
+  // Anything in [ 0; KMP_MAX_ACTIVE_LEVELS_LIMIT ] is taken as is; zero is
+  // allowed (implementation defined behavior).
+  if (max_active_levels > KMP_MAX_ACTIVE_LEVELS_LIMIT) {
+    // Too large: correct the input to the upper limit (implementation defined
+    // behavior). Current upper limit is MAX_INT, so the flow should never get
+    // here until the limit is changed.
+    KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
+                KMP_MAX_ACTIVE_LEVELS_LIMIT);
+    max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
+  }
+  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
+                "max_active_levels for thread %d = (%d)\n",
+                gtid, max_active_levels));
+
+  kmp_info_t *thread = __kmp_threads[gtid];
+  __kmp_save_internal_controls(thread);
+  set__max_active_levels(thread, max_active_levels);
+}
+
+/* Gets max_active_levels: returns the value stored in the ICVs of the current
+   task of the thread identified by gtid. */
+int __kmp_get_max_active_levels(int gtid) {
+  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+
+  kmp_info_t *thread = __kmp_threads[gtid];
+  KMP_DEBUG_ASSERT(thread->th.th_current_task);
+
+  int max_levels = thread->th.th_current_task->td_icvs.max_active_levels;
+  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
+                "curtask_maxaclevel=%d\n",
+                gtid, thread->th.th_current_task, max_levels));
+  return max_levels;
+}
+
+KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int)); // build-time check: kmp_sched_t is int-sized
+KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int)); // build-time check: sched_type is int-sized
+
+/* Changes def_sched_var ICV values (run-time schedule kind and chunk) of the
+   thread identified by gtid. A kind outside the valid ranges is replaced, with
+   a warning, by kmp_sched_default and its chunk is ignored. */
+void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
+  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
+                gtid, (int)kind, chunk));
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+
+  // Keep the value as passed in; it is handed to
+  // __kmp_sched_apply_mods_intkind() once the base kind has been mapped.
+  kmp_sched_t requested_kind = kind;
+  kind = __kmp_sched_without_mods(kind);
+
+  // A valid kind lies strictly inside one of two intervals:
+  //   standard: (kmp_sched_lower, kmp_sched_upper_std)
+  //   extended: (kmp_sched_lower_ext, kmp_sched_upper)
+  bool outside_all = kind <= kmp_sched_lower || kind >= kmp_sched_upper;
+  bool between_intervals =
+      kind >= kmp_sched_upper_std && kind <= kmp_sched_lower_ext;
+  if (outside_all || between_intervals) {
+    // TODO: Hint needs attention in case we change the default schedule.
+    __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
+              KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
+              __kmp_msg_null);
+    kind = kmp_sched_default;
+    chunk = 0; // ignore chunk value in case of bad kind
+  }
+
+  kmp_info_t *thread = __kmp_threads[gtid];
+  __kmp_save_internal_controls(thread);
+
+  // ICV slot that receives the new schedule.
+  kmp_r_sched_t *icv_sched = &thread->th.th_current_task->td_icvs.sched;
+
+  if (kind >= kmp_sched_upper_std) {
+    // Extended kind: its entries follow the standard ones in __kmp_sch_map.
+    icv_sched->r_sched_type =
+        __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
+                      kmp_sched_lower - 2];
+  } else if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
+    // differ static chunked vs. unchunked:  chunk should be invalid to
+    // indicate unchunked schedule (which is the default)
+    icv_sched->r_sched_type = kmp_sch_static;
+  } else {
+    icv_sched->r_sched_type = __kmp_sch_map[kind - kmp_sched_lower - 1];
+  }
+  __kmp_sched_apply_mods_intkind(requested_kind, &(icv_sched->r_sched_type));
+
+  // ignore parameter chunk for schedule auto, and any chunk below 1
+  icv_sched->chunk =
+      (kind == kmp_sched_auto || chunk < 1) ? KMP_DEFAULT_CHUNK : chunk;
+}
+
+/* Gets def_sched_var ICV values of the thread identified by gtid: *kind
+   receives the schedule kind (with modifiers applied by
+   __kmp_sched_apply_mods_stdkind) and *chunk the chunk size, or 0 for the
+   unchunked static schedules. */
+void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
+  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+
+  kmp_info_t *thread = __kmp_threads[gtid];
+  enum sched_type th_type =
+      thread->th.th_current_task->td_icvs.sched.r_sched_type;
+
+  // Cleared for the static schedules that carry no chunk.
+  bool has_chunk = true;
+
+  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
+  case kmp_sch_static:
+  case kmp_sch_static_greedy:
+  case kmp_sch_static_balanced:
+    *kind = kmp_sched_static;
+    has_chunk = false;
+    break;
+  case kmp_sch_static_chunked:
+    *kind = kmp_sched_static;
+    break;
+  case kmp_sch_dynamic_chunked:
+    *kind = kmp_sched_dynamic;
+    break;
+  case kmp_sch_guided_chunked:
+  case kmp_sch_guided_iterative_chunked:
+  case kmp_sch_guided_analytical_chunked:
+    *kind = kmp_sched_guided;
+    break;
+  case kmp_sch_auto:
+    *kind = kmp_sched_auto;
+    break;
+  case kmp_sch_trapezoidal:
+    *kind = kmp_sched_trapezoidal;
+    break;
+#if KMP_STATIC_STEAL_ENABLED
+  case kmp_sch_static_steal:
+    *kind = kmp_sched_static_steal;
+    break;
+#endif
+  default:
+    KMP_FATAL(UnknownSchedulingType, th_type);
+  }
+
+  __kmp_sched_apply_mods_stdkind(kind, th_type);
+  if (has_chunk) {
+    *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
+  } else {
+    *chunk = 0; // chunk was not set, try to show this fact via zero value
+  }
+}
+
+/* Walks from the current team of the thread identified by gtid up the chain
+   of parent teams to nesting level `level` and returns a thread number for
+   that level: the caller's own tid when `level` is its current level, else 0
+   or the t_master_tid of the team reached. Returns 0 for level 0 and -1 when
+   `level` is negative or greater than the current team's level. */
+int __kmp_get_ancestor_thread_num(int gtid, int level) {
+  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+
+  // validate level
+  if (level < 0)
+    return -1;
+  if (level == 0)
+    return 0;
+
+  kmp_info_t *thr = __kmp_threads[gtid];
+  kmp_team_t *team = thr->th.th_team;
+  int cur_level = team->t.t_level;
+  if (cur_level < level)
+    return -1;
+
+  if (thr->th.th_teams_microtask) {
+    // AC: we are in teams region where multiple nested teams have same level
+    int tlevel = thr->th.th_teams_level; // the level of the teams construct
+    // For levels above tlevel the usual algorithm works (will not touch the
+    // teams).
+    if (level <= tlevel) {
+      KMP_DEBUG_ASSERT(cur_level >= tlevel);
+      // AC: As we need to pass by the teams league, we need to artificially
+      // increase the level counter: by 2 when three teams share the level,
+      // by 1 when two teams do.
+      cur_level += (cur_level == tlevel) ? 2 : 1;
+    }
+  }
+
+  if (cur_level == level)
+    return __kmp_tid_from_gtid(gtid);
+
+  // The walk stops one level above the requested one.
+  int stop_level = level + 1;
+  int ser = team->t.t_serialized;
+  while (cur_level > stop_level) {
+    // Count down the serialized levels held by this team.
+    ser = team->t.t_serialized;
+    while ((ser > 0) && (cur_level > stop_level)) {
+      --ser;
+      --cur_level;
+    }
+    if ((team->t.t_serialized) && (ser == 0)) {
+      team = team->t.t_parent;
+      continue;
+    }
+    if (cur_level > stop_level) {
+      team = team->t.t_parent;
+      ser = team->t.t_serialized;
+      --cur_level;
+    }
+  }
+
+  return (ser > 1) ? 0 : team->t.t_master_tid;
+}
+
+// Returns the number of threads (t_nproc) of the team that the thread `gtid`
+// belongs to at nesting level `level`. Level 0 always yields 1; a negative
+// level or a level deeper than the thread's current team yields -1.
+int __kmp_get_team_size(int gtid, int level) {
+  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+
+  // Reject or short-circuit the trivial levels first.
+  if (level == 0)
+    return 1;
+  if (level < 0)
+    return -1;
+
+  kmp_info_t *self = __kmp_threads[gtid];
+  kmp_team_t *cur_team = self->th.th_team;
+  int cur_level = cur_team->t.t_level;
+  if (level > cur_level)
+    return -1;
+
+  if (self->th.th_teams_microtask) {
+    // Inside a teams region several nested teams share one level, so the
+    // walk below needs extra steps to get past the teams league.
+    int teams_level = self->th.th_teams_level;
+    if (level <= teams_level) {
+      KMP_DEBUG_ASSERT(cur_level >= teams_level);
+      // Three teams share the level when we sit exactly on it, two otherwise.
+      cur_level += (cur_level == teams_level) ? 2 : 1;
+    }
+  }
+
+  while (cur_level > level) {
+    // Burn through the serialized levels recorded in the current team.
+    int serial_left = cur_team->t.t_serialized;
+    while ((serial_left > 0) && (cur_level > level)) {
+      --serial_left;
+      --cur_level;
+    }
+    if (cur_team->t.t_serialized && (serial_left == 0)) {
+      // Serialized levels exhausted: carry on from the parent team.
+      cur_team = cur_team->t.t_parent;
+      continue;
+    }
+    if (cur_level > level) {
+      // Ordinary step up to the parent team.
+      cur_team = cur_team->t.t_parent;
+      --cur_level;
+    }
+  }
+
+  return cur_team->t.t_nproc;
+}
+
+// Builds the current run-time schedule from the four globals __kmp_sched,
+// __kmp_chunk, __kmp_static and __kmp_guided. The pairs (__kmp_sched,
+// __kmp_chunk) and (__kmp_static, __kmp_guided) may be changed independently
+// by kmp_set_defaults, so the combined value is computed on demand here.
+kmp_r_sched_t __kmp_get_schedule_global() {
+  kmp_r_sched_t result;
+
+  // __kmp_sched itself is left untouched so that the user can set
+  // KMP_SCHEDULE multiple times and get different run-time schedules in
+  // different roots (even in OMP 2.5).
+  enum sched_type base_kind = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
+  enum sched_type modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
+
+  switch (base_kind) {
+  case kmp_sch_static:
+    // STATIC is refined into the detailed variant (balanced or greedy).
+    result.r_sched_type = __kmp_static;
+    break;
+  case kmp_sch_guided_chunked:
+    // GUIDED is refined into the detailed variant (iterative or analytical).
+    result.r_sched_type = __kmp_guided;
+    break;
+  default:
+    // (STATIC_CHUNKED), (DYNAMIC_CHUNKED) or anything else is used as is.
+    result.r_sched_type = __kmp_sched;
+    break;
+  }
+  SCHEDULE_SET_MODIFIERS(result.r_sched_type, modifiers);
+
+  // __kmp_chunk may hold a wrong value if it was never set; fall back to the
+  // default chunk in that case.
+  result.chunk =
+      (__kmp_chunk < KMP_DEFAULT_CHUNK) ? KMP_DEFAULT_CHUNK : __kmp_chunk;
+
+  return result;
+}
+
+/* Make sure the requested team has room for at least argc *t_argv entries.
+   With realloc == FALSE the array is always set up from scratch; with
+   realloc == TRUE it is replaced only when argc exceeds t_max_argc. */
+static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
+  KMP_DEBUG_ASSERT(team);
+
+  /* reallocation requested but the current array is already big enough */
+  if (realloc && argc <= team->t.t_max_argc)
+    return;
+
+  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
+                 "current entries=%d\n",
+                 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
+
+  /* release a previous heap allocation; the inline buffer is never freed */
+  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
+    __kmp_free((void *)team->t.t_argv);
+
+  if (argc > KMP_INLINE_ARGV_ENTRIES) {
+    /* too many arguments for the inline buffer: take them from the heap */
+    if (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) {
+      team->t.t_max_argc = KMP_MIN_MALLOC_ARGV_ENTRIES;
+    } else {
+      team->t.t_max_argc = 2 * argc;
+    }
+    KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
+                   "argv entries\n",
+                   team->t.t_id, team->t.t_max_argc));
+    team->t.t_argv =
+        (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
+    if (__kmp_storage_map) {
+      __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
+                                   &team->t.t_argv[team->t.t_max_argc],
+                                   sizeof(void *) * team->t.t_max_argc,
+                                   "team_%d.t_argv", team->t.t_id);
+    }
+    return;
+  }
+
+  /* small argument list: use the unused space in the cache line */
+  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
+  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
+                 "argv entries\n",
+                 team->t.t_id, team->t.t_max_argc));
+  team->t.t_argv = &team->t.t_inline_argv[0];
+  if (__kmp_storage_map) {
+    __kmp_print_storage_map_gtid(
+        -1, &team->t.t_inline_argv[0],
+        &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
+        (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
+        team->t.t_id);
+  }
+}
+
+/* Allocate the per-team arrays (threads, dispatch buffers, per-thread
+   dispatch structures, implicit task data) for up to max_nth threads, record
+   max_nth as the team's t_max_nproc and number the dispatch buffers. */
+static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
+  /* a team limited to one thread gets two dispatch buffers; larger teams get
+     __kmp_dispatch_num_buffers of them */
+  int buffer_count = 2;
+  if (max_nth > 1)
+    buffer_count = __kmp_dispatch_num_buffers;
+
+  team->t.t_threads =
+      (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
+  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
+      sizeof(dispatch_shared_info_t) * buffer_count);
+  team->t.t_dispatch =
+      (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
+  team->t.t_implicit_task_taskdata =
+      (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
+  team->t.t_max_nproc = max_nth;
+
+  /* give every dispatch buffer its own index */
+  for (int idx = 0; idx < buffer_count; ++idx) {
+    dispatch_shared_info_t *buf = &team->t.t_disp_buffer[idx];
+    buf->buffer_index = idx;
+    buf->doacross_buf_idx = idx;
+  }
+}
+
+/* Release the arrays owned by a team and reset the pointers to them.
+   Note: the threads referenced from t_threads are not freed here
+   (__kmp_free_threads). */
+static void __kmp_free_team_arrays(kmp_team_t *team) {
+  /* drop the per-thread dispatch buffers that are still attached */
+  const int slots = team->t.t_max_nproc;
+  for (int idx = 0; idx < slots; ++idx) {
+    kmp_disp_t *disp = &team->t.t_dispatch[idx];
+    if (disp->th_disp_buffer == NULL)
+      continue;
+    __kmp_free(disp->th_disp_buffer);
+    disp->th_disp_buffer = NULL;
+  }
+#if KMP_USE_HIER_SCHED
+  __kmp_dispatch_free_hierarchies(team);
+#endif
+
+  /* release the arrays themselves ... */
+  __kmp_free(team->t.t_threads);
+  __kmp_free(team->t.t_disp_buffer);
+  __kmp_free(team->t.t_dispatch);
+  __kmp_free(team->t.t_implicit_task_taskdata);
+
+  /* ... and forget the stale pointers */
+  team->t.t_threads = NULL;
+  team->t.t_disp_buffer = NULL;
+  team->t.t_dispatch = NULL;
+  team->t.t_implicit_task_taskdata = NULL;
+}
+
+/* Re-create the team arrays for max_nth threads. The dispatch and implicit
+   task arrays are discarded; the first t_nproc entries of the old t_threads
+   array are carried over into the new one. */
+static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
+  /* keep the old thread array alive until its contents have been copied */
+  kmp_info_t **prev_threads = team->t.t_threads;
+
+  __kmp_free(team->t.t_disp_buffer);
+  __kmp_free(team->t.t_dispatch);
+  __kmp_free(team->t.t_implicit_task_taskdata);
+
+  __kmp_allocate_team_arrays(team, max_nth);
+
+  KMP_MEMCPY(team->t.t_threads, prev_threads,
+             team->t.t_nproc * sizeof(kmp_info_t *));
+  __kmp_free(prev_threads);
+}
+
+// Builds a kmp_internal_control_t filled from the current global defaults
+// (dynamic adjustment, blocktime, default team size, thread limit, max active
+// levels, run-time schedule, first proc-bind entry, default device). The
+// initializer is positional, so the order of the values below must match the
+// field order of kmp_internal_control_t.
+static kmp_internal_control_t __kmp_get_global_icvs(void) {
+
+  kmp_r_sched_t r_sched =
+      __kmp_get_schedule_global(); // get current state of scheduling globals
+
+  // bind_types[0] is read below, so the proc-bind list must not be empty.
+  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
+
+  kmp_internal_control_t g_icvs = {
+    0, // int serial_nesting_level; //corresponds to value of th_team_serialized
+    (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
+    // adjustment of threads (per thread)
+    (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
+    // whether blocktime is explicitly set
+    __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
+#if KMP_USE_MONITOR
+    __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
+// intervals
+#endif
+    __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
+    // next parallel region (per thread)
+    // (use a max ub on value if __kmp_parallel_initialize not called yet)
+    __kmp_cg_max_nth, // int thread_limit;
+    __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
+    // for max_active_levels
+    r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
+    // {sched,chunk} pair
+    __kmp_nested_proc_bind.bind_types[0], // first entry of the proc-bind list
+    __kmp_default_device, // global default device
+    NULL // struct kmp_internal_control *next;
+  };
+
+  return g_icvs;
+}
+
+// Returns a copy of the ICVs held by the current task of the team's thread 0,
+// with serial_nesting_level preset to 0 before the copy and the next link
+// cleared afterwards.
+static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
+  kmp_info_t *team_master = team->t.t_threads[0];
+  kmp_internal_control_t snapshot;
+
+  // probably =team->t.t_serial like in save_inter_controls
+  snapshot.serial_nesting_level = 0;
+  copy_icvs(&snapshot, &team_master->th.th_current_task->td_icvs);
+  snapshot.next = NULL;
+
+  return snapshot;
+}
+
+// Initializes the state of a root: resets its flags and lock, then allocates
+// and sets up its root team (one thread, serialized) and its hot team (one
+// thread now, with room for __kmp_dflt_team_nth_ub * 2 threads). Both teams
+// are created from the current global ICVs; their thread slots are left NULL
+// here.
+static void __kmp_initialize_root(kmp_root_t *root) {
+  int f;
+  kmp_team_t *root_team;
+  kmp_team_t *hot_team;
+  int hot_team_max_nth;
+  kmp_r_sched_t r_sched =
+      __kmp_get_schedule_global(); // get current state of scheduling globals
+  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
+  KMP_DEBUG_ASSERT(root);
+  KMP_ASSERT(!root->r.r_begin);
+
+  /* setup the root state structure */
+  __kmp_init_lock(&root->r.r_begin_lock);
+  root->r.r_begin = FALSE;
+  root->r.r_active = FALSE;
+  root->r.r_in_parallel = 0;
+  root->r.r_blocktime = __kmp_dflt_blocktime;
+
+  /* setup the root team for this task */
+  /* allocate the root team structure */
+  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
+
+  root_team =
+      __kmp_allocate_team(root,
+                          1, // new_nproc
+                          1, // max_nproc
+#if OMPT_SUPPORT
+                          ompt_data_none, // root parallel id
+#endif
+                          __kmp_nested_proc_bind.bind_types[0], &r_icvs,
+                          0 // argc
+                          USE_NESTED_HOT_ARG(NULL) // master thread is unknown
+                          );
+#if USE_DEBUGGER
+  // Non-NULL value should be assigned to make the debugger display the root
+  // team.
+  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
+#endif
+
+  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
+
+  root->r.r_root_team = root_team;
+  root_team->t.t_control_stack_top = NULL;
+
+  /* initialize root team */
+  root_team->t.t_threads[0] = NULL; // no thread is attached to the team yet
+  root_team->t.t_nproc = 1;
+  root_team->t.t_serialized = 1;
+  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
+  root_team->t.t_sched.sched = r_sched.sched;
+  KA_TRACE(
+      20,
+      ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
+       root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
+
+  /* setup the  hot team for this task */
+  /* allocate the hot team structure */
+  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
+
+  hot_team =
+      __kmp_allocate_team(root,
+                          1, // new_nproc
+                          __kmp_dflt_team_nth_ub * 2, // max_nproc
+#if OMPT_SUPPORT
+                          ompt_data_none, // root parallel id
+#endif
+                          __kmp_nested_proc_bind.bind_types[0], &r_icvs,
+                          0 // argc
+                          USE_NESTED_HOT_ARG(NULL) // master thread is unknown
+                          );
+  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
+
+  root->r.r_hot_team = hot_team;
+  // NOTE(review): this repeats the root_team assignment made above; it may
+  // have been intended for hot_team -- confirm.
+  root_team->t.t_control_stack_top = NULL;
+
+  /* first-time initialization */
+  hot_team->t.t_parent = root_team;
+
+  /* initialize hot team */
+  hot_team_max_nth = hot_team->t.t_max_nproc;
+  // Clear every thread slot of the hot team, not only the first t_nproc ones.
+  for (f = 0; f < hot_team_max_nth; ++f) {
+    hot_team->t.t_threads[f] = NULL;
+  }
+  hot_team->t.t_nproc = 1;
+  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
+  hot_team->t.t_sched.sched = r_sched.sched;
+  hot_team->t.t_size_changed = 0;
+}
+
+#ifdef KMP_DEBUG
+
+// One node of the singly linked team list that __kmp_print_structure builds
+// through __kmp_print_structure_team_accum. The list is terminated by an item
+// whose entry and next are both NULL.
+typedef struct kmp_team_list_item {
+  kmp_team_p const *entry; // Team recorded in this node.
+  struct kmp_team_list_item *next; // Following node.
+} kmp_team_list_item_t;
+// A list is referred to by a pointer to its first item.
+typedef kmp_team_list_item_t *kmp_team_list_t;
+
+// Adds a team to the list of teams, after first adding (recursively) its
+// parent team and its next-in-pool team.
+//   list - list of teams; it must terminate with an item whose entry and next
+//          are both NULL
+//   team - team to add; NULL is ignored
+// A team is added only once. The list is kept sorted in ascending order by
+// team id; the team id is *not* a key.
+static void __kmp_print_structure_team_accum(kmp_team_list_t list,
+                                             kmp_team_p const *team) {
+  kmp_team_list_t node;
+
+  KMP_DEBUG_ASSERT(list != NULL);
+  if (team == NULL)
+    return;
+
+  __kmp_print_structure_team_accum(list, team->t.t_parent);
+  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
+
+  // Look for the team among the real (non-terminating) items.
+  for (node = list; node->next != NULL; node = node->next) {
+    if (node->entry == team)
+      return; // Team has been added before, exit.
+  }
+
+  // Not present yet: find the first item with a larger team id, or the
+  // terminating item if there is none.
+  for (node = list; node->next != NULL; node = node->next) {
+    if (node->entry->t.t_id > team->t.t_id)
+      break;
+  }
+
+  // Insert in front of that item: its old contents move into a fresh item
+  // and the team takes over the existing one.
+  kmp_team_list_item_t *moved = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
+      sizeof(kmp_team_list_item_t));
+  *moved = *node;
+  node->entry = team;
+  node->next = moved;
+}
+
+// Prints `title` followed by the team's id (hex) and address, or by a
+// "(nil)" marker when there is no team.
+static void __kmp_print_structure_team(char const *title,
+                                       kmp_team_p const *team) {
+  __kmp_printf("%s", title);
+  if (team == NULL) {
+    __kmp_printf(" - (nil)\n");
+    return;
+  }
+  __kmp_printf("%2x %p\n", team->t.t_id, team);
+}
+
+// Prints `title` followed by the thread's gtid and address, or by a "(nil)"
+// marker when there is no thread.
+static void __kmp_print_structure_thread(char const *title,
+                                         kmp_info_p const *thread) {
+  __kmp_printf("%s", title);
+  if (thread == NULL) {
+    __kmp_printf(" - (nil)\n");
+    return;
+  }
+  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
+}
+
+// Debug helper: prints the global thread table, every entry of __kmp_threads
+// and __kmp_root, every team reachable from them (collected into a temporary
+// list sorted by team id), and the heads of the thread and team pools. All
+// output goes through __kmp_printf. The temporary list is released before
+// returning.
+void __kmp_print_structure(void) {
+
+  kmp_team_list_t list;
+  kmp_team_list_t cursor;
+
+  // Initialize list of teams with its terminating item.
+  list =
+      (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
+  list->entry = NULL;
+  list->next = NULL;
+
+  __kmp_printf("\n------------------------------\nGlobal Thread "
+               "Table\n------------------------------\n");
+  {
+    int gtid;
+    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
+      __kmp_printf("%2d", gtid);
+      if (__kmp_threads != NULL) {
+        __kmp_printf(" %p", __kmp_threads[gtid]);
+      }
+      if (__kmp_root != NULL) {
+        __kmp_printf(" %p", __kmp_root[gtid]);
+      }
+      __kmp_printf("\n");
+    }
+  }
+
+  // Print out __kmp_threads array.
+  __kmp_printf("\n------------------------------\nThreads\n--------------------"
+               "----------\n");
+  if (__kmp_threads != NULL) {
+    int gtid;
+    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
+      kmp_info_t const *thread = __kmp_threads[gtid];
+      if (thread != NULL) {
+        __kmp_printf("GTID %2d %p:\n", gtid, thread);
+        __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
+        __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
+        __kmp_print_structure_team("    Serial Team:  ",
+                                   thread->th.th_serial_team);
+        __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
+        __kmp_print_structure_thread("    Master:       ",
+                                     thread->th.th_team_master);
+        __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
+        __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
+        __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
+        __kmp_print_structure_thread("    Next in pool: ",
+                                     thread->th.th_next_pool);
+        __kmp_printf("\n");
+        // Remember the teams this thread refers to for the "Teams" section.
+        __kmp_print_structure_team_accum(list, thread->th.th_team);
+        __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
+      }
+    }
+  } else {
+    __kmp_printf("Threads array is not allocated.\n");
+  }
+
+  // Print out __kmp_root array.
+  __kmp_printf("\n------------------------------\nUbers\n----------------------"
+               "--------\n");
+  if (__kmp_root != NULL) {
+    int gtid;
+    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
+      kmp_root_t const *root = __kmp_root[gtid];
+      if (root != NULL) {
+        __kmp_printf("GTID %2d %p:\n", gtid, root);
+        __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
+        __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
+        __kmp_print_structure_thread("    Uber Thread:  ",
+                                     root->r.r_uber_thread);
+        __kmp_printf("    Active?:      %2d\n", root->r.r_active);
+        __kmp_printf("    In Parallel:  %2d\n",
+                     KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
+        __kmp_printf("\n");
+        // Remember the teams this root refers to for the "Teams" section.
+        __kmp_print_structure_team_accum(list, root->r.r_root_team);
+        __kmp_print_structure_team_accum(list, root->r.r_hot_team);
+      }
+    }
+  } else {
+    __kmp_printf("Ubers array is not allocated.\n");
+  }
+
+  __kmp_printf("\n------------------------------\nTeams\n----------------------"
+               "--------\n");
+  // Walk the list with a separate cursor: `list` has to keep pointing at the
+  // head, otherwise the items visited here could no longer be freed below.
+  for (cursor = list; cursor->next != NULL; cursor = cursor->next) {
+    kmp_team_p const *team = cursor->entry;
+    int i;
+    __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
+    __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
+    __kmp_printf("    Master TID:       %2d\n", team->t.t_master_tid);
+    __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
+    __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
+    __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
+    for (i = 0; i < team->t.t_nproc; ++i) {
+      __kmp_printf("    Thread %2d:      ", i);
+      __kmp_print_structure_thread("", team->t.t_threads[i]);
+    }
+    __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
+    __kmp_printf("\n");
+  }
+
+  // Print out __kmp_thread_pool and __kmp_team_pool.
+  __kmp_printf("\n------------------------------\nPools\n----------------------"
+               "--------\n");
+  __kmp_print_structure_thread("Thread pool:          ",
+                               CCAST(kmp_info_t *, __kmp_thread_pool));
+  __kmp_print_structure_team("Team pool:            ",
+                             CCAST(kmp_team_t *, __kmp_team_pool));
+  __kmp_printf("\n");
+
+  // Free team list, from the head through the terminating item.
+  while (list != NULL) {
+    kmp_team_list_item_t *item = list;
+    list = list->next;
+    KMP_INTERNAL_FREE(item);
+  }
+}
+
+#endif
+
+//---------------------------------------------------------------------------
+//  Stuff for per-thread fast random number generator
+//  Table of primes. __kmp_init_random picks one entry per thread (indexed by
+//  the thread's ds_tid modulo the table size) and stores it as the
+//  multiplier th_a used by __kmp_get_random.
+static const unsigned __kmp_primes[] = {
+    0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
+    0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
+    0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
+    0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
+    0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
+    0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
+    0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
+    0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
+    0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
+    0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
+    0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
+
+//---------------------------------------------------------------------------
+//  __kmp_get_random: Linear congruential generator. Returns the upper 16 bits
+//  of the thread's current state th_x and advances the state to
+//  th_x * th_a + 1.
+unsigned short __kmp_get_random(kmp_info_t *thread) {
+  const unsigned state = thread->th.th_x;
+  const unsigned short result = state >> 16;
+
+  // Step the generator for the next call.
+  thread->th.th_x = state * thread->th.th_a + 1;
+
+  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
+                thread->th.th_info.ds.ds_tid, result));
+
+  return result;
+}
+//--------------------------------------------------------
+// __kmp_init_random: Seed the thread's random number generator from its
+// ds_tid: the multiplier th_a comes from the __kmp_primes table and the
+// initial state th_x is derived from both.
+void __kmp_init_random(kmp_info_t *thread) {
+  const unsigned tid_seed = thread->th.th_info.ds.ds_tid;
+  const unsigned prime_count = sizeof(__kmp_primes) / sizeof(__kmp_primes[0]);
+
+  thread->th.th_a = __kmp_primes[tid_seed % prime_count];
+  thread->th.th_x = (tid_seed + 1) * thread->th.th_a + 1;
+  KA_TRACE(30, ("__kmp_init_random: THREAD: %u; A: %u\n", tid_seed,
+                thread->th.th_a));
+}
+
+#if KMP_OS_WINDOWS
+/* Reclaim the array entries of root threads that are already dead. Returns
+ * the number of entries reclaimed. */
+static int __kmp_reclaim_dead_roots(void) {
+  int reclaimed = 0;
+
+  for (int gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
+    /* only root (uber) threads are candidates */
+    if (!KMP_UBER_GTID(gtid))
+      continue;
+    /* the thread must not be running any more */
+    if (__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid])))
+      continue;
+    /* AC: reclaim only roots died in non-active state */
+    if (__kmp_root[gtid]->r.r_active)
+      continue;
+    reclaimed += __kmp_unregister_root_other_thread(gtid);
+  }
+  return reclaimed;
+}
+#endif
+
+/* This function attempts to create free entries in __kmp_threads and
+   __kmp_root, and returns the number of free entries generated.
+
+   For Windows* OS static library, the first mechanism used is to reclaim array
+   entries for root threads that are already dead.
+
+   On all platforms, expansion is attempted on the arrays __kmp_threads and
+   __kmp_root, with appropriate update to __kmp_threads_capacity. Array
+   capacity is increased by doubling with clipping to __kmp_sys_max_nth. If the
+   new capacity exceeds __kmp_tp_capacity, either the threadprivate cache is
+   resized (when it has been created) or __kmp_tp_capacity is raised to match;
+   this is synchronized with __kmpc_threadprivate_cached using
+   __kmp_tp_cached_lock.
+
+   After any dead root reclamation, if the clipping value allows array expansion
+   to result in the generation of a total of nNeed free slots, the function does
+   that expansion. If not, nothing is done beyond the possible initial root
+   thread reclamation.
+
+   If any argument is negative, the behavior is undefined. */
+static int __kmp_expand_threads(int nNeed) {
+  int added = 0;
+  int minimumRequiredCapacity;
+  int newCapacity;
+  kmp_info_t **newThreads;
+  kmp_root_t **newRoot;
+
+// All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
+// resizing __kmp_threads does not need additional protection if foreign
+// threads are present
+
+#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
+  /* only for Windows static library */
+  /* reclaim array entries for root threads that are already dead */
+  added = __kmp_reclaim_dead_roots();
+
+  /* reclaimed slots count towards the request */
+  if (nNeed) {
+    nNeed -= added;
+    if (nNeed < 0)
+      nNeed = 0;
+  }
+#endif
+  /* nothing (more) to provide */
+  if (nNeed <= 0)
+    return added;
+
+  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
+  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
+  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
+  // > __kmp_max_nth in one of two ways:
+  //
+  // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
+  //    may not be reused by another thread, so we may need to increase
+  //    __kmp_threads_capacity to __kmp_max_nth + 1.
+  //
+  // 2) New foreign root(s) are encountered.  We always register new foreign
+  //    roots. This may cause a smaller # of threads to be allocated at
+  //    subsequent parallel regions, but the worker threads hang around (and
+  //    eventually go to sleep) and need slots in the __kmp_threads[] array.
+  //
+  // Anyway, that is the reason for moving the check to see if
+  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
+  // instead of having it performed here. -BB
+
+  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
+
+  /* compute expansion headroom to check if we can expand */
+  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
+    /* possible expansion too small -- give up */
+    return added;
+  }
+  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
+
+  /* double the capacity (clipped to __kmp_sys_max_nth) until it is enough */
+  newCapacity = __kmp_threads_capacity;
+  do {
+    newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
+                                                          : __kmp_sys_max_nth;
+  } while (newCapacity < minimumRequiredCapacity);
+  /* both arrays live in one allocation: the thread pointers first, followed
+     directly by the root pointers */
+  newThreads = (kmp_info_t **)__kmp_allocate(
+      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
+  newRoot =
+      (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
+  /* carry over the existing entries */
+  KMP_MEMCPY(newThreads, __kmp_threads,
+             __kmp_threads_capacity * sizeof(kmp_info_t *));
+  KMP_MEMCPY(newRoot, __kmp_root,
+             __kmp_threads_capacity * sizeof(kmp_root_t *));
+
+  /* publish the new arrays first, then free the old block, and only then
+     raise the capacity */
+  kmp_info_t **temp_threads = __kmp_threads;
+  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
+  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
+  __kmp_free(temp_threads);
+  added += newCapacity - __kmp_threads_capacity;
+  *(volatile int *)&__kmp_threads_capacity = newCapacity;
+
+  /* the unlocked test is repeated under the lock before acting on it */
+  if (newCapacity > __kmp_tp_capacity) {
+    __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
+    if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
+      __kmp_threadprivate_resize_cache(newCapacity);
+    } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
+      *(volatile int *)&__kmp_tp_capacity = newCapacity;
+    }
+    __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
+  }
+
+  return added;
+}
+
+/* Register the current thread as a root thread and obtain our gtid. We must
+   have the __kmp_initz_lock held at this point. The argument is TRUE only if
+   we are the thread that calls from __kmp_do_serial_initialize(). */
+int __kmp_register_root(int initial_thread) {
+  kmp_info_t *root_thread;
+  kmp_root_t *root;
+  int gtid;
+  int capacity;
+  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
+  KA_TRACE(20, ("__kmp_register_root: entered\n"));
+  KMP_MB();
+
+  /* 2007-03-02:
+     If initial thread did not invoke OpenMP RTL yet, and this thread is not an
+     initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
+     work as expected -- it may return false (that means there is at least one
+     empty slot in __kmp_threads array), but it is possible the only free slot
+     is #0, which is reserved for initial thread and so cannot be used for this
+     one. Following code workarounds this bug.
+
+     However, right solution seems to be not reserving slot #0 for initial
+     thread because:
+     (1) there is no magic in slot #0,
+     (2) we cannot detect initial thread reliably (the first thread which does
+        serial initialization may be not a real initial thread).
+  */
+  capacity = __kmp_threads_capacity;
+  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
+    --capacity;
+  }
+
+  /* see if there are too many threads */
+  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
+    if (__kmp_tp_cached) {
+      __kmp_fatal(KMP_MSG(CantRegisterNewThread),
+                  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
+                  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
+    } else {
+      __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
+                  __kmp_msg_null);
+    }
+  }
+
+  /* find an available thread slot */
+  /* Don't reassign the zero slot since we need that to only be used by initial
+     thread */
+  for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
+       gtid++)
+    ;
+  KA_TRACE(1,
+           ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
+  KMP_ASSERT(gtid < __kmp_threads_capacity);
+
+  /* update global accounting */
+  __kmp_all_nth++;
+  TCW_4(__kmp_nth, __kmp_nth + 1);
+
+  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
+  // numbers of procs, and method #2 (keyed API call) for higher numbers.
+  if (__kmp_adjust_gtid_mode) {
+    if (__kmp_all_nth >= __kmp_tls_gtid_min) {
+      if (TCR_4(__kmp_gtid_mode) != 2) {
+        TCW_4(__kmp_gtid_mode, 2);
+      }
+    } else {
+      if (TCR_4(__kmp_gtid_mode) != 1) {
+        TCW_4(__kmp_gtid_mode, 1);
+      }
+    }
+  }
+
+#ifdef KMP_ADJUST_BLOCKTIME
+  /* Adjust blocktime to zero if necessary            */
+  /* Middle initialization might not have occurred yet */
+  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
+    if (__kmp_nth > __kmp_avail_proc) {
+      __kmp_zero_bt = TRUE;
+    }
+  }
+#endif /* KMP_ADJUST_BLOCKTIME */
+
+  /* setup this new hierarchy */
+  if (!(root = __kmp_root[gtid])) {
+    root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
+    KMP_DEBUG_ASSERT(!root->r.r_root_team);
+  }
+
+#if KMP_STATS_ENABLED
+  // Initialize stats as soon as possible (right after gtid assignment).
+  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
+  __kmp_stats_thread_ptr->startLife();
+  KMP_SET_THREAD_STATE(SERIAL_REGION);
+  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
+#endif
+  __kmp_initialize_root(root);
+
+  /* setup new root thread structure */
+  if (root->r.r_uber_thread) {
+    root_thread = root->r.r_uber_thread;
+  } else {
+    root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
+    if (__kmp_storage_map) {
+      __kmp_print_thread_storage_map(root_thread, gtid);
+    }
+    root_thread->th.th_info.ds.ds_gtid = gtid;
+#if OMPT_SUPPORT
+    root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
+#endif
+    root_thread->th.th_root = root;
+    if (__kmp_env_consistency_check) {
+      root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
+    }
+#if USE_FAST_MEMORY
+    __kmp_initialize_fast_memory(root_thread);
+#endif /* USE_FAST_MEMORY */
+
+#if KMP_USE_BGET
+    KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
+    __kmp_initialize_bget(root_thread);
+#endif
+    __kmp_init_random(root_thread); // Initialize random number generator
+  }
+
+  /* setup the serial team held in reserve by the root thread */
+  if (!root_thread->th.th_serial_team) {
+    kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
+    KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
+    root_thread->th.th_serial_team = __kmp_allocate_team(
+        root, 1, 1,
+#if OMPT_SUPPORT
+        ompt_data_none, // root parallel id
+#endif
+        proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
+  }
+  KMP_ASSERT(root_thread->th.th_serial_team);
+  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
+                root_thread->th.th_serial_team));
+
+  /* drop root_thread into place */
+  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
+
+  root->r.r_root_team->t.t_threads[0] = root_thread;
+  root->r.r_hot_team->t.t_threads[0] = root_thread;
+  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
+  // AC: the team created in reserve, not for execution (it is unused for now).
+  root_thread->th.th_serial_team->t.t_serialized = 0;
+  root->r.r_uber_thread = root_thread;
+
+  /* initialize the thread, get it ready to go */
+  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
+  TCW_4(__kmp_init_gtid, TRUE);
+
+  /* prepare the master thread for get_gtid() */
+  __kmp_gtid_set_specific(gtid);
+
+#if USE_ITT_BUILD
+  __kmp_itt_thread_name(gtid);
+#endif /* USE_ITT_BUILD */
+
+#ifdef KMP_TDATA_GTID
+  __kmp_gtid = gtid;
+#endif
+  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
+  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
+
+  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
+                "plain=%u\n",
+                gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
+                root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
+                KMP_INIT_BARRIER_STATE));
+  { // Initialize barrier data.
+    int b;
+    for (b = 0; b < bs_last_barrier; ++b) {
+      root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
+#if USE_DEBUGGER
+      root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
+#endif
+    }
+  }
+  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
+                   KMP_INIT_BARRIER_STATE);
+
+#if KMP_AFFINITY_SUPPORTED
+  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
+  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
+  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
+  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
+  if (TCR_4(__kmp_init_middle)) {
+    __kmp_affinity_set_init_mask(gtid, TRUE);
+  }
+#endif /* KMP_AFFINITY_SUPPORTED */
+  root_thread->th.th_def_allocator = __kmp_def_allocator;
+  root_thread->th.th_prev_level = 0;
+  root_thread->th.th_prev_num_threads = 1;
+
+  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
+  tmp->cg_root = root_thread;
+  tmp->cg_thread_limit = __kmp_cg_max_nth;
+  tmp->cg_nthreads = 1;
+  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
+                 " cg_nthreads init to 1\n",
+                 root_thread, tmp));
+  tmp->up = NULL;
+  root_thread->th.th_cg_roots = tmp;
+
+  __kmp_root_counter++;
+
+#if OMPT_SUPPORT
+  if (!initial_thread && ompt_enabled.enabled) {
+
+    kmp_info_t *root_thread = ompt_get_thread();
+
+    ompt_set_thread_state(root_thread, ompt_state_overhead);
+
+    if (ompt_enabled.ompt_callback_thread_begin) {
+      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
+          ompt_thread_initial, __ompt_get_thread_data_internal());
+    }
+    ompt_data_t *task_data;
+    ompt_data_t *parallel_data;
+    __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
+    if (ompt_enabled.ompt_callback_implicit_task) {
+      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+          ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
+    }
+
+    ompt_set_thread_state(root_thread, ompt_state_work_serial);
+  }
+#endif
+
+  KMP_MB();
+  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
+
+  return gtid;
+}
+
+#if KMP_NESTED_HOT_TEAMS
+// Recursively frees the nested hot teams recorded in thr's th_hot_teams array
+// at nesting level `level` and deeper. `level` must be below `max_level`.
+// Returns (size of this level's hot team - 1), because the master is not
+// freed, plus the counts returned for the deeper levels. Returns 0 when `thr`
+// has no hot team recorded at `level`.
+static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
+                                const int max_level) {
+  int i, n, nth;
+  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
+  if (!hot_teams) {
+    return 0;
+  }
+  // Check the level before it is used to index hot_teams.
+  KMP_DEBUG_ASSERT(level < max_level);
+  if (!hot_teams[level].hot_team) {
+    return 0;
+  }
+  kmp_team_t *team = hot_teams[level].hot_team;
+  nth = hot_teams[level].hot_team_nth;
+  n = nth - 1; // master is not freed
+  if (level < max_level - 1) {
+    // Deeper levels exist: release each member's nested hot teams first.
+    for (i = 0; i < nth; ++i) {
+      kmp_info_t *th = team->t.t_threads[i];
+      n += __kmp_free_hot_teams(root, th, level + 1, max_level);
+      // Thread 0's th_hot_teams array is not released here (presumably it is
+      // the array this call is still reading; the visible caller frees it
+      // after this function returns).
+      if (i > 0 && th->th.th_hot_teams) {
+        __kmp_free(th->th.th_hot_teams);
+        th->th.th_hot_teams = NULL;
+      }
+    }
+  }
+  __kmp_free_team(root, team, NULL);
+  return n;
+}
+#endif
+
+// Resets a root: frees its root team and its hot team(s), reports the OMPT
+// implicit-task-end and thread-end events, drops the uber thread from its
+// contention group, reaps the uber thread, and marks the root as unused.
+// Expects the root to be inactive (r_active is asserted to be false).
+// Note: the gtid parameter is not referenced in the body.
+// Returns the number of __kmp_threads entries directly and indirectly freed:
+// the hot team's t_nproc plus whatever __kmp_free_hot_teams() reports for
+// nested hot teams.
+static int __kmp_reset_root(int gtid, kmp_root_t *root) {
+  kmp_team_t *root_team = root->r.r_root_team;
+  kmp_team_t *hot_team = root->r.r_hot_team;
+  int n = hot_team->t.t_nproc;
+  int i;
+
+  KMP_DEBUG_ASSERT(!root->r.r_active);
+
+  root->r.r_root_team = NULL;
+  root->r.r_hot_team = NULL;
+  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
+  // before call to __kmp_free_team().
+  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
+#if KMP_NESTED_HOT_TEAMS
+  if (__kmp_hot_teams_max_level >
+      0) { // need to free nested hot teams and their threads if any
+    for (i = 0; i < hot_team->t.t_nproc; ++i) {
+      kmp_info_t *th = hot_team->t.t_threads[i];
+      // Levels 1 and deeper exist only when more than one level is allowed.
+      if (__kmp_hot_teams_max_level > 1) {
+        n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
+      }
+      // Release the per-thread hot teams array itself.
+      if (th->th.th_hot_teams) {
+        __kmp_free(th->th.th_hot_teams);
+        th->th.th_hot_teams = NULL;
+      }
+    }
+  }
+#endif
+  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
+
+  // Before we can reap the thread, we need to make certain that all other
+  // threads in the teams that had this root as ancestor have stopped trying to
+  // steal tasks.
+  if (__kmp_tasking_mode != tskm_immediate_exec) {
+    __kmp_wait_to_unref_task_teams();
+  }
+
+#if KMP_OS_WINDOWS
+  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
+  KA_TRACE(
+      10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
+           "\n",
+           (LPVOID) & (root->r.r_uber_thread->th),
+           root->r.r_uber_thread->th.th_info.ds.ds_thread));
+  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
+#endif /* KMP_OS_WINDOWS */
+
+#if OMPT_SUPPORT
+  // Report the end of the initial implicit task and of the thread to the
+  // tool, for each callback that is registered.
+  ompt_data_t *task_data;
+  ompt_data_t *parallel_data;
+  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
+  if (ompt_enabled.ompt_callback_implicit_task) {
+    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+        ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
+  }
+  if (ompt_enabled.ompt_callback_thread_end) {
+    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
+        &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
+  }
+#endif
+
+  TCW_4(__kmp_nth,
+        __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
+  // Post-decrement: i holds the contention group's thread count *before* the
+  // uber thread left it, so i == 1 means the uber thread was the last member.
+  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
+  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
+                 " to %d\n",
+                 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
+                 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
+  if (i == 1) {
+    // need to free contention group structure
+    KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
+                     root->r.r_uber_thread->th.th_cg_roots->cg_root);
+    KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
+    __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
+    root->r.r_uber_thread->th.th_cg_roots = NULL;
+  }
+  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
+  // it instead of freeing it.
+  __kmp_reap_thread(root->r.r_uber_thread, 1);
+
+  // The uber thread has been reaped; drop the root's reference to it.
+  root->r.r_uber_thread = NULL;
+  /* mark root as no longer in use */
+  root->r.r_begin = FALSE;
+
+  return n;
+}
+
+// Unregisters the calling root thread (global thread id `gtid`): waits for
+// outstanding proxy tasks if the thread's task team has found any, resets the
+// root through __kmp_reset_root(), and then clears the thread's recorded gtid.
+// Everything between the initial check and the return runs while holding
+// __kmp_forkjoin_lock. Does nothing if the library is already done or was
+// never serially initialized.
+void __kmp_unregister_root_current_thread(int gtid) {
+  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
+  // Taking the fork/join lock is acceptable here: this routine is reached
+  // only on a normal close, never during an abort. Remember that a holder of
+  // the fork/join lock must never try to acquire the initz lock.
+  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
+  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
+    // Nothing left to unregister.
+    KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
+                  "exiting T#%d\n",
+                  gtid));
+    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
+    return;
+  }
+
+  kmp_root_t *const root = __kmp_root[gtid];
+  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
+  KMP_ASSERT(KMP_UBER_GTID(gtid));
+  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
+  KMP_ASSERT(root->r.r_active == FALSE);
+
+  KMP_MB();
+
+  kmp_info_t *uber_thr = __kmp_threads[gtid];
+  kmp_team_t *cur_team = uber_thr->th.th_team;
+  kmp_task_team_t *cur_task_team = uber_thr->th.th_task_team;
+
+  // Proxy tasks must complete before the thread is torn down.
+  if (cur_task_team != NULL && cur_task_team->tt.tt_found_proxy_tasks) {
+#if OMPT_SUPPORT
+    // Shutdown is in progress, so no further events are reported.
+    uber_thr->th.ompt_thread_info.state = ompt_state_undefined;
+#endif
+    __kmp_task_team_wait(uber_thr, cur_team USE_ITT_BUILD_ARG(NULL));
+  }
+
+  __kmp_reset_root(gtid, root);
+
+  // Release this thread's slot: it no longer has a global thread id.
+  __kmp_gtid_set_specific(KMP_GTID_DNE);
+#ifdef KMP_TDATA_GTID
+  __kmp_gtid = KMP_GTID_DNE;
+#endif
+
+  KMP_MB();
+  KC_TRACE(10,
+           ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
+
+  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
+}
+
+#if KMP_OS_WINDOWS
+/* Unregisters the root with global thread id `gtid` on behalf of a thread
+   other than that root. The caller must already hold __kmp_forkjoin_lock.
+   Returns the number of __kmp_threads entries freed, as reported by
+   __kmp_reset_root(). */
+static int __kmp_unregister_root_other_thread(int gtid) {
+  kmp_root_t *const root = __kmp_root[gtid];
+
+  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
+  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
+  KMP_ASSERT(KMP_UBER_GTID(gtid));
+  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
+  KMP_ASSERT(root->r.r_active == FALSE);
+
+  const int freed_entries = __kmp_reset_root(gtid, root);
+  KC_TRACE(10,
+           ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
+  return freed_entries;
+}
+#endif
+
+#if KMP_DEBUG
+// Debug-only helper: prints the calling thread's global and team-local ids,
+// its thread descriptor, its current and serial teams, its current task, and
+// the parent recorded in the team's implicit task data for this thread.
+void __kmp_task_info() {
+  const kmp_int32 gtid = __kmp_entry_gtid();
+  const kmp_int32 tid = __kmp_tid_from_gtid(gtid);
+  kmp_info_t *self = __kmp_threads[gtid];
+  kmp_team_t *serial_team = self->th.th_serial_team;
+  kmp_team_t *cur_team = self->th.th_team;
+
+  __kmp_printf(
+      "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
+      "ptask=%p\n",
+      gtid, tid, self, cur_team, serial_team, self->th.th_current_task,
+      cur_team->t.t_implicit_task_taskdata[tid].td_parent);
+}
+#endif // KMP_DEBUG
+
+/* Attaches this_thr to `team` as team-local thread `tid` (global id `gtid`)
+   and (re)initializes its per-thread state: stores the team in th_team,
+   caches team properties, initializes the implicit task, points th_dispatch
+   at team->t.t_dispatch[tid], moves a non-master thread into the master's
+   contention group if it is not already there, (re)initializes the dispatch
+   buffers, and allocates th_pri_common and the task-state memo stack the
+   first time they are found unset.
+   TODO optimize with one big memclr, take out what isn't needed, split
+   responsibility to workers as much as possible, and delay initialization of
+   features as much as possible  */
+static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
+                                  int tid, int gtid) {
+  /* this_thr->th.th_info.ds.ds_gtid is setup in
+     kmp_allocate_thread/create_worker.
+     this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
+  KMP_DEBUG_ASSERT(this_thr != NULL);
+  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
+  KMP_DEBUG_ASSERT(team);
+  KMP_DEBUG_ASSERT(team->t.t_threads);
+  KMP_DEBUG_ASSERT(team->t.t_dispatch);
+  // Read the master only after team and team->t.t_threads have been checked.
+  kmp_info_t *master = team->t.t_threads[0];
+  KMP_DEBUG_ASSERT(master);
+  KMP_DEBUG_ASSERT(master->th.th_root);
+
+  KMP_MB();
+
+  TCW_SYNC_PTR(this_thr->th.th_team, team);
+
+  this_thr->th.th_info.ds.ds_tid = tid;
+  this_thr->th.th_set_nproc = 0;
+  if (__kmp_tasking_mode != tskm_immediate_exec)
+    // When tasking is possible, threads are not safe to reap until they are
+    // done tasking; this will be set when tasking code is exited in wait
+    this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
+  else // no tasking --> always safe to reap
+    this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
+  this_thr->th.th_set_proc_bind = proc_bind_default;
+#if KMP_AFFINITY_SUPPORTED
+  this_thr->th.th_new_place = this_thr->th.th_current_place;
+#endif
+  this_thr->th.th_root = master->th.th_root;
+
+  /* setup the thread's cache of the team structure */
+  this_thr->th.th_team_nproc = team->t.t_nproc;
+  this_thr->th.th_team_master = master;
+  this_thr->th.th_team_serialized = team->t.t_serialized;
+  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
+
+  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
+
+  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
+                tid, gtid, this_thr, this_thr->th.th_current_task));
+
+  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
+                           team, tid, TRUE);
+
+  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
+                tid, gtid, this_thr, this_thr->th.th_current_task));
+  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
+  // __kmp_initialize_team()?
+
+  /* TODO no worksharing in speculative threads */
+  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
+
+  this_thr->th.th_local.this_construct = 0;
+
+  // Allocate the th_pri_common table the first time this thread is set up.
+  if (!this_thr->th.th_pri_common) {
+    this_thr->th.th_pri_common =
+        (struct common_table *)__kmp_allocate(sizeof(struct common_table));
+    if (__kmp_storage_map) {
+      __kmp_print_storage_map_gtid(
+          gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
+          sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
+    }
+    this_thr->th.th_pri_head = NULL;
+  }
+
+  if (this_thr != master && // Master's CG root is initialized elsewhere
+      this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
+    // Make new thread's CG root same as master's
+    KMP_DEBUG_ASSERT(master->th.th_cg_roots);
+    kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
+    if (tmp) {
+      // worker changes CG, need to check if old CG should be freed
+      // (post-decrement: i is the member count before this thread left)
+      int i = tmp->cg_nthreads--;
+      KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
+                     " on node %p of thread %p to %d\n",
+                     this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
+      if (i == 1) {
+        __kmp_free(tmp); // last thread left CG --> free it
+      }
+    }
+    this_thr->th.th_cg_roots = master->th.th_cg_roots;
+    // Increment new thread's CG root's counter to add the new thread
+    this_thr->th.th_cg_roots->cg_nthreads++;
+    KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
+                   " node %p of thread %p to %d\n",
+                   this_thr, this_thr->th.th_cg_roots,
+                   this_thr->th.th_cg_roots->cg_root,
+                   this_thr->th.th_cg_roots->cg_nthreads));
+    // The thread inherits the thread limit of the group it just joined.
+    this_thr->th.th_current_task->td_icvs.thread_limit =
+        this_thr->th.th_cg_roots->cg_thread_limit;
+  }
+
+  /* Initialize dynamic dispatch */
+  {
+    volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
+    // Use team max_nproc since this will never change for the team.
+    size_t disp_size =
+        sizeof(dispatch_private_info_t) *
+        (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
+    KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
+                  team->t.t_max_nproc));
+    KMP_ASSERT(dispatch);
+    KMP_DEBUG_ASSERT(team->t.t_dispatch);
+    KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
+
+    dispatch->th_disp_index = 0;
+    dispatch->th_doacross_buf_idx = 0;
+    if (!dispatch->th_disp_buffer) {
+      dispatch->th_disp_buffer =
+          (dispatch_private_info_t *)__kmp_allocate(disp_size);
+
+      if (__kmp_storage_map) {
+        // The dispatch slot is team->t.t_dispatch[tid] (asserted above), so
+        // the label's t_dispatch index is the team-local tid, not the gtid.
+        __kmp_print_storage_map_gtid(
+            gtid, &dispatch->th_disp_buffer[0],
+            &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
+                                          ? 1
+                                          : __kmp_dispatch_num_buffers],
+            disp_size, "th_%d.th_dispatch.th_disp_buffer "
+                       "(team_%d.t_dispatch[%d].th_disp_buffer)",
+            gtid, team->t.t_id, tid);
+      }
+    } else {
+      // Buffer already exists from an earlier use: just clear it.
+      memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
+    }
+
+    dispatch->th_dispatch_pr_current = 0;
+    dispatch->th_dispatch_sh_current = 0;
+
+    dispatch->th_deo_fcn = 0; /* ORDERED     */
+    dispatch->th_dxo_fcn = 0; /* END ORDERED */
+  }
+
+  this_thr->th.th_next_pool = NULL;
+
+  // Allocate the task-state memo stack (4 zeroed entries) on first use.
+  if (!this_thr->th.th_task_state_memo_stack) {
+    size_t i;
+    this_thr->th.th_task_state_memo_stack =
+        (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
+    this_thr->th.th_task_state_top = 0;
+    this_thr->th.th_task_state_stack_sz = 4;
+    for (i = 0; i < this_thr->th.th_task_state_stack_sz;
+         ++i) // zero init the stack
+      this_thr->th.th_task_state_memo_stack[i] = 0;
+  }
+
+  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
+  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
+
+  KMP_MB();
+}
+
+/* allocate a new thread for the requesting team. this is only called from
+   within a forkjoin critical section. we will first try to get an available
+   thread from the thread pool. if none is available, we will fork a new one
+   assuming we are able to create a new one. this should be assured, as the
+   caller should check on this first.
+
+   root    - root passed to __kmp_allocate_team() when a reserve serial team
+             is created for a brand-new thread
+   team    - team the thread is being allocated for
+   new_tid - team-local id handed to __kmp_initialize_info() for the thread
+   Returns the thread descriptor, taken from the pool or newly created. */
+kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
+                                  int new_tid) {
+  kmp_team_t *serial_team;
+  kmp_info_t *new_thr;
+  int new_gtid;
+
+  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
+  KMP_DEBUG_ASSERT(root && team);
+#if !KMP_NESTED_HOT_TEAMS
+  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
+#endif
+  KMP_MB();
+
+  /* first, try to get one from the thread pool */
+  if (__kmp_thread_pool) {
+    // Pop the head of the pool list.
+    new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
+    __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
+    // The cached insertion point may not refer to a thread that left the pool.
+    if (new_thr == __kmp_thread_pool_insert_pt) {
+      __kmp_thread_pool_insert_pt = NULL;
+    }
+    TCW_4(new_thr->th.th_in_pool, FALSE);
+    // Update the active-in-pool accounting while holding the thread's
+    // suspend mutex.
+    __kmp_suspend_initialize_thread(new_thr);
+    __kmp_lock_suspend_mx(new_thr);
+    if (new_thr->th.th_active_in_pool == TRUE) {
+      KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
+      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
+      new_thr->th.th_active_in_pool = FALSE;
+    }
+    __kmp_unlock_suspend_mx(new_thr);
+
+    KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
+                  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
+    KMP_ASSERT(!new_thr->th.th_team);
+    KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
+
+    /* setup the thread structure */
+    __kmp_initialize_info(new_thr, team, new_tid,
+                          new_thr->th.th_info.ds.ds_gtid);
+    KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
+
+    // A pooled thread is already counted in __kmp_all_nth; only the count of
+    // threads in use grows here.
+    TCW_4(__kmp_nth, __kmp_nth + 1);
+
+    // Reset the task-state bookkeeping for the reused thread.
+    new_thr->th.th_task_state = 0;
+    new_thr->th.th_task_state_top = 0;
+    new_thr->th.th_task_state_stack_sz = 4;
+
+#ifdef KMP_ADJUST_BLOCKTIME
+    /* Adjust blocktime back to zero if necessary */
+    /* Middle initialization might not have occurred yet */
+    if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
+      if (__kmp_nth > __kmp_avail_proc) {
+        __kmp_zero_bt = TRUE;
+      }
+    }
+#endif /* KMP_ADJUST_BLOCKTIME */
+
+#if KMP_DEBUG
+    // If thread entered pool via __kmp_free_thread, wait_flag should !=
+    // KMP_BARRIER_PARENT_FLAG.
+    int b;
+    kmp_balign_t *balign = new_thr->th.th_bar;
+    for (b = 0; b < bs_last_barrier; ++b)
+      KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
+#endif
+
+    KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
+                  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
+
+    KMP_MB();
+    return new_thr;
+  }
+
+  /* no, we'll fork a new one */
+  // With an empty pool every existing thread is in use, and there must be
+  // room left in __kmp_threads for one more.
+  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
+  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
+
+#if KMP_USE_MONITOR
+  // If this is the first worker thread the RTL is creating, then also
+  // launch the monitor thread.  We try to do this as early as possible.
+  // The flag is re-checked after taking __kmp_monitor_lock so that only one
+  // caller creates the monitor.
+  if (!TCR_4(__kmp_init_monitor)) {
+    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
+    if (!TCR_4(__kmp_init_monitor)) {
+      KF_TRACE(10, ("before __kmp_create_monitor\n"));
+      TCW_4(__kmp_init_monitor, 1);
+      __kmp_create_monitor(&__kmp_monitor);
+      KF_TRACE(10, ("after __kmp_create_monitor\n"));
+#if KMP_OS_WINDOWS
+      // AC: wait until monitor has started. This is a fix for CQ232808.
+      // The reason is that if the library is loaded/unloaded in a loop with
+      // small (parallel) work in between, then there is high probability that
+      // monitor thread started after the library shutdown. At shutdown it is
+      // too late to cope with the problem, because when the master is in
+      // DllMain (process detach) the monitor has no chances to start (it is
+      // blocked), and master has no means to inform the monitor that the
+      // library has gone, because all the memory which the monitor can access
+      // is going to be released/reset.
+      while (TCR_4(__kmp_init_monitor) < 2) {
+        KMP_YIELD(TRUE);
+      }
+      KF_TRACE(10, ("after monitor thread has started\n"));
+#endif
+    }
+    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
+  }
+#endif
+
+  KMP_MB();
+  // Find the first empty slot in __kmp_threads, searching from gtid 1.
+  for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
+    KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
+  }
+
+  /* allocate space for it. */
+  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
+
+  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
+
+  if (__kmp_storage_map) {
+    __kmp_print_thread_storage_map(new_thr, new_gtid);
+  }
+
+  // add the reserve serialized team, initialized from the team's master thread
+  {
+    kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
+    KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
+    new_thr->th.th_serial_team = serial_team =
+        (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
+#if OMPT_SUPPORT
+                                          ompt_data_none, // root parallel id
+#endif
+                                          proc_bind_default, &r_icvs,
+                                          0 USE_NESTED_HOT_ARG(NULL));
+  }
+  KMP_ASSERT(serial_team);
+  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
+  // execution (it is unused for now).
+  serial_team->t.t_threads[0] = new_thr;
+  KF_TRACE(10,
+           ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
+            new_thr));
+
+  /* setup the thread structures */
+  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
+
+#if USE_FAST_MEMORY
+  __kmp_initialize_fast_memory(new_thr);
+#endif /* USE_FAST_MEMORY */
+
+#if KMP_USE_BGET
+  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
+  __kmp_initialize_bget(new_thr);
+#endif
+
+  __kmp_init_random(new_thr); // Initialize random number generator
+
+  /* Initialize these only once when thread is grabbed for a team allocation */
+  KA_TRACE(20,
+           ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
+            __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
+
+  // Put every barrier's per-thread state into its initial, not-waiting state.
+  int b;
+  kmp_balign_t *balign = new_thr->th.th_bar;
+  for (b = 0; b < bs_last_barrier; ++b) {
+    balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
+    balign[b].bb.team = NULL;
+    balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
+    balign[b].bb.use_oncore_barrier = 0;
+  }
+
+  new_thr->th.th_spin_here = FALSE;
+  new_thr->th.th_next_waiting = 0;
+#if KMP_OS_UNIX
+  new_thr->th.th_blocking = false;
+#endif
+
+#if KMP_AFFINITY_SUPPORTED
+  // No place has been assigned to the new thread yet.
+  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
+  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
+  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
+  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
+#endif
+  new_thr->th.th_def_allocator = __kmp_def_allocator;
+  new_thr->th.th_prev_level = 0;
+  new_thr->th.th_prev_num_threads = 1;
+
+  // The new thread starts out active and outside the pool.
+  TCW_4(new_thr->th.th_in_pool, FALSE);
+  new_thr->th.th_active_in_pool = FALSE;
+  TCW_4(new_thr->th.th_active, TRUE);
+
+  /* adjust the global counters */
+  __kmp_all_nth++;
+  __kmp_nth++;
+
+  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
+  // numbers of procs, and method #2 (keyed API call) for higher numbers.
+  if (__kmp_adjust_gtid_mode) {
+    if (__kmp_all_nth >= __kmp_tls_gtid_min) {
+      if (TCR_4(__kmp_gtid_mode) != 2) {
+        TCW_4(__kmp_gtid_mode, 2);
+      }
+    } else {
+      if (TCR_4(__kmp_gtid_mode) != 1) {
+        TCW_4(__kmp_gtid_mode, 1);
+      }
+    }
+  }
+
+#ifdef KMP_ADJUST_BLOCKTIME
+  /* Adjust blocktime back to zero if necessary       */
+  /* Middle initialization might not have occurred yet */
+  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
+    if (__kmp_nth > __kmp_avail_proc) {
+      __kmp_zero_bt = TRUE;
+    }
+  }
+#endif /* KMP_ADJUST_BLOCKTIME */
+
+  /* actually fork it and create the new worker thread */
+  KF_TRACE(
+      10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
+  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
+  KF_TRACE(10,
+           ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
+
+  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
+                new_gtid));
+  KMP_MB();
+  return new_thr;
+}
+
+/* Reinitialize team for reuse: refreshes the team's ident and id (written
+   only when they change, via KMP_CHECK_UPDATE), re-initializes the master's
+   implicit task, and copies new_icvs into the master's implicit task data.
+   `team` and `new_icvs` must be non-NULL.
+   The hot team code calls this case at every fork barrier, so EPCC barrier
+   test are extremely sensitive to changes in it, esp. writes to the team
+   struct, which cause a cache invalidation in all threads.
+   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
+static void __kmp_reinitialize_team(kmp_team_t *team,
+                                    kmp_internal_control_t *new_icvs,
+                                    ident_t *loc) {
+  // Validate the arguments before the trace below dereferences team.
+  KMP_DEBUG_ASSERT(team && new_icvs);
+  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
+                team->t.t_threads[0], team));
+  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
+  KMP_CHECK_UPDATE(team->t.t_ident, loc);
+
+  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
+  // Copy ICVs to the master thread's implicit taskdata
+  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
+  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
+
+  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
+                team->t.t_threads[0], team));
+}
+
+/* Puts a team structure into its initial state for new_nproc threads and
+   then hands off to __kmp_reinitialize_team() for the ident, id and ICVs.
+   Precondition: team->t.t_threads and team->t.t_max_nproc are already set.
+   The arguments themselves are left untouched. */
+static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
+                                  kmp_internal_control_t *new_icvs,
+                                  ident_t *loc) {
+  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
+
+  // Sanity checks on the incoming team.
+  KMP_DEBUG_ASSERT(team);
+  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
+  KMP_DEBUG_ASSERT(team->t.t_threads);
+  KMP_MB();
+
+  // Resetting t_master_tid is not strictly needed; t_master_bar is
+  // deliberately left as it is.
+  team->t.t_master_tid = 0;
+  // A team of at most one thread starts out serialized.
+  if (new_nproc > 1) {
+    team->t.t_serialized = 0;
+  } else {
+    team->t.t_serialized = 1;
+  }
+  team->t.t_nproc = new_nproc;
+
+  // t_parent is not reset and the t_threads array is not cleared here:
+  // either would mess up the hot team.
+  team->t.t_next_pool = NULL;
+
+  // Clearing the microtask and its invoker is not strictly needed.
+  TCW_SYNC_PTR(team->t.t_pkfn, NULL);
+  team->t.t_invoke = NULL;
+
+  // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
+  team->t.t_sched.sched = new_icvs->sched.sched;
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+  // Floating-point control state; clearing it is not strictly needed.
+  team->t.t_fp_control_saved = FALSE;
+  team->t.t_x87_fpu_control_word = 0;
+  team->t.t_mxcsr = 0;
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+  team->t.t_construct = 0;
+
+  team->t.t_ordered.dt.t_value = 0;
+  team->t.t_master_active = FALSE;
+
+#ifdef KMP_DEBUG
+  // Not required, but a clean pointer helps when debugging.
+  team->t.t_copypriv_data = NULL;
+#endif
+#if KMP_OS_WINDOWS
+  // Counter used by the barrier-free copyin implementation.
+  team->t.t_copyin_counter = 0;
+#endif
+
+  team->t.t_control_stack_top = NULL;
+
+  __kmp_reinitialize_team(team, new_icvs, loc);
+
+  KMP_MB();
+  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
+}
+
+#if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
+/* Switches the calling thread to the full affinity mask
+   (__kmp_affin_fullMask) without updating any runtime structures. When
+   old_mask is non-NULL, the thread's current system affinity is first saved
+   into it; a failure to read it is fatal. Does nothing when affinity is not
+   capable. */
+static void
+__kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
+  if (!KMP_AFFINITY_CAPABLE()) {
+    return;
+  }
+  if (old_mask != NULL) {
+    const int rc = __kmp_get_system_affinity(old_mask, TRUE);
+    // Capture errno right away, before anything else can overwrite it.
+    const int saved_errno = errno;
+    if (rc != 0) {
+      __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(saved_errno),
+                  __kmp_msg_null);
+    }
+  }
+  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
+}
+#endif
+
+#if KMP_AFFINITY_SUPPORTED
+
+// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
+// It calculates the worker + master thread's partition based upon the parent
+// thread's partition, and binds each worker to a thread in their partition.
+// The master thread's partition should already include its current binding.
+static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
+  // Copy the master thread's place partion to the team struct
+  kmp_info_t *master_th = team->t.t_threads[0];
+  KMP_DEBUG_ASSERT(master_th != NULL);
+  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
+  int first_place = master_th->th.th_first_place;
+  int last_place = master_th->th.th_last_place;
+  int masters_place = master_th->th.th_current_place;
+  team->t.t_first_place = first_place;
+  team->t.t_last_place = last_place;
+
+  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
+                "bound to place %d partition = [%d,%d]\n",
+                proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
+                team->t.t_id, masters_place, first_place, last_place));
+
+  switch (proc_bind) {
+
+  case proc_bind_default:
+    // serial teams might have the proc_bind policy set to proc_bind_default. It
+    // doesn't matter, as we don't rebind master thread for any proc_bind policy
+    KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
+    break;
+
+  case proc_bind_master: {
+    int f;
+    int n_th = team->t.t_nproc;
+    for (f = 1; f < n_th; f++) {
+      kmp_info_t *th = team->t.t_threads[f];
+      KMP_DEBUG_ASSERT(th != NULL);
+      th->th.th_first_place = first_place;
+      th->th.th_last_place = last_place;
+      th->th.th_new_place = masters_place;
+      if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
+          team->t.t_display_affinity != 1) {
+        team->t.t_display_affinity = 1;
+      }
+
+      KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
+                     "partition = [%d,%d]\n",
+                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
+                     f, masters_place, first_place, last_place));
+    }
+  } break;
+
+  case proc_bind_close: {
+    int f;
+    int n_th = team->t.t_nproc;
+    int n_places;
+    if (first_place <= last_place) {
+      n_places = last_place - first_place + 1;
+    } else {
+      n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
+    }
+    if (n_th <= n_places) {
+      int place = masters_place;
+      for (f = 1; f < n_th; f++) {
+        kmp_info_t *th = team->t.t_threads[f];
+        KMP_DEBUG_ASSERT(th != NULL);
+
+        if (place == last_place) {
+          place = first_place;
+        } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
+          place = 0;
+        } else {
+          place++;
+        }
+        th->th.th_first_place = first_place;
+        th->th.th_last_place = last_place;
+        th->th.th_new_place = place;
+        if (__kmp_display_affinity && place != th->th.th_current_place &&
+            team->t.t_display_affinity != 1) {
+          team->t.t_display_affinity = 1;
+        }
+
+        KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
+                       "partition = [%d,%d]\n",
+                       __kmp_gtid_from_thread(team->t.t_threads[f]),
+                       team->t.t_id, f, place, first_place, last_place));
+      }
+    } else {
+      int S, rem, gap, s_count;
+      S = n_th / n_places;
+      s_count = 0;
+      rem = n_th - (S * n_places);
+      gap = rem > 0 ? n_places / rem : n_places;
+      int place = masters_place;
+      int gap_ct = gap;
+      for (f = 0; f < n_th; f++) {
+        kmp_info_t *th = team->t.t_threads[f];
+        KMP_DEBUG_ASSERT(th != NULL);
+
+        th->th.th_first_place = first_place;
+        th->th.th_last_place = last_place;
+        th->th.th_new_place = place;
+        if (__kmp_display_affinity && place != th->th.th_current_place &&
+            team->t.t_display_affinity != 1) {
+          team->t.t_display_affinity = 1;
+        }
+        s_count++;
+
+        if ((s_count == S) && rem && (gap_ct == gap)) {
+          // do nothing, add an extra thread to place on next iteration
+        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
+          // we added an extra thread to this place; move to next place
+          if (place == last_place) {
+            place = first_place;
+          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
+            place = 0;
+          } else {
+            place++;
+          }
+          s_count = 0;
+          gap_ct = 1;
+          rem--;
+        } else if (s_count == S) { // place full; don't add extra
+          if (place == last_place) {
+            place = first_place;
+          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
+            place = 0;
+          } else {
+            place++;
+          }
+          gap_ct++;
+          s_count = 0;
+        }
+
+        KA_TRACE(100,
+                 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
+                  "partition = [%d,%d]\n",
+                  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
+                  th->th.th_new_place, first_place, last_place));
+      }
+      KMP_DEBUG_ASSERT(place == masters_place);
+    }
+  } break;
+
+  case proc_bind_spread: {
+    int f;
+    int n_th = team->t.t_nproc;
+    int n_places;
+    int thidx;
+    if (first_place <= last_place) {
+      n_places = last_place - first_place + 1;
+    } else {
+      n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
+    }
+    if (n_th <= n_places) {
+      int place = -1;
+
+      if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
+        int S = n_places / n_th;
+        int s_count, rem, gap, gap_ct;
+
+        place = masters_place;
+        rem = n_places - n_th * S;
+        gap = rem ? n_th / rem : 1;
+        gap_ct = gap;
+        thidx = n_th;
+        if (update_master_only == 1)
+          thidx = 1;
+        for (f = 0; f < thidx; f++) {
+          kmp_info_t *th = team->t.t_threads[f];
+          KMP_DEBUG_ASSERT(th != NULL);
+
+          th->th.th_first_place = place;
+          th->th.th_new_place = place;
+          if (__kmp_display_affinity && place != th->th.th_current_place &&
+              team->t.t_display_affinity != 1) {
+            team->t.t_display_affinity = 1;
+          }
+          s_count = 1;
+          while (s_count < S) {
+            if (place == last_place) {
+              place = first_place;
+            } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
+              place = 0;
+            } else {
+              place++;
+            }
+            s_count++;
+          }
+          if (rem && (gap_ct == gap)) {
+            if (place == last_place) {
+              place = first_place;
+            } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
+              place = 0;
+            } else {
+              place++;
+            }
+            rem--;
+            gap_ct = 0;
+          }
+          th->th.th_last_place = place;
+          gap_ct++;
+
+          if (place == last_place) {
+            place = first_place;
+          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
+            place = 0;
+          } else {
+            place++;
+          }
+
+          KA_TRACE(100,
+                   ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
+                    "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
+                    __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
+                    f, th->th.th_new_place, th->th.th_first_place,
+                    th->th.th_last_place, __kmp_affinity_num_masks));
+        }
+      } else {
+        /* Having uniform space of available computation places I can create
+           T partitions of round(P/T) size and put threads into the first
+           place of each partition. */
+        double current = static_cast<double>(masters_place);
+        double spacing =
+            (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
+        int first, last;
+        kmp_info_t *th;
+
+        thidx = n_th + 1;
+        if (update_master_only == 1)
+          thidx = 1;
+        for (f = 0; f < thidx; f++) {
+          first = static_cast<int>(current);
+          last = static_cast<int>(current + spacing) - 1;
+          KMP_DEBUG_ASSERT(last >= first);
+          if (first >= n_places) {
+            if (masters_place) {
+              first -= n_places;
+              last -= n_places;
+              if (first == (masters_place + 1)) {
+                KMP_DEBUG_ASSERT(f == n_th);
+                first--;
+              }
+              if (last == masters_place) {
+                KMP_DEBUG_ASSERT(f == (n_th - 1));
+                last--;
+              }
+            } else {
+              KMP_DEBUG_ASSERT(f == n_th);
+              first = 0;
+              last = 0;
+            }
+          }
+          if (last >= n_places) {
+            last = (n_places - 1);
+          }
+          place = first;
+          current += spacing;
+          if (f < n_th) {
+            KMP_DEBUG_ASSERT(0 <= first);
+            KMP_DEBUG_ASSERT(n_places > first);
+            KMP_DEBUG_ASSERT(0 <= last);
+            KMP_DEBUG_ASSERT(n_places > last);
+            KMP_DEBUG_ASSERT(last_place >= first_place);
+            th = team->t.t_threads[f];
+            KMP_DEBUG_ASSERT(th);
+            th->th.th_first_place = first;
+            th->th.th_new_place = place;
+            th->th.th_last_place = last;
+            if (__kmp_display_affinity && place != th->th.th_current_place &&
+                team->t.t_display_affinity != 1) {
+              team->t.t_display_affinity = 1;
+            }
+            KA_TRACE(100,
+                     ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
+                      "partition = [%d,%d], spacing = %.4f\n",
+                      __kmp_gtid_from_thread(team->t.t_threads[f]),
+                      team->t.t_id, f, th->th.th_new_place,
+                      th->th.th_first_place, th->th.th_last_place, spacing));
+          }
+        }
+      }
+      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
+    } else {
+      int S, rem, gap, s_count;
+      S = n_th / n_places;
+      s_count = 0;
+      rem = n_th - (S * n_places);
+      gap = rem > 0 ? n_places / rem : n_places;
+      int place = masters_place;
+      int gap_ct = gap;
+      thidx = n_th;
+      if (update_master_only == 1)
+        thidx = 1;
+      for (f = 0; f < thidx; f++) {
+        kmp_info_t *th = team->t.t_threads[f];
+        KMP_DEBUG_ASSERT(th != NULL);
+
+        th->th.th_first_place = place;
+        th->th.th_last_place = place;
+        th->th.th_new_place = place;
+        if (__kmp_display_affinity && place != th->th.th_current_place &&
+            team->t.t_display_affinity != 1) {
+          team->t.t_display_affinity = 1;
+        }
+        s_count++;
+
+        if ((s_count == S) && rem && (gap_ct == gap)) {
+          // do nothing, add an extra thread to place on next iteration
+        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
+          // we added an extra thread to this place; move on to next place
+          if (place == last_place) {
+            place = first_place;
+          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
+            place = 0;
+          } else {
+            place++;
+          }
+          s_count = 0;
+          gap_ct = 1;
+          rem--;
+        } else if (s_count == S) { // place is full; don't add extra thread
+          if (place == last_place) {
+            place = first_place;
+          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
+            place = 0;
+          } else {
+            place++;
+          }
+          gap_ct++;
+          s_count = 0;
+        }
+
+        KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
+                       "partition = [%d,%d]\n",
+                       __kmp_gtid_from_thread(team->t.t_threads[f]),
+                       team->t.t_id, f, th->th.th_new_place,
+                       th->th.th_first_place, th->th.th_last_place));
+      }
+      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
+    }
+  } break;
+
+  default:
+    break;
+  }
+
+  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
+}
+
+#endif // KMP_AFFINITY_SUPPORTED
+
+/* Allocate a team data structure to run a parallel region on.
+ *
+ * Candidates are tried in this order:
+ *   1. the "hot" team (root->r.r_hot_team, or hot_teams[level].hot_team when
+ *      KMP_NESTED_HOT_TEAMS is enabled), resized in place to new_nproc
+ *      threads; only considered when new_nproc > 1;
+ *   2. the first team in __kmp_team_pool whose t_max_nproc >= max_nproc
+ *      (teams that are too small are reaped while searching);
+ *   3. a freshly allocated team.
+ *
+ * Parameters:
+ *   root               root whose hot team / uber thread ident are used
+ *   new_nproc          number of threads the team must have (>= 1)
+ *   max_nproc          required capacity of the team (>= new_nproc)
+ *   ompt_parallel_data (OMPT_SUPPORT only) handed to __ompt_team_assign_id()
+ *   new_proc_bind      value stored in team->t.t_proc_bind
+ *   new_icvs           ICVs handed to the team (re)initialization routines
+ *   argc               number of argument slots to reserve (>= 0)
+ *   master             (nested hot teams only) thread whose th_hot_teams
+ *                      table is consulted to pick the hot team for its level
+ *
+ * Returns the team that was selected or created. */
+kmp_team_t *
+__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
+#if OMPT_SUPPORT
+                    ompt_data_t ompt_parallel_data,
+#endif
+                    kmp_proc_bind_t new_proc_bind,
+                    kmp_internal_control_t *new_icvs,
+                    int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
+  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
+  int f;
+  kmp_team_t *team;
+  int use_hot_team = !root->r.r_active;
+  int level = 0;
+
+  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
+  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
+  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
+  KMP_MB();
+
+#if KMP_NESTED_HOT_TEAMS
+  // Work out which nesting level this request belongs to and whether a hot
+  // team already exists for that level.
+  kmp_hot_team_ptr_t *hot_teams;
+  if (master) {
+    team = master->th.th_team;
+    level = team->t.t_active_level;
+    if (master->th.th_teams_microtask) { // in teams construct?
+      if (master->th.th_teams_size.nteams > 1 &&
+          ( // #teams > 1
+              team->t.t_pkfn ==
+                  (microtask_t)__kmp_teams_master || // inner fork of the teams
+              master->th.th_teams_level <
+                  team->t.t_level)) { // or nested parallel inside the teams
+        ++level; // not increment if #teams==1, or for outer fork of the teams;
+        // increment otherwise
+      }
+    }
+    hot_teams = master->th.th_hot_teams;
+    if (level < __kmp_hot_teams_max_level && hot_teams &&
+        hot_teams[level]
+            .hot_team) { // hot team has already been allocated for given level
+      use_hot_team = 1;
+    } else {
+      use_hot_team = 0;
+    }
+  }
+#endif
+  // Optimization to use a "hot" team
+  if (use_hot_team && new_nproc > 1) {
+    KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
+#if KMP_NESTED_HOT_TEAMS
+    team = hot_teams[level].hot_team;
+#else
+    team = root->r.r_hot_team;
+#endif
+#if KMP_DEBUG
+    if (__kmp_tasking_mode != tskm_immediate_exec) {
+      KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
+                    "task_team[1] = %p before reinit\n",
+                    team->t.t_task_team[0], team->t.t_task_team[1]));
+    }
+#endif
+
+    // Has the number of threads changed?
+    /* Let's assume the most common case is that the number of threads is
+       unchanged, and put that case first. */
+    if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
+      KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
+      // This case can mean that omp_set_num_threads() was called and the hot
+      // team size was already reduced, so we check the special flag
+      if (team->t.t_size_changed == -1) {
+        team->t.t_size_changed = 1;
+      } else {
+        KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
+      }
+
+      // TODO???: team->t.t_max_active_levels = new_max_active_levels;
+      kmp_r_sched_t new_sched = new_icvs->sched;
+      // set master's schedule as new run-time schedule
+      KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
+
+      __kmp_reinitialize_team(team, new_icvs,
+                              root->r.r_uber_thread->th.th_ident);
+
+      KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
+                    team->t.t_threads[0], team));
+      __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
+
+#if KMP_AFFINITY_SUPPORTED
+      // Same size and same binding policy: only "spread" needs the master's
+      // partition refreshed; otherwise re-partition all places.
+      if ((team->t.t_size_changed == 0) &&
+          (team->t.t_proc_bind == new_proc_bind)) {
+        if (new_proc_bind == proc_bind_spread) {
+          __kmp_partition_places(
+              team, 1); // add flag to update only master for spread
+        }
+        KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
+                       "proc_bind = %d, partition = [%d,%d]\n",
+                       team->t.t_id, new_proc_bind, team->t.t_first_place,
+                       team->t.t_last_place));
+      } else {
+        KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
+        __kmp_partition_places(team);
+      }
+#else
+      KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
+#endif /* KMP_AFFINITY_SUPPORTED */
+    } else if (team->t.t_nproc > new_nproc) {
+      KA_TRACE(20,
+               ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
+                new_nproc));
+
+      team->t.t_size_changed = 1;
+#if KMP_NESTED_HOT_TEAMS
+      if (__kmp_hot_teams_mode == 0) {
+        // AC: saved number of threads should correspond to team's value in this
+        // mode, can be bigger in mode 1, when hot team has threads in reserve
+        KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
+        hot_teams[level].hot_team_nth = new_nproc;
+#endif // KMP_NESTED_HOT_TEAMS
+        /* release the extra threads we don't need any more */
+        for (f = new_nproc; f < team->t.t_nproc; f++) {
+          KMP_DEBUG_ASSERT(team->t.t_threads[f]);
+          if (__kmp_tasking_mode != tskm_immediate_exec) {
+            // When decreasing team size, threads no longer in the team should
+            // unref task team.
+            team->t.t_threads[f]->th.th_task_team = NULL;
+          }
+          __kmp_free_thread(team->t.t_threads[f]);
+          team->t.t_threads[f] = NULL;
+        }
+#if KMP_NESTED_HOT_TEAMS
+      } // (__kmp_hot_teams_mode == 0)
+      else {
+        // When keeping extra threads in team, switch threads to wait on own
+        // b_go flag
+        for (f = new_nproc; f < team->t.t_nproc; ++f) {
+          KMP_DEBUG_ASSERT(team->t.t_threads[f]);
+          kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
+          for (int b = 0; b < bs_last_barrier; ++b) {
+            if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
+              balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
+            }
+            KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
+          }
+        }
+      }
+#endif // KMP_NESTED_HOT_TEAMS
+      team->t.t_nproc = new_nproc;
+      // TODO???: team->t.t_max_active_levels = new_max_active_levels;
+      KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
+      __kmp_reinitialize_team(team, new_icvs,
+                              root->r.r_uber_thread->th.th_ident);
+
+      // Update remaining threads
+      for (f = 0; f < new_nproc; ++f) {
+        team->t.t_threads[f]->th.th_team_nproc = new_nproc;
+      }
+
+      // restore the current task state of the master thread: should be the
+      // implicit task
+      KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
+                    team->t.t_threads[0], team));
+
+      __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
+
+#ifdef KMP_DEBUG
+      for (f = 0; f < team->t.t_nproc; f++) {
+        KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
+                         team->t.t_threads[f]->th.th_team_nproc ==
+                             team->t.t_nproc);
+      }
+#endif
+
+      KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
+#if KMP_AFFINITY_SUPPORTED
+      __kmp_partition_places(team);
+#endif
+    } else { // team->t.t_nproc < new_nproc
+      KA_TRACE(20,
+               ("__kmp_allocate_team: increasing hot team thread count to %d\n",
+                new_nproc));
+
+      team->t.t_size_changed = 1;
+
+#if KMP_NESTED_HOT_TEAMS
+      int avail_threads = hot_teams[level].hot_team_nth;
+      if (new_nproc < avail_threads)
+        avail_threads = new_nproc;
+      kmp_info_t **other_threads = team->t.t_threads;
+      for (f = team->t.t_nproc; f < avail_threads; ++f) {
+        // Adjust barrier data of reserved threads (if any) of the team
+        // Other data will be set in __kmp_initialize_info() below.
+        int b;
+        kmp_balign_t *balign = other_threads[f]->th.th_bar;
+        for (b = 0; b < bs_last_barrier; ++b) {
+          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
+          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
+#if USE_DEBUGGER
+          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
+#endif
+        }
+      }
+      if (hot_teams[level].hot_team_nth >= new_nproc) {
+        // we have all needed threads in reserve, no need to allocate any
+        // this only possible in mode 1, cannot have reserved threads in mode 0
+        KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
+        team->t.t_nproc = new_nproc; // just get reserved threads involved
+      } else {
+        // we may have some threads in reserve, but not enough
+        team->t.t_nproc =
+            hot_teams[level]
+                .hot_team_nth; // get reserved threads involved if any
+        hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
+#endif // KMP_NESTED_HOT_TEAMS
+        if (team->t.t_max_nproc < new_nproc) {
+          /* reallocate larger arrays */
+          __kmp_reallocate_team_arrays(team, new_nproc);
+          __kmp_reinitialize_team(team, new_icvs, NULL);
+        }
+
+#if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
+        /* Temporarily set full mask for master thread before creation of
+           workers. The reason is that workers inherit the affinity from master,
+           so if a lot of workers are created on the single core quickly, they
+           don't get a chance to set their own affinity for a long time. */
+        // The scratch mask is allocated here, in the same scope that frees it
+        // below, so it is not leaked on the path above where reserved threads
+        // already satisfy the request and no worker is created. It is
+        // NULL-initialized so no indeterminate pointer is passed along when
+        // affinity is not capable.
+        kmp_affin_mask_t *old_mask = NULL;
+        if (KMP_AFFINITY_CAPABLE()) {
+          KMP_CPU_ALLOC(old_mask);
+        }
+        __kmp_set_thread_affinity_mask_full_tmp(old_mask);
+#endif
+
+        /* allocate new threads for the hot team */
+        for (f = team->t.t_nproc; f < new_nproc; f++) {
+          kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
+          KMP_DEBUG_ASSERT(new_worker);
+          team->t.t_threads[f] = new_worker;
+
+          // Format has one conversion per argument: team id, gtid, (team id :
+          // tid), then the two 64-bit barrier arrival counters.
+          KA_TRACE(20,
+                   ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: "
+                    "join=%llu, plain=%llu\n",
+                    team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
+                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
+                    team->t.t_bar[bs_plain_barrier].b_arrived));
+
+          { // Initialize barrier data for new threads.
+            int b;
+            kmp_balign_t *balign = new_worker->th.th_bar;
+            for (b = 0; b < bs_last_barrier; ++b) {
+              balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
+              KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
+                               KMP_BARRIER_PARENT_FLAG);
+#if USE_DEBUGGER
+              balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
+#endif
+            }
+          }
+        }
+
+#if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
+        if (KMP_AFFINITY_CAPABLE()) {
+          /* Restore initial master thread's affinity mask */
+          __kmp_set_system_affinity(old_mask, TRUE);
+          KMP_CPU_FREE(old_mask);
+        }
+#endif
+#if KMP_NESTED_HOT_TEAMS
+      } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
+#endif // KMP_NESTED_HOT_TEAMS
+      /* make sure everyone is synchronized */
+      int old_nproc = team->t.t_nproc; // save old value and use to update only
+      // new threads below
+      __kmp_initialize_team(team, new_nproc, new_icvs,
+                            root->r.r_uber_thread->th.th_ident);
+
+      /* reinitialize the threads */
+      KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
+      for (f = 0; f < team->t.t_nproc; ++f)
+        __kmp_initialize_info(team->t.t_threads[f], team, f,
+                              __kmp_gtid_from_tid(f, team));
+
+      if (level) { // set th_task_state for new threads in nested hot team
+        // __kmp_initialize_info() no longer zeroes th_task_state, so we should
+        // only need to set the th_task_state for the new threads. th_task_state
+        // for master thread will not be accurate until after this in
+        // __kmp_fork_call(), so we look to the master's memo_stack to get the
+        // correct value.
+        for (f = old_nproc; f < team->t.t_nproc; ++f)
+          team->t.t_threads[f]->th.th_task_state =
+              team->t.t_threads[0]->th.th_task_state_memo_stack[level];
+      } else { // set th_task_state for new threads in non-nested hot team
+        int old_state =
+            team->t.t_threads[0]->th.th_task_state; // copy master's state
+        for (f = old_nproc; f < team->t.t_nproc; ++f)
+          team->t.t_threads[f]->th.th_task_state = old_state;
+      }
+
+#ifdef KMP_DEBUG
+      for (f = 0; f < team->t.t_nproc; ++f) {
+        KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
+                         team->t.t_threads[f]->th.th_team_nproc ==
+                             team->t.t_nproc);
+      }
+#endif
+
+      KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
+#if KMP_AFFINITY_SUPPORTED
+      __kmp_partition_places(team);
+#endif
+    } // Check changes in number of threads
+
+    kmp_info_t *master = team->t.t_threads[0];
+    if (master->th.th_teams_microtask) {
+      for (f = 1; f < new_nproc; ++f) {
+        // propagate teams construct specific info to workers
+        kmp_info_t *thr = team->t.t_threads[f];
+        thr->th.th_teams_microtask = master->th.th_teams_microtask;
+        thr->th.th_teams_level = master->th.th_teams_level;
+        thr->th.th_teams_size = master->th.th_teams_size;
+      }
+    }
+#if KMP_NESTED_HOT_TEAMS
+    if (level) {
+      // Sync barrier state for nested hot teams, not needed for outermost hot
+      // team.
+      for (f = 1; f < new_nproc; ++f) {
+        kmp_info_t *thr = team->t.t_threads[f];
+        int b;
+        kmp_balign_t *balign = thr->th.th_bar;
+        for (b = 0; b < bs_last_barrier; ++b) {
+          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
+          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
+#if USE_DEBUGGER
+          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
+#endif
+        }
+      }
+    }
+#endif // KMP_NESTED_HOT_TEAMS
+
+    /* reallocate space for arguments if necessary */
+    __kmp_alloc_argv_entries(argc, team, TRUE);
+    KMP_CHECK_UPDATE(team->t.t_argc, argc);
+    // The hot team re-uses the previous task team,
+    // if untouched during the previous release->gather phase.
+
+    KF_TRACE(10, (" hot_team = %p\n", team));
+
+#if KMP_DEBUG
+    if (__kmp_tasking_mode != tskm_immediate_exec) {
+      KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
+                    "task_team[1] = %p after reinit\n",
+                    team->t.t_task_team[0], team->t.t_task_team[1]));
+    }
+#endif
+
+#if OMPT_SUPPORT
+    __ompt_team_assign_id(team, ompt_parallel_data);
+#endif
+
+    KMP_MB();
+
+    return team;
+  }
+
+  /* next, let's try to take one from the team pool */
+  KMP_MB();
+  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
+    /* TODO: consider resizing undersized teams instead of reaping them, now
+       that we have a resizing mechanism */
+    if (team->t.t_max_nproc >= max_nproc) {
+      /* take this team from the team pool */
+      __kmp_team_pool = team->t.t_next_pool;
+
+      /* setup the team for fresh use */
+      __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
+
+      KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
+                    "task_team[1] %p to NULL\n",
+                    &team->t.t_task_team[0], &team->t.t_task_team[1]));
+      team->t.t_task_team[0] = NULL;
+      team->t.t_task_team[1] = NULL;
+
+      /* reallocate space for arguments if necessary */
+      __kmp_alloc_argv_entries(argc, team, TRUE);
+      KMP_CHECK_UPDATE(team->t.t_argc, argc);
+
+      KA_TRACE(
+          20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
+               team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
+      { // Initialize barrier data.
+        int b;
+        for (b = 0; b < bs_last_barrier; ++b) {
+          team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
+#if USE_DEBUGGER
+          team->t.t_bar[b].b_master_arrived = 0;
+          team->t.t_bar[b].b_team_arrived = 0;
+#endif
+        }
+      }
+
+      team->t.t_proc_bind = new_proc_bind;
+
+      KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
+                    team->t.t_id));
+
+#if OMPT_SUPPORT
+      __ompt_team_assign_id(team, ompt_parallel_data);
+#endif
+
+      KMP_MB();
+
+      return team;
+    }
+
+    /* reap team if it is too small, then loop back and check the next one */
+    // not sure if this is wise, but, will be redone during the hot-teams
+    // rewrite.
+    /* TODO: Use technique to find the right size hot-team, don't reap them */
+    team = __kmp_reap_team(team);
+    __kmp_team_pool = team;
+  }
+
+  /* nothing available in the pool, no matter, make a new team! */
+  KMP_MB();
+  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
+
+  /* and set it up */
+  team->t.t_max_nproc = max_nproc;
+  /* NOTE well, for some reason allocating one big buffer and dividing it up
+     seems to really hurt performance a lot on the P4, so, let's not use this */
+  __kmp_allocate_team_arrays(team, max_nproc);
+
+  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
+  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
+
+  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
+                "%p to NULL\n",
+                &team->t.t_task_team[0], &team->t.t_task_team[1]));
+  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
+  // memory, no need to duplicate
+  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
+  // memory, no need to duplicate
+
+  if (__kmp_storage_map) {
+    __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
+  }
+
+  /* allocate space for arguments */
+  __kmp_alloc_argv_entries(argc, team, FALSE);
+  team->t.t_argc = argc;
+
+  KA_TRACE(20,
+           ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
+            team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
+  { // Initialize barrier data.
+    int b;
+    for (b = 0; b < bs_last_barrier; ++b) {
+      team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
+#if USE_DEBUGGER
+      team->t.t_bar[b].b_master_arrived = 0;
+      team->t.t_bar[b].b_team_arrived = 0;
+#endif
+    }
+  }
+
+  team->t.t_proc_bind = new_proc_bind;
+
+#if OMPT_SUPPORT
+  __ompt_team_assign_id(team, ompt_parallel_data);
+  team->t.ompt_serialized_team_info = NULL;
+#endif
+
+  KMP_MB();
+
+  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
+                team->t.t_id));
+
+  return team;
+}
+
+/* TODO implement hot-teams at all levels */
+/* TODO implement lazy thread release on demand (disband request) */
+
+/* Give a team back once its parallel region is over.
+ *
+ * A hot team (root->r.r_hot_team, or the nested hot team recorded for the
+ * computed level when KMP_NESTED_HOT_TEAMS is enabled) keeps its threads; the
+ * only cleanup is popping the workers' CG root nodes when the first worker is
+ * itself a CG root. Any other team has its workers drained and freed and is
+ * then pushed onto the front of __kmp_team_pool. */
+void __kmp_free_team(kmp_root_t *root,
+                     kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
+  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
+                team->t.t_id));
+
+  /* sanity-check the arguments */
+  KMP_DEBUG_ASSERT(root);
+  KMP_DEBUG_ASSERT(team);
+  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
+  KMP_DEBUG_ASSERT(team->t.t_threads);
+
+  int use_hot_team = (root->r.r_hot_team == team);
+#if KMP_NESTED_HOT_TEAMS
+  int hot_level;
+  kmp_hot_team_ptr_t *hot_team_tab;
+  if (master) {
+    hot_level = team->t.t_active_level - 1;
+    if (master->th.th_teams_microtask) { // inside a teams construct
+      // level was not increased in teams construct for team_of_masters
+      if (master->th.th_teams_size.nteams > 1)
+        ++hot_level;
+      // level was not increased in teams construct for team_of_workers before
+      // the parallel; team->t.t_level will be increased inside parallel
+      if (master->th.th_teams_level == team->t.t_level &&
+          team->t.t_pkfn != (microtask_t)__kmp_teams_master)
+        ++hot_level;
+    }
+    hot_team_tab = master->th.th_hot_teams;
+    if (hot_level < __kmp_hot_teams_max_level) {
+      KMP_DEBUG_ASSERT(team == hot_team_tab[hot_level].hot_team);
+      use_hot_team = 1;
+    }
+  }
+#endif // KMP_NESTED_HOT_TEAMS
+
+  /* the team has finished its work */
+  TCW_SYNC_PTR(team->t.t_pkfn,
+               NULL); // Important for Debugging Support Library.
+#if KMP_OS_WINDOWS
+  team->t.t_copyin_counter = 0; // reset so the team can be reused
+#endif
+  // The parent-team pointer is deliberately left alone for hot teams.
+
+  if (use_hot_team) {
+    // Hot team: was it created for the masters in a teams construct?
+    // Decide by checking whether the first worker is a CG root.
+    kmp_info_t *first_worker = team->t.t_threads[1];
+    KMP_DEBUG_ASSERT(first_worker && first_worker->th.th_cg_roots);
+    if (first_worker->th.th_cg_roots->cg_root == first_worker) {
+      // Strip the CG root node from every worker so the team can be re-used
+      for (int tid = 1; tid < team->t.t_nproc; ++tid) {
+        kmp_info_t *thr = team->t.t_threads[tid];
+        KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
+                         thr->th.th_cg_roots->cg_root == thr);
+        // Unlink the current CG root and step up to its parent
+        kmp_cg_root_t *popped = thr->th.th_cg_roots;
+        thr->th.th_cg_roots = popped->up;
+        KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
+                       " up to node %p. cg_nthreads was %d\n",
+                       thr, popped, thr->th.th_cg_roots, popped->cg_nthreads));
+        // The last thread leaving a contention group frees its node
+        if (popped->cg_nthreads-- == 1) {
+          __kmp_free(popped);
+        }
+        // Take the thread_limit of the CG root we moved up to, if any
+        kmp_cg_root_t *parent_cg = thr->th.th_cg_roots;
+        if (parent_cg)
+          thr->th.th_current_task->td_icvs.thread_limit =
+              parent_cg->cg_thread_limit;
+      }
+    }
+  } else {
+    /* ordinary team: let go of the worker threads */
+    if (__kmp_tasking_mode != tskm_immediate_exec) {
+      // Spin until each worker reports that it is safe to reap
+      for (int tid = 1; tid < team->t.t_nproc; ++tid) {
+        KMP_DEBUG_ASSERT(team->t.t_threads[tid]);
+        kmp_info_t *worker = team->t.t_threads[tid];
+        volatile kmp_uint32 *reap_state = &worker->th.th_reap_state;
+        while (*reap_state != KMP_SAFE_TO_REAP) {
+#if KMP_OS_WINDOWS
+          // A Windows thread may be killed at any moment; detect that
+          DWORD ecode;
+          if (!__kmp_is_thread_alive(worker, &ecode)) {
+            *reap_state = KMP_SAFE_TO_REAP; // dead thread: mark it reapable
+            break;
+          }
+#endif
+          // wake the worker up if it went to sleep on its fork/join b_go flag
+          kmp_flag_64 go_flag(&worker->th.th_bar[bs_forkjoin_barrier].bb.b_go,
+                              worker);
+          if (go_flag.is_sleeping())
+            go_flag.resume(__kmp_gtid_from_thread(worker));
+          KMP_CPU_PAUSE();
+        }
+      }
+
+      // Drop both task teams
+      for (int slot = 0; slot < 2; ++slot) {
+        kmp_task_team_t *task_team = team->t.t_task_team[slot];
+        if (task_team == NULL)
+          continue;
+        // every thread of the team stops referencing its task team
+        for (int tid = 0; tid < team->t.t_nproc; ++tid) {
+          KMP_DEBUG_ASSERT(team->t.t_threads[tid]);
+          team->t.t_threads[tid]->th.th_task_team = NULL;
+        }
+        KA_TRACE(
+            20,
+            ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
+             __kmp_get_gtid(), task_team, team->t.t_id));
+#if KMP_NESTED_HOT_TEAMS
+        __kmp_free_task_team(master, task_team);
+#endif
+        team->t.t_task_team[slot] = NULL;
+      }
+    }
+
+    // Only non-hot teams forget their parent and nesting levels.
+    team->t.t_parent = NULL;
+    team->t.t_level = 0;
+    team->t.t_active_level = 0;
+
+    /* hand every worker (tid >= 1) back */
+    for (int tid = 1; tid < team->t.t_nproc; ++tid) {
+      KMP_DEBUG_ASSERT(team->t.t_threads[tid]);
+      __kmp_free_thread(team->t.t_threads[tid]);
+      team->t.t_threads[tid] = NULL;
+    }
+
+    /* push the team onto the front of the team pool */
+    /* TODO limit size of team pool, call reap_team if pool too large */
+    team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
+    __kmp_team_pool = (volatile kmp_team_t *)team;
+  }
+
+  KMP_MB();
+}
+
+/* Reap the team: destroy it, reclaim all its resources and free its memory.
+   Returns the team's t_next_pool link, which is read before the team is freed */
+kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
+  // Validate the pointer before the first dereference.
+  KMP_DEBUG_ASSERT(team);
+  KMP_DEBUG_ASSERT(team->t.t_dispatch);
+  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
+  KMP_DEBUG_ASSERT(team->t.t_threads);
+  KMP_DEBUG_ASSERT(team->t.t_argv);
+  // Save the link now; `team` must not be touched after __kmp_free(team).
+  kmp_team_t *next_pool = team->t.t_next_pool;
+
+  /* TODO clean the threads that are a part of this? */
+  /* free the team arrays, then argv unless it is the inline buffer */
+  __kmp_free_team_arrays(team);
+  if (team->t.t_argv != &team->t.t_inline_argv[0])
+    __kmp_free((void *)team->t.t_argv);
+  __kmp_free(team);
+
+  KMP_MB();
+  return next_pool;
+}
+
+// Free the thread.  Don't reap it, just place it on the pool of available
+// threads.
+//
+// Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
+// binding for the affinity mechanism to be useful.
+//
+// Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
+// However, we want to avoid a potential performance problem by always
+// scanning through the list to find the correct point at which to insert
+// the thread (potential N**2 behavior).  To do this we keep track of the
+// last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
+// With single-level parallelism, threads will always be added to the tail
+// of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
+// parallelism, all bets are off and we may need to scan through the entire
+// free list.
+//
+// This change also has a potentially large performance benefit, for some
+// applications.  Previously, as threads were freed from the hot team, they
+// would be placed back on the free list in inverse order.  If the hot team
+// grew back to its original size, then the freed thread would be placed
+// back on the hot team in reverse order.  This could cause bad cache
+// locality problems on programs where the size of the hot team regularly
+// grew and shrunk.
+//
+// Now, for single-level parallelism, the OMP tid is always == gtid.
+void __kmp_free_thread(kmp_info_t *this_th) {
+  int gtid;
+  kmp_info_t **scan;
+
+  // Validate the pointer first: the trace below already dereferences it.
+  KMP_DEBUG_ASSERT(this_th);
+
+  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
+                __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
+  // When moving thread to pool, switch thread to wait on own b_go flag, and
+  // uninitialized (NULL team).
+  int b;
+  kmp_balign_t *balign = this_th->th.th_bar;
+  for (b = 0; b < bs_last_barrier; ++b) {
+    if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
+      balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
+    balign[b].bb.team = NULL;
+    balign[b].bb.leaf_kids = 0;
+  }
+  this_th->th.th_task_state = 0;
+  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
+
+  /* put thread back on the free pool */
+  TCW_PTR(this_th->th.th_team, NULL);
+  TCW_PTR(this_th->th.th_root, NULL);
+  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
+  // Detach the thread from its contention-group (CG) list; free emptied nodes.
+  while (this_th->th.th_cg_roots) {
+    this_th->th.th_cg_roots->cg_nthreads--;
+    KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
+                   " %p of thread  %p to %d\n",
+                   this_th, this_th->th.th_cg_roots,
+                   this_th->th.th_cg_roots->cg_root,
+                   this_th->th.th_cg_roots->cg_nthreads));
+    kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
+    if (tmp->cg_root == this_th) { // Thread is a cg_root
+      KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
+      KA_TRACE(
+          5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
+      this_th->th.th_cg_roots = tmp->up;
+      __kmp_free(tmp);
+    } else { // Worker thread
+      if (tmp->cg_nthreads == 0) { // last thread leaves contention group
+        __kmp_free(tmp);
+      }
+      this_th->th.th_cg_roots = NULL;
+      break;
+    }
+  }
+
+  /* If the implicit task assigned to this thread can be used by other threads
+   * -> multiple threads can share the data and try to free the task at
+   * __kmp_reap_thread at exit. This duplicate use of the task data can happen
+   * with higher probability when hot team is disabled but can occur even when
+   * the hot team is enabled */
+  __kmp_free_implicit_task(this_th);
+  this_th->th.th_current_task = NULL;
+
+  // If the __kmp_thread_pool_insert_pt is already past the new insert
+  // point, then we need to re-scan the entire list.
+  gtid = this_th->th.th_info.ds.ds_gtid;
+  if (__kmp_thread_pool_insert_pt != NULL) {
+    KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
+    if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
+      __kmp_thread_pool_insert_pt = NULL;
+    }
+  }
+
+  // Scan down the list to find the place to insert the thread.
+  // scan is the address of a link in the list, possibly the address of
+  // __kmp_thread_pool itself.
+  //
+  // In the absence of nested parallelism, the for loop will have 0 iterations.
+  if (__kmp_thread_pool_insert_pt != NULL) {
+    scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
+  } else {
+    scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
+  }
+  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
+       scan = &((*scan)->th.th_next_pool))
+    ;
+
+  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
+  // to its address.
+  TCW_PTR(this_th->th.th_next_pool, *scan);
+  __kmp_thread_pool_insert_pt = *scan = this_th;
+  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
+                   (this_th->th.th_info.ds.ds_gtid <
+                    this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
+  TCW_4(this_th->th.th_in_pool, TRUE);
+  __kmp_suspend_initialize_thread(this_th);
+  __kmp_lock_suspend_mx(this_th);
+  if (this_th->th.th_active == TRUE) {
+    KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
+    this_th->th.th_active_in_pool = TRUE;
+  }
+#if KMP_DEBUG
+  else {
+    KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
+  }
+#endif
+  __kmp_unlock_suspend_mx(this_th);
+
+  TCW_4(__kmp_nth, __kmp_nth - 1);
+
+#ifdef KMP_ADJUST_BLOCKTIME
+  /* Adjust blocktime back to user setting or default if necessary */
+  /* Middle initialization might never have occurred                */
+  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
+    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
+    if (__kmp_nth <= __kmp_avail_proc) {
+      __kmp_zero_bt = FALSE;
+    }
+  }
+#endif /* KMP_ADJUST_BLOCKTIME */
+
+  KMP_MB();
+}
+
+/* ------------------------------------------------------------------------ */
+
+void *__kmp_launch_thread(kmp_info_t *this_thr) {
+  int gtid = this_thr->th.th_info.ds.ds_gtid;
+  // Worker loop: wait at fork barrier, invoke microtask, join; until g_done.
+  kmp_team_t *(*volatile pteam);
+
+  KMP_MB();
+  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
+
+  if (__kmp_env_consistency_check) {
+    this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
+  }
+
+#if OMPT_SUPPORT
+  ompt_data_t *thread_data = NULL; // stays NULL if OMPT is not enabled
+  if (ompt_enabled.enabled) {
+    thread_data = &(this_thr->th.ompt_thread_info.thread_data);
+    *thread_data = ompt_data_none;
+
+    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
+    this_thr->th.ompt_thread_info.wait_id = 0;
+    this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
+    if (ompt_enabled.ompt_callback_thread_begin) {
+      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
+          ompt_thread_worker, thread_data);
+    }
+  }
+#endif
+
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+    this_thr->th.ompt_thread_info.state = ompt_state_idle;
+  }
+#endif
+  /* This is the place where threads wait for work */
+  while (!TCR_4(__kmp_global.g.g_done)) {
+    KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
+    KMP_MB();
+
+    /* wait for work to do */
+    KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
+
+    /* No tid yet since not part of a team */
+    __kmp_fork_barrier(gtid, KMP_GTID_DNE);
+
+#if OMPT_SUPPORT
+    if (ompt_enabled.enabled) {
+      this_thr->th.ompt_thread_info.state = ompt_state_overhead;
+    }
+#endif
+
+    pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
+
+    /* have we been allocated? */
+    if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
+      /* we were just woken up, so run our new task */
+      if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
+        int rc;
+        KA_TRACE(20,
+                 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
+                  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
+                  (*pteam)->t.t_pkfn));
+
+        updateHWFPControl(*pteam);
+
+#if OMPT_SUPPORT
+        if (ompt_enabled.enabled) {
+          this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
+        }
+#endif
+
+        rc = (*pteam)->t.t_invoke(gtid);
+        KMP_ASSERT(rc);
+
+        KMP_MB();
+        KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
+                      gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
+                      (*pteam)->t.t_pkfn));
+      }
+#if OMPT_SUPPORT
+      if (ompt_enabled.enabled) {
+        /* no frame set while outside task */
+        __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
+
+        this_thr->th.ompt_thread_info.state = ompt_state_overhead;
+      }
+#endif
+      /* join barrier after parallel region */
+      __kmp_join_barrier(gtid);
+    }
+  }
+  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
+
+#if OMPT_SUPPORT
+  if (ompt_enabled.ompt_callback_thread_end) {
+    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
+  }
+#endif
+
+  this_thr->th.th_task_team = NULL;
+  /* run the destructors for the threadprivate data for this thread */
+  __kmp_common_destroy_gtid(gtid);
+
+  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
+  KMP_MB();
+  return this_thr;
+}
+
+/* ------------------------------------------------------------------------ */
+
+void __kmp_internal_end_dest(void *specific_gtid) {
+#if KMP_COMPILER_ICC
+#pragma warning(push)
+#pragma warning(disable : 810) // conversion from "void *" to "int" may lose
+// significant bits
+#endif
+  // specific_gtid carries gtid+1 (see NOTE below); subtract 1 to recover gtid.
+  int gtid = (kmp_intptr_t)specific_gtid - 1;
+#if KMP_COMPILER_ICC
+#pragma warning(pop)
+#endif
+
+  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
+  /* Purpose: recover the gtid from the stored value and hand it to
+   * __kmp_internal_end_thread. NOTE: the gtid is stored as gtid+1 in the
+   * thread-local-storage because 0 is reserved for the nothing-stored case */
+  /* josh: One reason for setting the gtid specific data even when it is being
+     destroyed by pthread is to allow gtid lookup through thread specific data
+     (__kmp_gtid_get_specific).  Some of the code, especially stat code,
+     that gets executed in the call to __kmp_internal_end_thread, actually
+     gets the gtid through the thread specific data.  Setting it here seems
+     rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
+     to run smoothly.
+     todo: get rid of this after we remove the dependence on
+     __kmp_gtid_get_specific  */
+  if (gtid >= 0 && KMP_UBER_GTID(gtid))
+    __kmp_gtid_set_specific(gtid);
+#ifdef KMP_TDATA_GTID
+  __kmp_gtid = gtid;
+#endif
+  __kmp_internal_end_thread(gtid);
+}
+
+#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
+
+// 2009-09-08 (lev): It looks like the destructor does not work. In simple test
+// cases destructors work perfectly, but in real libomp.so I have no evidence it
+// is ever called. However, -fini linker option in makefile.mk works fine.
+// Both entry points below simply forward to __kmp_internal_end_atexit().
+__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
+  __kmp_internal_end_atexit();
+}
+
+void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
+
+#endif
+
+/* atexit handler: shuts the library down via __kmp_internal_end_library(-1).
+   [Windows] josh: when it is called, more than one thread may still be alive */
+void __kmp_internal_end_atexit(void) {
+  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
+  /* [Windows]
+     josh: ideally, we want to completely shutdown the library in this atexit
+     handler, but stat code that depends on thread specific data for gtid fails
+     because that data becomes unavailable at some point during the shutdown, so
+     we call __kmp_internal_end_thread instead. We should eventually remove the
+     dependency on __kmp_get_specific_gtid in the stat code and use
+     __kmp_internal_end_library to cleanly shutdown the library.
+     NOTE(review): stale? The code below calls __kmp_internal_end_library(-1).
+     // TODO: Can some of this comment about GVS be removed?
+     I suspect that the offending stat code is executed when the calling thread
+     tries to clean up a dead root thread's data structures, resulting in GVS
+     code trying to close the GVS structures for that thread, but since the stat
+     code uses __kmp_get_specific_gtid to get the gtid with the assumption that
+     the calling thread is cleaning up itself instead of another thread, it gets
+     confused. This happens because allowing a thread to unregister and cleanup
+     another thread is a recent modification for addressing an issue.
+     Based on the current design (20050722), a thread may end up
+     trying to unregister another thread only if thread death does not trigger
+     the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
+     thread specific data destructor function to detect thread death. For
+     Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
+     is nothing.  Thus, the workaround is applicable only for Windows static
+     stat library. */
+  __kmp_internal_end_library(-1);
+#if KMP_OS_WINDOWS
+  __kmp_close_console();
+#endif
+}
+
+static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
+  // Destroy `thread` and free its memory. For a non-root thread the OS thread
+  // is reaped first. It is assumed __kmp_forkjoin_lock is acquired.
+  int gtid;
+
+  KMP_DEBUG_ASSERT(thread != NULL);
+
+  gtid = thread->th.th_info.ds.ds_gtid;
+
+  if (!is_root) {
+    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
+      /* Assume the threads are at the fork barrier here */
+      KA_TRACE(
+          20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
+               gtid));
+      /* Need release fence here to prevent seg faults for tree forkjoin barrier
+       * (GEH) */
+      ANNOTATE_HAPPENS_BEFORE(thread);
+      kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
+      __kmp_release_64(&flag);
+    }
+
+    // Terminate OS thread.
+    __kmp_reap_worker(thread);
+
+    // The thread was killed asynchronously.  If it was actively
+    // spinning in the thread pool, decrement the global count.
+    //
+    // There is a small timing hole here - if the worker thread was just waking
+    // up after sleeping in the pool, had reset its th_active_in_pool flag but
+    // not decremented the global counter __kmp_thread_pool_active_nth yet, then
+    // the global counter might not get updated.
+    //
+    // Currently, this can only happen as the library is unloaded,
+    // so there are no harmful side effects.
+    if (thread->th.th_active_in_pool) {
+      thread->th.th_active_in_pool = FALSE;
+      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
+      KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
+    }
+  }
+
+  __kmp_free_implicit_task(thread);
+
+// Free the fast memory for tasking
+#if USE_FAST_MEMORY
+  __kmp_free_fast_memory(thread);
+#endif /* USE_FAST_MEMORY */
+
+  __kmp_suspend_uninitialize_thread(thread);
+
+  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
+  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
+
+  --__kmp_all_nth;
+// __kmp_nth was decremented when thread is added to the pool.
+
+#ifdef KMP_ADJUST_BLOCKTIME
+  /* Adjust blocktime back to user setting or default if necessary */
+  /* Middle initialization might never have occurred                */
+  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
+    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
+    if (__kmp_nth <= __kmp_avail_proc) {
+      __kmp_zero_bt = FALSE;
+    }
+  }
+#endif /* KMP_ADJUST_BLOCKTIME */
+
+  /* free the memory being used; each pointer is cleared after its free */
+  if (__kmp_env_consistency_check) {
+    if (thread->th.th_cons) {
+      __kmp_free_cons_stack(thread->th.th_cons);
+      thread->th.th_cons = NULL;
+    }
+  }
+
+  if (thread->th.th_pri_common != NULL) {
+    __kmp_free(thread->th.th_pri_common);
+    thread->th.th_pri_common = NULL;
+  }
+
+  if (thread->th.th_task_state_memo_stack != NULL) {
+    __kmp_free(thread->th.th_task_state_memo_stack);
+    thread->th.th_task_state_memo_stack = NULL;
+  }
+
+#if KMP_USE_BGET
+  if (thread->th.th_local.bget_data != NULL) {
+    __kmp_finalize_bget(thread);
+  }
+#endif
+
+#if KMP_AFFINITY_SUPPORTED
+  if (thread->th.th_affin_mask != NULL) {
+    KMP_CPU_FREE(thread->th.th_affin_mask);
+    thread->th.th_affin_mask = NULL;
+  }
+#endif /* KMP_AFFINITY_SUPPORTED */
+
+#if KMP_USE_HIER_SCHED
+  if (thread->th.th_hier_bar_data != NULL) {
+    __kmp_free(thread->th.th_hier_bar_data);
+    thread->th.th_hier_bar_data = NULL;
+  }
+#endif
+
+  __kmp_reap_team(thread->th.th_serial_team);
+  thread->th.th_serial_team = NULL;
+  __kmp_free(thread);
+
+  KMP_MB();
+
+} // __kmp_reap_thread
+
+static void __kmp_internal_end(void) {
+  int i;
+  // Shutdown worker: if some root is still active, only the monitor (if any)
+  // is reaped; otherwise pooled threads and teams too. First, unregister.
+  __kmp_unregister_library();
+
+#if KMP_OS_WINDOWS
+  /* In Win static library, we can't tell when a root actually dies, so we
+     reclaim the data structures for any root threads that have died but not
+     unregistered themselves, in order to shut down cleanly.
+     In Win dynamic library we also can't tell when a thread dies.  */
+  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
+// dead roots
+#endif
+
+  for (i = 0; i < __kmp_threads_capacity; i++)
+    if (__kmp_root[i])
+      if (__kmp_root[i]->r.r_active)
+        break;
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
+
+  if (i < __kmp_threads_capacity) {
+#if KMP_USE_MONITOR
+    // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
+    KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+    // Need to check that monitor was initialized before reaping it. If we are
+    // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
+    // __kmp_monitor will appear to contain valid data, but it is only valid in
+    // the parent process, not the child.
+    // New behavior (201008): instead of keying off of the flag
+    // __kmp_init_parallel, the monitor thread creation is keyed off
+    // of the new flag __kmp_init_monitor.
+    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
+    if (TCR_4(__kmp_init_monitor)) {
+      __kmp_reap_monitor(&__kmp_monitor);
+      TCW_4(__kmp_init_monitor, 0);
+    }
+    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
+    KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
+#endif // KMP_USE_MONITOR
+  } else {
+/* TODO move this to cleanup code */
+#ifdef KMP_DEBUG
+    /* make sure that everything has properly ended */
+    for (i = 0; i < __kmp_threads_capacity; i++) {
+      if (__kmp_root[i]) {
+        //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
+        //                    there can be uber threads alive here
+        KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
+      }
+    }
+#endif
+
+    KMP_MB();
+
+    // Reap the worker threads.
+    // This is valid for now, but be careful if threads are reaped sooner.
+    while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
+      // Get the next thread from the pool.
+      kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
+      __kmp_thread_pool = thread->th.th_next_pool;
+      // Reap it.
+      KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
+      thread->th.th_next_pool = NULL;
+      thread->th.th_in_pool = FALSE;
+      __kmp_reap_thread(thread, 0);
+    }
+    __kmp_thread_pool_insert_pt = NULL;
+
+    // Reap teams.
+    while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
+      // Get the next team from the pool.
+      kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
+      __kmp_team_pool = team->t.t_next_pool;
+      // Reap it.
+      team->t.t_next_pool = NULL;
+      __kmp_reap_team(team);
+    }
+
+    __kmp_reap_task_teams();
+
+#if KMP_OS_UNIX
+    // Threads that are not reaped should not access any resources since they
+    // are going to be deallocated soon, so the shutdown sequence should wait
+    // until all threads either exit the final spin-waiting loop or begin
+    // sleeping after the given blocktime.
+    for (i = 0; i < __kmp_threads_capacity; i++) {
+      kmp_info_t *thr = __kmp_threads[i];
+      while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
+        KMP_CPU_PAUSE();
+    }
+#endif
+
+    for (i = 0; i < __kmp_threads_capacity; ++i) {
+      // TBD: Add some checking... (this loop body is currently empty)
+      // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
+    }
+
+    /* Make sure all threadprivate destructors get run by joining with all
+       worker threads before resetting this flag */
+    TCW_SYNC_4(__kmp_init_common, FALSE);
+
+    KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
+    KMP_MB();
+
+#if KMP_USE_MONITOR
+    // See note above: One of the possible fixes for CQ138434 / CQ140126
+    //
+    // FIXME: push both code fragments down and CSE them?
+    // push them into __kmp_cleanup() ?
+    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
+    if (TCR_4(__kmp_init_monitor)) {
+      __kmp_reap_monitor(&__kmp_monitor);
+      TCW_4(__kmp_init_monitor, 0);
+    }
+    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
+    KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
+#endif
+  } /* else !__kmp_global.t_active */
+  TCW_4(__kmp_init_gtid, FALSE);
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+  __kmp_cleanup();
+#if OMPT_SUPPORT
+  ompt_fini();
+#endif
+}
+
+void __kmp_internal_end_library(int gtid_req) {
+  /* Library-wide shutdown entry; ends in __kmp_internal_end() under both locks.
+     If we have already cleaned up, don't try again, it wouldn't be pretty.
+     This shouldn't be a race condition because __kmp_internal_end() is the
+     only place to clear __kmp_init_serial; we check again after the lock. */
+  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
+  // redundant, because the next check will work in any case.
+  if (__kmp_global.g.g_abort) {
+    KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
+    /* TODO abort? */
+    return;
+  }
+  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
+    KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
+    return;
+  }
+
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+  /* find out who we are and what we should do */
+  {
+    int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
+    KA_TRACE(
+        10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
+    if (gtid == KMP_GTID_SHUTDOWN) {
+      KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
+                    "already shutdown\n"));
+      return;
+    } else if (gtid == KMP_GTID_MONITOR) {
+      KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
+                    "registered, or system shutdown\n"));
+      return;
+    } else if (gtid == KMP_GTID_DNE) {
+      KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
+                    "shutdown\n"));
+      /* we don't know who we are, but we may still shutdown the library */
+    } else if (KMP_UBER_GTID(gtid)) {
+      /* unregister ourselves as an uber thread.  gtid is no longer valid */
+      if (__kmp_root[gtid]->r.r_active) {
+        __kmp_global.g.g_abort = -1;
+        TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
+        KA_TRACE(10,
+                 ("__kmp_internal_end_library: root still active, abort T#%d\n",
+                  gtid));
+        return;
+      } else {
+        KA_TRACE(
+            10,
+            ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
+        __kmp_unregister_root_current_thread(gtid);
+      }
+    } else {
+/* worker threads may call this function through the atexit handler, if they
+ * call exit() */
+/* For now, skip the usual subsequent processing and just dump the debug buffer.
+   TODO: do a thorough shutdown instead */
+#ifdef DUMP_DEBUG_ON_EXIT
+      if (__kmp_debug_buf)
+        __kmp_dump_debug_buffer();
+#endif
+      return;
+    }
+  }
+  /* synchronize the termination process */
+  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
+
+  /* have we already finished */
+  if (__kmp_global.g.g_abort) {
+    KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
+    /* TODO abort? */
+    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
+    return;
+  }
+  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
+    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
+    return;
+  }
+
+  /* We need this lock to enforce mutex between this reading of
+     __kmp_threads_capacity and the writing by __kmp_register_root.
+     Alternatively, we can use a counter of roots that is atomically updated by
+     __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
+     __kmp_internal_end_*.  */
+  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
+
+  /* now we can safely conduct the actual termination */
+  __kmp_internal_end();
+
+  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
+  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
+
+  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
+
+#ifdef DUMP_DEBUG_ON_EXIT
+  if (__kmp_debug_buf)
+    __kmp_dump_debug_buffer();
+#endif
+
+#if KMP_OS_WINDOWS
+  __kmp_close_console();
+#endif
+
+  __kmp_fini_allocator();
+
+} // __kmp_internal_end_library
+
+void __kmp_internal_end_thread(int gtid_req) {
+  int i;
+
+  /* Per-thread shutdown entry; workers and unknown gtids return early.
+     If we have already cleaned up, don't try again, it wouldn't be pretty.
+     This shouldn't be a race condition because __kmp_internal_end() is the
+     only place to clear __kmp_init_serial; we check again after the lock. */
+  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
+  // redundant, because the next check will work in any case.
+  if (__kmp_global.g.g_abort) {
+    KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
+    /* TODO abort? */
+    return;
+  }
+  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
+    KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
+    return;
+  }
+
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+  /* find out who we are and what we should do */
+  {
+    int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
+    KA_TRACE(10,
+             ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
+    if (gtid == KMP_GTID_SHUTDOWN) {
+      KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
+                    "already shutdown\n"));
+      return;
+    } else if (gtid == KMP_GTID_MONITOR) {
+      KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
+                    "registered, or system shutdown\n"));
+      return;
+    } else if (gtid == KMP_GTID_DNE) {
+      KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
+                    "shutdown\n"));
+      return;
+      /* we don't know who we are */
+    } else if (KMP_UBER_GTID(gtid)) {
+      /* unregister ourselves as an uber thread.  gtid is no longer valid */
+      if (__kmp_root[gtid]->r.r_active) {
+        __kmp_global.g.g_abort = -1;
+        TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
+        KA_TRACE(10,
+                 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
+                  gtid));
+        return;
+      } else {
+        KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
+                      gtid));
+        __kmp_unregister_root_current_thread(gtid);
+      }
+    } else {
+      /* just a worker thread, let's leave */
+      KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
+
+      if (gtid >= 0) {
+        __kmp_threads[gtid]->th.th_task_team = NULL;
+      }
+
+      KA_TRACE(10,
+               ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
+                gtid));
+      return;
+    }
+  }
+#if KMP_DYNAMIC_LIB
+  if (__kmp_pause_status != kmp_hard_paused)
+  // AC: lets not shutdown the dynamic library at the exit of uber thread,
+  // because we will better shutdown later in the library destructor.
+  {
+    KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
+    return;
+  }
+#endif
+  /* synchronize the termination process */
+  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
+
+  /* have we already finished */
+  if (__kmp_global.g.g_abort) {
+    KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
+    /* TODO abort? */
+    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
+    return;
+  }
+  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
+    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
+    return;
+  }
+
+  /* We need this lock to enforce mutex between this reading of
+     __kmp_threads_capacity and the writing by __kmp_register_root.
+     Alternatively, we can use a counter of roots that is atomically updated by
+     __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
+     __kmp_internal_end_*.  */
+
+  /* should we finish the run-time?  are all siblings done? */
+  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
+
+  for (i = 0; i < __kmp_threads_capacity; ++i) {
+    if (KMP_UBER_GTID(i)) {
+      KA_TRACE(
+          10,
+          ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
+      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
+      __kmp_release_bootstrap_lock(&__kmp_initz_lock);
+      return;
+    }
+  }
+
+  /* now we can safely conduct the actual termination */
+
+  __kmp_internal_end();
+
+  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
+  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
+
+  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
+
+#ifdef DUMP_DEBUG_ON_EXIT
+  if (__kmp_debug_buf)
+    __kmp_dump_debug_buffer();
+#endif
+} // __kmp_internal_end_thread
+
+// -----------------------------------------------------------------------------
+// Library registration stuff.
+
+static long __kmp_registration_flag = 0;
+// ^ Tag set at registration: 0xCAFE0000 | low 16 bits of a system time read.
+static char *__kmp_registration_str = NULL;
+// ^ Registration string stored in the env var __KMP_REGISTERED_LIB_<pid>.
+
+static inline char *__kmp_reg_status_name() {
+  // Caveat: on RHEL 3u5 with static linking, getpid() differs per thread, so
+  // registering and unregistering from different threads (see test case
+  // omp_misc_other_root_exit.cpp) produces env var names that do not match.
+  int pid = (int)getpid();
+  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", pid);
+} // __kmp_reg_status_name
+
+void __kmp_register_library_startup(void) {
+  // Registers this library copy in an env var and detects other live copies.
+  char *name = __kmp_reg_status_name(); // Name of the environment variable.
+  int done = 0;
+  union {
+    double dtime;
+    long ltime;
+  } time; // written as a double below, then read back through ltime
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+  __kmp_initialize_system_tick();
+#endif
+  __kmp_read_system_time(&time.dtime);
+  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
+  __kmp_registration_str =
+      __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
+                       __kmp_registration_flag, KMP_LIBRARY_FILE);
+  // Layout of the string above: <flag address>-<flag value, hex>-<file name>.
+  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
+                __kmp_registration_str));
+
+  while (!done) {
+
+    char *value = NULL; // Actual value of the environment variable.
+
+    // Set the environment variable, but do not overwrite it if it exists.
+    __kmp_env_set(name, __kmp_registration_str, 0);
+    // Check the variable is written.
+    value = __kmp_env_get(name);
+    if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
+
+      done = 1; // Ok, environment variable set successfully, exit the loop.
+
+    } else {
+
+      // Oops. Write failed. Another copy of OpenMP RTL is in memory.
+      // Check whether it is alive or dead.
+      int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
+      char *tail = value; // remainder of the value still to be split
+      char *flag_addr_str = NULL;
+      char *flag_val_str = NULL;
+      char const *file_name = NULL;
+      __kmp_str_split(tail, '-', &flag_addr_str, &tail);
+      __kmp_str_split(tail, '-', &flag_val_str, &tail);
+      file_name = tail; // presumably NULL if a '-' was missing -- TODO confirm
+      if (tail != NULL) {
+        long *flag_addr = 0;
+        long flag_val = 0;
+        KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
+        KMP_SSCANF(flag_val_str, "%lx", &flag_val);
+        if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
+          // First, check whether environment-encoded address is mapped into
+          // addr space.
+          // If so, dereference it to see if it still has the right value.
+          if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
+            neighbor = 1;
+          } else {
+            // If not, then we know the other copy of the library is no longer
+            // running.
+            neighbor = 2;
+          }
+        }
+      }
+      switch (neighbor) {
+      case 0: // Cannot parse environment variable -- neighbor status unknown.
+        // Assume it is the incompatible format of future version of the
+        // library. Assume the other library is alive.
+        // WARN( ... ); // TODO: Issue a warning.
+        file_name = "unknown library";
+        KMP_FALLTHROUGH();
+      // Attention! Falling to the next case. That's intentional.
+      case 1: { // Neighbor is alive.
+        // Check it is allowed.
+        char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
+        if (!__kmp_str_match_true(duplicate_ok)) {
+          // That's not allowed. Issue fatal error.
+          __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
+                      KMP_HNT(DuplicateLibrary), __kmp_msg_null);
+        }
+        KMP_INTERNAL_FREE(duplicate_ok);
+        __kmp_duplicate_library_ok = 1;
+        done = 1; // Exit the loop.
+      } break;
+      case 2: { // Neighbor is dead.
+        // Clear the variable and try to register library again.
+        __kmp_env_unset(name);
+      } break;
+      default: { KMP_DEBUG_ASSERT(0); } break;
+      }
+    }
+    KMP_INTERNAL_FREE((void *)value); // value is re-read on every iteration
+  }
+  KMP_INTERNAL_FREE((void *)name);
+
+} // func __kmp_register_library_startup
+
+void __kmp_unregister_library(void) {
+  // Drops this copy's registration: the env var is removed only if it still
+  // holds our own registration string; then the bookkeeping is released.
+  char *env_name = __kmp_reg_status_name();
+  char *env_value = __kmp_env_get(env_name);
+
+  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
+  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
+  int is_ours =
+      env_value != NULL && strcmp(env_value, __kmp_registration_str) == 0;
+  if (is_ours)
+    __kmp_env_unset(env_name); // The variable belongs to this copy.
+
+  KMP_INTERNAL_FREE(__kmp_registration_str);
+  KMP_INTERNAL_FREE(env_value);
+  KMP_INTERNAL_FREE(env_name);
+  __kmp_registration_str = NULL;
+  __kmp_registration_flag = 0;
+
+} // __kmp_unregister_library
+
+// End of Library registration stuff.
+// -----------------------------------------------------------------------------
+
+#if KMP_MIC_SUPPORTED
+
+static void __kmp_check_mic_type() {
+  // Sets __kmp_mic_type from the EAX value of __kmp_x86_cpuid(1, 0, ...).
+  kmp_cpuid_t regs = {0};
+  __kmp_x86_cpuid(1, 0, &regs);
+  // We don't support mic1 at the moment
+  if ((regs.eax & 0xff0) == 0xB10) {
+    __kmp_mic_type = mic2;
+  } else if ((regs.eax & 0xf0ff0) == 0x50670) {
+    __kmp_mic_type = mic3;
+  } else {
+    __kmp_mic_type = non_mic;
+  }
+}
+
+#endif /* KMP_MIC_SUPPORTED */
+
+static void __kmp_do_serial_initialize(void) {
+  int i, gtid;
+  int size;
+  // One-time global setup; callers in this file hold __kmp_initz_lock.
+  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
+
+  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
+  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
+  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
+  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
+  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
+
+#if OMPT_SUPPORT
+  ompt_pre_init();
+#endif
+
+  __kmp_validate_locks();
+
+  /* Initialize internal memory allocator */
+  __kmp_init_allocator();
+
+  /* Register the library startup via an environment variable and check to see
+     whether another copy of the library is already registered. */
+
+  __kmp_register_library_startup();
+
+  /* TODO reinitialization of library */
+  if (TCR_4(__kmp_global.g.g_done)) {
+    KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
+  }
+
+  __kmp_global.g.g_abort = 0;
+  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
+
+/* initialize the locks */
+#if KMP_USE_ADAPTIVE_LOCKS
+#if KMP_DEBUG_ADAPTIVE_LOCKS
+  __kmp_init_speculative_stats();
+#endif
+#endif
+#if KMP_STATS_ENABLED
+  __kmp_stats_init();
+#endif
+  __kmp_init_lock(&__kmp_global_lock);
+  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
+  __kmp_init_lock(&__kmp_debug_lock);
+  __kmp_init_atomic_lock(&__kmp_atomic_lock);
+  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
+  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
+  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
+  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
+  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
+  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
+  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
+  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
+  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
+  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
+  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
+  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
+  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
+  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
+#if KMP_USE_MONITOR
+  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
+#endif
+  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
+
+  /* conduct initialization and initial setup of configuration */
+
+  __kmp_runtime_initialize();
+
+#if KMP_MIC_SUPPORTED
+  __kmp_check_mic_type();
+#endif
+
+// Some global variable initialization moved here from kmp_env_initialize()
+#ifdef KMP_DEBUG
+  kmp_diag = 0;
+#endif
+  __kmp_abort_delay = 0;
+
+  // From __kmp_init_dflt_team_nth()
+  /* assume the entire machine will be used */
+  __kmp_dflt_team_nth_ub = __kmp_xproc;
+  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
+    __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
+  }
+  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
+    __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
+  }
+  __kmp_max_nth = __kmp_sys_max_nth;
+  __kmp_cg_max_nth = __kmp_sys_max_nth;
+  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
+  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
+    __kmp_teams_max_nth = __kmp_sys_max_nth;
+  }
+
+  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
+  // part
+  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
+#if KMP_USE_MONITOR
+  __kmp_monitor_wakeups =
+      KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
+  __kmp_bt_intervals =
+      KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
+#endif
+  // From "KMP_LIBRARY" part of __kmp_env_initialize()
+  __kmp_library = library_throughput;
+  // From KMP_SCHEDULE initialization
+  __kmp_static = kmp_sch_static_balanced;
+// AC: do not use analytical here, because it is non-monotonous
+//__kmp_guided = kmp_sch_guided_iterative_chunked;
+//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
+// need to repeat assignment
+// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
+// bit control and barrier method control parts
+#if KMP_FAST_REDUCTION_BARRIER
+#define kmp_reduction_barrier_gather_bb ((int)1)
+#define kmp_reduction_barrier_release_bb ((int)1)
+#define kmp_reduction_barrier_gather_pat bp_hyper_bar
+#define kmp_reduction_barrier_release_pat bp_hyper_bar
+#endif // KMP_FAST_REDUCTION_BARRIER
+  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
+    __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
+    __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
+    __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
+    __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
+#if KMP_FAST_REDUCTION_BARRIER
+    if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
+      // lin_64 ): hyper,1
+      __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
+      __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
+      __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
+      __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
+    }
+#endif // KMP_FAST_REDUCTION_BARRIER
+  }
+#if KMP_FAST_REDUCTION_BARRIER
+#undef kmp_reduction_barrier_release_pat
+#undef kmp_reduction_barrier_gather_pat
+#undef kmp_reduction_barrier_release_bb
+#undef kmp_reduction_barrier_gather_bb
+#endif // KMP_FAST_REDUCTION_BARRIER
+#if KMP_MIC_SUPPORTED
+  if (__kmp_mic_type == mic2) { // KNC
+    // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
+    __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
+    __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
+        1; // forkjoin release
+    __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
+    __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
+  }
+#if KMP_FAST_REDUCTION_BARRIER
+  if (__kmp_mic_type == mic2) { // KNC
+    __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
+    __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
+  }
+#endif // KMP_FAST_REDUCTION_BARRIER
+#endif // KMP_MIC_SUPPORTED
+
+// From KMP_CHECKS initialization
+#ifdef KMP_DEBUG
+  __kmp_env_checks = TRUE; /* development versions have the extra checks */
+#else
+  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
+#endif
+
+  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
+  __kmp_foreign_tp = TRUE;
+
+  __kmp_global.g.g_dynamic = FALSE;
+  __kmp_global.g.g_dynamic_mode = dynamic_default;
+
+  __kmp_env_initialize(NULL);
+
+// Print all messages in message catalog for testing purposes.
+#ifdef KMP_DEBUG
+  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
+  if (__kmp_str_match_true(val)) {
+    kmp_str_buf_t buffer;
+    __kmp_str_buf_init(&buffer);
+    __kmp_i18n_dump_catalog(&buffer);
+    __kmp_printf("%s", buffer.str);
+    __kmp_str_buf_free(&buffer);
+  }
+  __kmp_env_free(&val);
+#endif
+
+  __kmp_threads_capacity =
+      __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
+  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
+  __kmp_tp_capacity = __kmp_default_tp_capacity(
+      __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
+
+  // If the library is shut down properly, both pools must be NULL. Just in
+  // case, set them to NULL -- some memory may leak, but subsequent code will
+  // work even if pools are not freed.
+  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
+  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
+  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
+  __kmp_thread_pool = NULL;
+  __kmp_thread_pool_insert_pt = NULL;
+  __kmp_team_pool = NULL;
+
+  /* Allocate all of the variable sized records */
+  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
+   * expandable. __kmp_root shares the block, right after __kmp_threads. */
+  /* Since allocation is cache-aligned, just add extra padding at the end */
+  size =
+      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
+      CACHE_LINE;
+  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
+  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
+                               sizeof(kmp_info_t *) * __kmp_threads_capacity);
+
+  /* init thread counts */
+  KMP_DEBUG_ASSERT(__kmp_all_nth ==
+                   0); // Asserts fail if the library is reinitializing and
+  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
+  __kmp_all_nth = 0;
+  __kmp_nth = 0;
+
+  /* setup the uber master thread and hierarchy */
+  gtid = __kmp_register_root(TRUE);
+  KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
+  KMP_ASSERT(KMP_UBER_GTID(gtid));
+  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
+
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+  __kmp_common_initialize();
+
+#if KMP_OS_UNIX
+  /* invoke the child fork handler */
+  __kmp_register_atfork();
+#endif
+
+#if !KMP_DYNAMIC_LIB
+  {
+    /* Invoke the exit handler when the program finishes, only for static
+       library. For dynamic library, we already have _fini and DllMain. */
+    int rc = atexit(__kmp_internal_end_atexit);
+    if (rc != 0) {
+      __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
+                  __kmp_msg_null);
+    }
+  }
+#endif
+
+#if KMP_HANDLE_SIGNALS
+#if KMP_OS_UNIX
+  /* NOTE: make sure that this is called before the user installs their own
+     signal handlers so that the user handlers are called first. this way they
+     can return false, not call our handler, avoid terminating the library, and
+     continue execution where they left off. */
+  __kmp_install_signals(FALSE);
+#endif /* KMP_OS_UNIX */
+#if KMP_OS_WINDOWS
+  __kmp_install_signals(TRUE);
+#endif /* KMP_OS_WINDOWS */
+#endif
+
+  /* we have finished the serial initialization */
+  __kmp_init_counter++;
+
+  __kmp_init_serial = TRUE; // makes the *_initialize wrappers return early
+
+  if (__kmp_settings) {
+    __kmp_env_print();
+  }
+
+  if (__kmp_display_env || __kmp_display_env_verbose) {
+    __kmp_env_print_2();
+  }
+
+#if OMPT_SUPPORT
+  ompt_post_init();
+#endif
+
+  KMP_MB();
+
+  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
+}
+
+void __kmp_serial_initialize(void) {
+  // Lock-protected wrapper around __kmp_do_serial_initialize().
+  // Unlocked fast path: serial initialization has already completed.
+  if (__kmp_init_serial)
+    return;
+  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
+  // Check again under the lock: another thread may have finished the work
+  // while this one was waiting.
+  if (!__kmp_init_serial)
+    __kmp_do_serial_initialize();
+  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
+}
+
+static void __kmp_do_middle_initialize(void) {
+  int i, j;
+  int prev_dflt_team_nth;
+  // Second init stage: affinity setup and the default team size.
+  if (!__kmp_init_serial) {
+    __kmp_do_serial_initialize();
+  }
+
+  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
+
+  // Save the previous value for the __kmp_dflt_team_nth so that
+  // we can avoid some reinitialization if it hasn't changed.
+  prev_dflt_team_nth = __kmp_dflt_team_nth;
+
+#if KMP_AFFINITY_SUPPORTED
+  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
+  // number of cores on the machine.
+  __kmp_affinity_initialize();
+
+  // Run through the __kmp_threads array and set the affinity mask
+  // for each root thread that is currently registered with the RTL.
+  for (i = 0; i < __kmp_threads_capacity; i++) {
+    if (TCR_PTR(__kmp_threads[i]) != NULL) {
+      __kmp_affinity_set_init_mask(i, TRUE);
+    }
+  }
+#endif /* KMP_AFFINITY_SUPPORTED */
+
+  KMP_ASSERT(__kmp_xproc > 0);
+  if (__kmp_avail_proc == 0) {
+    __kmp_avail_proc = __kmp_xproc;
+  }
+
+  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
+  // correct them now
+  j = 0;
+  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
+    __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
+        __kmp_avail_proc;
+    j++;
+  }
+
+  if (__kmp_dflt_team_nth == 0) {
+#ifdef KMP_DFLT_NTH_CORES
+    // Default #threads = #cores
+    __kmp_dflt_team_nth = __kmp_ncores;
+    KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
+                  "__kmp_ncores (%d)\n",
+                  __kmp_dflt_team_nth));
+#else
+    // Default #threads = #available OS procs
+    __kmp_dflt_team_nth = __kmp_avail_proc;
+    KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
+                  "__kmp_avail_proc(%d)\n",
+                  __kmp_dflt_team_nth));
+#endif /* KMP_DFLT_NTH_CORES */
+  }
+
+  // Clamp the default team size into [KMP_MIN_NTH, __kmp_sys_max_nth].
+  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
+    __kmp_dflt_team_nth = KMP_MIN_NTH;
+  }
+  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
+    __kmp_dflt_team_nth = __kmp_sys_max_nth;
+  }
+
+  // There's no harm in continuing if the following check fails,
+  // but it indicates an error in the previous logic.
+  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
+
+  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
+    // Run through the __kmp_threads array and set the num threads icv for each
+    // root thread that is currently registered with the RTL (which has not
+    // already explicitly set its nthreads-var with a call to
+    // omp_set_num_threads()).
+    for (i = 0; i < __kmp_threads_capacity; i++) {
+      kmp_info_t *thread = __kmp_threads[i];
+      if (thread == NULL)
+        continue;
+      if (thread->th.th_current_task->td_icvs.nproc != 0)
+        continue;
+
+      set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
+    }
+  }
+  KA_TRACE(
+      20,
+      ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
+       __kmp_dflt_team_nth));
+
+#ifdef KMP_ADJUST_BLOCKTIME
+  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
+  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
+    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
+    if (__kmp_nth > __kmp_avail_proc) {
+      __kmp_zero_bt = TRUE;
+    }
+  }
+#endif /* KMP_ADJUST_BLOCKTIME */
+
+  /* we have finished middle initialization */
+  TCW_SYNC_4(__kmp_init_middle, TRUE);
+
+  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
+}
+
+void __kmp_middle_initialize(void) {
+  // Lock-protected wrapper around __kmp_do_middle_initialize().
+  // Unlocked fast path: middle initialization has already completed.
+  if (__kmp_init_middle)
+    return;
+  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
+  // Check again under the lock: another thread may have finished the work
+  // while this one was waiting.
+  if (!__kmp_init_middle)
+    __kmp_do_middle_initialize();
+  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
+}
+
+void __kmp_parallel_initialize(void) {
+  int gtid = __kmp_entry_gtid(); // this might be a new root
+  // Last init stage; runs the middle (and serial) stages first if pending.
+  /* synchronize parallel initialization (for sibling) */
+  if (TCR_4(__kmp_init_parallel))
+    return;
+  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
+  if (TCR_4(__kmp_init_parallel)) {
+    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
+    return;
+  }
+
+  /* TODO reinitialization after we have already shut down */
+  if (TCR_4(__kmp_global.g.g_done)) {
+    KA_TRACE(
+        10,
+        ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
+    __kmp_infinite_loop();
+  }
+
+  /* jc: The lock __kmp_initz_lock is already held, so calling
+     __kmp_middle_initialize would cause a deadlock.  So we call
+     __kmp_do_middle_initialize directly. */
+  if (!__kmp_init_middle) {
+    __kmp_do_middle_initialize();
+  }
+  __kmp_resume_if_hard_paused();
+
+  /* begin initialization */
+  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
+  KMP_ASSERT(KMP_UBER_GTID(gtid));
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+  // Save the FP control regs.
+  // Worker threads will set theirs to these values at thread startup.
+  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
+  __kmp_store_mxcsr(&__kmp_init_mxcsr);
+  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+#if KMP_OS_UNIX
+#if KMP_HANDLE_SIGNALS
+  /*  must be after __kmp_serial_initialize  */
+  __kmp_install_signals(TRUE);
+#endif
+#endif
+
+  __kmp_suspend_initialize();
+
+  // Replace the dynamic_default placeholder with a concrete dynamic mode.
+#if defined(USE_LOAD_BALANCE)
+  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
+    __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
+  }
+#else
+  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
+    __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
+  }
+#endif
+
+  if (__kmp_version) {
+    __kmp_print_version_2();
+  }
+
+  /* we have finished parallel initialization */
+  TCW_SYNC_4(__kmp_init_parallel, TRUE);
+
+  KMP_MB();
+  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
+
+  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
+}
+
+/* ------------------------------------------------------------------------ */
+
+void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
+                                   kmp_team_t *team) {
+  // Per-thread reset done just before the thread runs its task; tid is not
+  // used here.
+  KMP_MB();
+
+  // This thread has not encountered any construct in the region yet.
+  this_thr->th.th_local.this_construct = 0;
+#if KMP_CACHE_MANAGE
+  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
+#endif /* KMP_CACHE_MANAGE */
+
+  kmp_disp_t *disp = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
+  KMP_DEBUG_ASSERT(disp);
+  KMP_DEBUG_ASSERT(team->t.t_dispatch);
+  disp->th_disp_index = 0; // restart the dispatch buffer counter
+  disp->th_doacross_buf_idx = 0; // restart the doacross buffer counter
+
+  if (__kmp_env_consistency_check) {
+    __kmp_push_parallel(gtid, team->t.t_ident);
+  }
+
+  KMP_MB(); // Flush all pending memory write invalidates.
+}
+
+void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
+                                  kmp_team_t *team) {
+  if (__kmp_env_consistency_check) { // only when consistency checks are on
+    __kmp_pop_parallel(gtid, team->t.t_ident);
+  }
+  __kmp_finish_implicit_task(this_thr); // tid is not used here
+}
+
+int __kmp_invoke_task_func(int gtid) {
+  int rc;
+  int tid = __kmp_tid_from_gtid(gtid);
+  kmp_info_t *this_thr = __kmp_threads[gtid];
+  kmp_team_t *team = this_thr->th.th_team;
+  // Runs the team's microtask on this thread; returns the microtask's rc.
+  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
+#if USE_ITT_BUILD
+  if (__itt_stack_caller_create_ptr) {
+    __kmp_itt_stack_callee_enter(
+        (__itt_caller)
+            team->t.t_stack_id); // inform ittnotify about entering user's code
+  }
+#endif /* USE_ITT_BUILD */
+#if INCLUDE_SSC_MARKS
+  SSC_MARK_INVOKING();
+#endif
+
+#if OMPT_SUPPORT
+  void *dummy; // target for exit_runtime_p when OMPT is not enabled
+  void **exit_runtime_p;
+  ompt_data_t *my_task_data;
+  ompt_data_t *my_parallel_data;
+  int ompt_team_size;
+
+  if (ompt_enabled.enabled) {
+    exit_runtime_p = &(
+        team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
+  } else {
+    exit_runtime_p = &dummy;
+  }
+
+  my_task_data =
+      &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
+  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
+  if (ompt_enabled.ompt_callback_implicit_task) {
+    ompt_team_size = team->t.t_nproc;
+    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+        ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
+        __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
+    OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
+  }
+#endif
+
+#if KMP_STATS_ENABLED
+  stats_state_e previous_state = KMP_GET_THREAD_STATE();
+  if (previous_state == stats_state_e::TEAMS_REGION) {
+    KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
+  } else {
+    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
+  }
+  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
+#endif
+
+  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
+                              tid, (int)team->t.t_argc, (void **)team->t.t_argv
+#if OMPT_SUPPORT
+                              ,
+                              exit_runtime_p
+#endif
+                              );
+#if OMPT_SUPPORT
+  *exit_runtime_p = NULL; // clear the exit-frame slot (or dummy) afterwards
+#endif
+
+#if KMP_STATS_ENABLED
+  if (previous_state == stats_state_e::TEAMS_REGION) {
+    KMP_SET_THREAD_STATE(previous_state);
+  }
+  KMP_POP_PARTITIONED_TIMER();
+#endif
+
+#if USE_ITT_BUILD
+  if (__itt_stack_caller_create_ptr) {
+    __kmp_itt_stack_callee_leave(
+        (__itt_caller)
+            team->t.t_stack_id); // inform ittnotify about leaving user's code
+  }
+#endif /* USE_ITT_BUILD */
+  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
+
+  return rc;
+}
+
+void __kmp_teams_master(int gtid) {
+  // This routine is called by all master threads in teams construct
+  kmp_info_t *thr = __kmp_threads[gtid];
+  kmp_team_t *team = thr->th.th_team;
+  ident_t *loc = team->t.t_ident;
+  thr->th.th_set_nproc = thr->th.th_teams_size.nth; // size for the fork below
+  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
+  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
+  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
+                __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
+
+  // This thread is a new CG root.  Set up the proper variables.
+  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
+  tmp->cg_root = thr; // Make thr the CG root
+  // Init to thread limit that was stored when league masters were forked
+  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
+  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
+  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
+                 " cg_nthreads to 1\n",
+                 thr, tmp));
+  tmp->up = thr->th.th_cg_roots; // push the new node onto thr's CG root list
+  thr->th.th_cg_roots = tmp;
+
+// Launch league of teams now, but do not let workers execute
+// (they hang on fork barrier until next parallel)
+#if INCLUDE_SSC_MARKS
+  SSC_MARK_FORKING();
+#endif
+  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
+                  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
+                  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
+#if INCLUDE_SSC_MARKS
+  SSC_MARK_JOINING();
+#endif
+  // If the team size was reduced from the limit, set it to the new size
+  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
+    thr->th.th_teams_size.nth = thr->th.th_team_nproc;
+  // AC: last parameter "1" eliminates join barrier which won't work because
+  // worker threads are in a fork barrier waiting for more parallel regions
+  __kmp_join_call(loc, gtid
+#if OMPT_SUPPORT
+                  ,
+                  fork_context_intel
+#endif
+                  ,
+                  1);
+}
+
+int __kmp_invoke_teams_master(int gtid) {
+  kmp_info_t *this_thr = __kmp_threads[gtid];
+  kmp_team_t *team = this_thr->th.th_team;
+#if KMP_DEBUG
+  // A non-serialized team must have been set up with __kmp_teams_master.
+  if (!team->t.t_serialized)
+    KMP_DEBUG_ASSERT((void *)team->t.t_pkfn == (void *)__kmp_teams_master);
+#endif
+  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
+  __kmp_teams_master(gtid);
+  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
+  return 1;
+}
+
+/* this sets the requested number of threads for the next parallel region
+   encountered by this team. since this should be enclosed in the forkjoin
+   critical section it should avoid race conditions with asymmetrical nested
+   parallelism */
+
+void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
+  // Non-positive requests are ignored and leave th_set_nproc untouched.
+  if (num_threads <= 0)
+    return;
+  __kmp_threads[gtid]->th.th_set_nproc = num_threads;
+}
+
+/* Records the number of teams for the teams region and the threads per team
+   for its inner parallel regions (0 = choose a default, never less than 1) */
+void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
+                          int num_threads) {
+  kmp_info_t *thr = __kmp_threads[gtid];
+  KMP_DEBUG_ASSERT(num_teams >= 0);
+  KMP_DEBUG_ASSERT(num_threads >= 0);
+
+  if (num_teams == 0)
+    num_teams = 1; // default number of teams is 1.
+  if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
+    if (!__kmp_reserve_warn) {
+      __kmp_reserve_warn = 1;
+      __kmp_msg(kmp_ms_warning,
+                KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
+                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
+    }
+    num_teams = __kmp_teams_max_nth;
+  }
+  // Number of teams == number of threads in the outer "parallel" of the teams
+  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
+  // Remember the number of threads for inner parallel regions
+  if (num_threads == 0) {
+    if (!TCR_4(__kmp_init_middle))
+      __kmp_middle_initialize(); // get __kmp_avail_proc calculated
+    num_threads = __kmp_avail_proc / num_teams;
+    if (num_teams * num_threads > __kmp_teams_max_nth) {
+      // adjust num_threads w/o warning as it is not user setting
+      num_threads = __kmp_teams_max_nth / num_teams;
+    }
+    if (num_threads < 1) // num_teams > __kmp_avail_proc truncates to zero;
+      num_threads = 1; // every team needs at least its master thread
+  } else {
+    // This thread will be the master of the league masters
+    // Store new thread limit; old limit is saved in th_cg_roots list
+    thr->th.th_current_task->td_icvs.thread_limit = num_threads;
+
+    if (num_teams * num_threads > __kmp_teams_max_nth) {
+      int new_threads = __kmp_teams_max_nth / num_teams;
+      if (!__kmp_reserve_warn) { // user asked for too many threads
+        __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
+        __kmp_msg(kmp_ms_warning,
+                  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
+                  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
+      }
+      num_threads = new_threads;
+    }
+  }
+  thr->th.th_teams_size.nth = num_threads;
+}
+
+// Set the proc_bind var to use in the following parallel region.
+void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
+  // Stored on the calling thread's descriptor; id is not consulted here.
+  __kmp_threads[gtid]->th.th_set_proc_bind = proc_bind;
+}
+
+/* Launch the worker threads into the microtask. */
+
+void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
+  kmp_info_t *this_thr = __kmp_threads[gtid];
+  // Resets per-team construct state, then releases workers at the fork barrier.
+#ifdef KMP_DEBUG
+  int f;
+#endif /* KMP_DEBUG */
+
+  KMP_DEBUG_ASSERT(team);
+  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
+  KMP_ASSERT(KMP_MASTER_GTID(gtid));
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+  team->t.t_construct = 0; /* no single directives seen yet */
+  team->t.t_ordered.dt.t_value =
+      0; /* thread 0 enters the ordered section first */
+
+  /* Reset the identifiers on the dispatch buffer */
+  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
+  if (team->t.t_max_nproc > 1) {
+    int i;
+    for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
+      team->t.t_disp_buffer[i].buffer_index = i;
+      team->t.t_disp_buffer[i].doacross_buf_idx = i;
+    }
+  } else { // only buffer 0 is reset when t_max_nproc <= 1
+    team->t.t_disp_buffer[0].buffer_index = 0;
+    team->t.t_disp_buffer[0].doacross_buf_idx = 0;
+  }
+
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+  KMP_ASSERT(this_thr->th.th_team == team);
+
+#ifdef KMP_DEBUG
+  for (f = 0; f < team->t.t_nproc; f++) {
+    KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
+                     team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
+  }
+#endif /* KMP_DEBUG */
+
+  /* release the worker threads so they may begin working */
+  __kmp_fork_barrier(gtid, 0);
+}
+
+void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
+  kmp_info_t *this_thr = __kmp_threads[gtid];
+  // Join side of a region: wait in the join barrier, then notify OMPT tools.
+  KMP_DEBUG_ASSERT(team);
+  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
+  KMP_ASSERT(KMP_MASTER_GTID(gtid)); // gtid must satisfy KMP_MASTER_GTID
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+/* Join barrier after fork */
+
+#ifdef KMP_DEBUG
+  if (__kmp_threads[gtid] && // on mismatch, dump state before asserting below
+      __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
+    __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
+                 __kmp_threads[gtid]);
+    __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
+                 "team->t.t_nproc=%d\n",
+                 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
+                 team->t.t_nproc);
+    __kmp_print_structure();
+  }
+  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
+                   __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
+#endif /* KMP_DEBUG */
+
+  __kmp_join_barrier(gtid); /* wait for everyone */
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled && // only when still marked as in implicit barrier
+      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
+    int ds_tid = this_thr->th.th_info.ds.ds_tid;
+    ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
+    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
+#if OMPT_OPTIONAL
+    void *codeptr = NULL; // stays NULL unless tid is master and a callback is set
+    if (KMP_MASTER_TID(ds_tid) &&
+        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
+         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
+      codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
+
+    if (ompt_enabled.ompt_callback_sync_region_wait) { // end of barrier wait
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
+          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
+          codeptr);
+    }
+    if (ompt_enabled.ompt_callback_sync_region) { // end of the barrier region
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
+          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
+          codeptr);
+    }
+#endif
+    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
+      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+          ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
+    }
+  }
+#endif
+
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+  KMP_ASSERT(this_thr->th.th_team == team);
+}
+
+/* ------------------------------------------------------------------------ */
+
+#ifdef USE_LOAD_BALANCE
+
+// Count the hot-team workers (master excluded) that are flagged th_active.
+// Yields 0 when the root is already active, i.e. we are not at the outermost
+// level of parallelism.
+static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
+  if (root->r.r_active) {
+    return 0;
+  }
+
+  kmp_team_t *hot = root->r.r_hot_team;
+
+  // At KMP_MAX_BLOCKTIME the whole team minus the master is reported.
+  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
+    return hot->t.t_nproc - 1; // Don't count master thread
+  }
+
+  // Index 0 is the master thread - it is accounted for elsewhere.
+  int active_workers = 0;
+  int tid = 1;
+  while (tid < hot->t.t_nproc) {
+    active_workers += hot->t.t_threads[tid]->th.th_active ? 1 : 0;
+    ++tid;
+  }
+  return active_workers;
+}
+
+// Perform an automatic adjustment to the number of
+// threads used by the next parallel region.
+static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
+  int retval; // thread count handed back, clamped to [KMP_MIN_NTH, set_nproc]
+  int pool_active; // snapshot of __kmp_thread_pool_active_nth
+  int hot_team_active; // active hot-team workers of this root
+  int team_curr_active; // pool + hot team + the calling thread
+  int system_active; // result of __kmp_get_load_balance(); < 0 on failure
+
+  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
+                set_nproc));
+  KMP_DEBUG_ASSERT(root);
+  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
+                       ->th.th_current_task->td_icvs.dynamic == TRUE);
+  KMP_DEBUG_ASSERT(set_nproc > 1);
+
+  if (set_nproc == 1) { // NOTE(review): contradicts the debug assert above
+    KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
+    return 1;
+  }
+
+  // Threads that are active in the thread pool, active in the hot team for this
+  // particular root (if we are at the outer par level), and the currently
+  // executing thread (to become the master) are available to add to the new
+  // team, but are currently contributing to the system load, and must be
+  // accounted for.
+  pool_active = __kmp_thread_pool_active_nth;
+  hot_team_active = __kmp_active_hot_team_nproc(root);
+  team_curr_active = pool_active + hot_team_active + 1;
+
+  // Check the system load.
+  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
+  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
+                "hot team active = %d\n",
+                system_active, pool_active, hot_team_active));
+
+  if (system_active < 0) {
+    // There was an error reading the necessary info from /proc, so use the
+    // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
+    // = dynamic_thread_limit, we shouldn't wind up getting back here.
+    __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
+    KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
+
+    // Make this call behave like the thread limit algorithm.
+    retval = __kmp_avail_proc - __kmp_nth +
+             (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
+    if (retval > set_nproc) { // never exceed the requested team size
+      retval = set_nproc;
+    }
+    if (retval < KMP_MIN_NTH) { // never drop below the minimum
+      retval = KMP_MIN_NTH;
+    }
+
+    KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
+                  retval));
+    return retval;
+  }
+
+  // There is a slight delay in the load balance algorithm in detecting new
+  // running procs. The real system load at this instant should be at least as
+  // large as the #active omp thread that are available to add to the team.
+  if (system_active < team_curr_active) {
+    system_active = team_curr_active;
+  }
+  retval = __kmp_avail_proc - system_active + team_curr_active;
+  if (retval > set_nproc) { // never exceed the requested team size
+    retval = set_nproc;
+  }
+  if (retval < KMP_MIN_NTH) { // never drop below the minimum
+    retval = KMP_MIN_NTH;
+  }
+
+  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
+  return retval;
+} // __kmp_load_balance_nproc()
+
+#endif /* USE_LOAD_BALANCE */
+
+/* ------------------------------------------------------------------------ */
+
+/* NOTE: this is called with the __kmp_init_lock held */
+void __kmp_cleanup(void) {
+  int f; // index over the __kmp_root array
+  // Undo the parallel, middle and serial init stages, then free global state.
+  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
+
+  if (TCR_4(__kmp_init_parallel)) {
+#if KMP_HANDLE_SIGNALS
+    __kmp_remove_signals();
+#endif
+    TCW_4(__kmp_init_parallel, FALSE);
+  }
+
+  if (TCR_4(__kmp_init_middle)) {
+#if KMP_AFFINITY_SUPPORTED
+    __kmp_affinity_uninitialize();
+#endif /* KMP_AFFINITY_SUPPORTED */
+    __kmp_cleanup_hierarchy();
+    TCW_4(__kmp_init_middle, FALSE);
+  }
+
+  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
+
+  if (__kmp_init_serial) {
+    __kmp_runtime_destroy();
+    __kmp_init_serial = FALSE;
+  }
+
+  __kmp_cleanup_threadprivate_caches();
+
+  for (f = 0; f < __kmp_threads_capacity; f++) { // free each root descriptor
+    if (__kmp_root[f] != NULL) {
+      __kmp_free(__kmp_root[f]);
+      __kmp_root[f] = NULL;
+    }
+  }
+  __kmp_free(__kmp_threads);
+  // __kmp_threads and __kmp_root were allocated together as a single block, so
+  // the __kmp_root array itself does not need a separate free.
+  __kmp_threads = NULL;
+  __kmp_root = NULL;
+  __kmp_threads_capacity = 0;
+
+#if KMP_USE_DYNAMIC_LOCK
+  __kmp_cleanup_indirect_user_locks();
+#else
+  __kmp_cleanup_user_locks();
+#endif
+
+#if KMP_AFFINITY_SUPPORTED
+  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
+  __kmp_cpuinfo_file = NULL;
+#endif /* KMP_AFFINITY_SUPPORTED */
+
+#if KMP_USE_ADAPTIVE_LOCKS
+#if KMP_DEBUG_ADAPTIVE_LOCKS
+  __kmp_print_speculative_stats();
+#endif
+#endif
+  KMP_INTERNAL_FREE(__kmp_nested_nth.nth); // reset the nested nth list
+  __kmp_nested_nth.nth = NULL;
+  __kmp_nested_nth.size = 0;
+  __kmp_nested_nth.used = 0;
+  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); // and proc_bind list
+  __kmp_nested_proc_bind.bind_types = NULL;
+  __kmp_nested_proc_bind.size = 0;
+  __kmp_nested_proc_bind.used = 0;
+  if (__kmp_affinity_format) {
+    KMP_INTERNAL_FREE(__kmp_affinity_format);
+    __kmp_affinity_format = NULL;
+  }
+
+  __kmp_i18n_catclose();
+
+#if KMP_USE_HIER_SCHED
+  __kmp_hier_scheds.deallocate();
+#endif
+
+#if KMP_STATS_ENABLED
+  __kmp_stats_fini();
+#endif
+
+  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
+}
+
+/* ------------------------------------------------------------------------ */
+
+// Decide whether __kmpc_begin() is to be treated as a no-op (TRUE) or not.
+int __kmp_ignore_mppbeg(void) {
+  char *setting = getenv("KMP_IGNORE_MPPBEG");
+
+  // Only an explicit false-like KMP_IGNORE_MPPBEG value disables the no-op.
+  if (setting != NULL && __kmp_str_match_false(setting)) {
+    return FALSE;
+  }
+  return TRUE; // By default __kmpc_begin() is no-op.
+}
+
+// Decide whether __kmpc_end() is to be treated as a no-op (TRUE) or not.
+int __kmp_ignore_mppend(void) {
+  char *setting = getenv("KMP_IGNORE_MPPEND");
+
+  // Only an explicit false-like KMP_IGNORE_MPPEND value disables the no-op.
+  if (setting != NULL && __kmp_str_match_false(setting)) {
+    return FALSE;
+  }
+  return TRUE; // By default __kmpc_end() is no-op.
+}
+
+void __kmp_internal_begin(void) {
+  int gtid;
+  kmp_root_t *root;
+  // Marks the calling thread's root as begun (r_begin = TRUE), at most once.
+  /* this is a very important step as it will register new sibling threads
+     and assign these new uber threads a new gtid */
+  gtid = __kmp_entry_gtid();
+  root = __kmp_threads[gtid]->th.th_root;
+  KMP_ASSERT(KMP_UBER_GTID(gtid));
+
+  if (root->r.r_begin) // already set: return without taking the lock
+    return;
+  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
+  if (root->r.r_begin) { // re-check now that r_begin_lock is held
+    __kmp_release_lock(&root->r.r_begin_lock, gtid);
+    return;
+  }
+
+  root->r.r_begin = TRUE;
+
+  __kmp_release_lock(&root->r.r_begin_lock, gtid);
+}
+
+/* ------------------------------------------------------------------------ */
+
+// User-facing entry for selecting the library mode (serial, turnaround or
+// throughput).  Adjusts the calling thread's nproc setting (via set__nproc)
+// for the chosen mode and then delegates to __kmp_aux_set_library().  Warns
+// and does nothing when the root is currently inside a parallel region.
+void __kmp_user_set_library(enum library_type arg) {
+  // first, make sure we are initialized so we can get our gtid
+  int gtid = __kmp_entry_gtid();
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_root_t *root = thread->th.th_root;
+
+  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
+                library_serial));
+
+  // Must be called in serial section of top-level thread
+  if (root->r.r_in_parallel) {
+    KMP_WARNING(SetLibraryIncorrectCall);
+    return;
+  }
+
+  switch (arg) {
+  case library_serial:
+    // Serial mode: clear th_set_nproc and set nproc to one.
+    thread->th.th_set_nproc = 0;
+    set__nproc(thread, 1);
+    break;
+  case library_turnaround:
+  case library_throughput:
+    // Both modes use __kmp_dflt_team_nth when it is non-zero and
+    // __kmp_dflt_team_nth_ub otherwise.
+    thread->th.th_set_nproc = 0;
+    set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
+                                           : __kmp_dflt_team_nth_ub);
+    break;
+  // Any other value is reported as an unknown library type.
+  default:
+    KMP_FATAL(UnknownLibraryType, arg);
+  }
+
+  // Finally hand the mode to the shared library-mode setter.
+  __kmp_aux_set_library(arg);
+}
+
+void __kmp_aux_set_stacksize(size_t arg) {
+  if (!__kmp_init_serial) // make sure serial initialization has run first
+    __kmp_serial_initialize();
+
+#if KMP_OS_DARWIN
+  if (arg & (0x1000 - 1)) { // not a multiple of 0x1000: round up
+    arg &= ~(0x1000 - 1);
+    if (arg + 0x1000) /* check for overflow if we round up */
+      arg += 0x1000;
+  }
+#endif
+  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
+
+  /* only change the default stacksize before the first parallel region */
+  if (!TCR_4(__kmp_init_parallel)) {
+    size_t value = arg; /* argument is in bytes */
+
+    if (value < __kmp_sys_min_stksize) // clamp into [sys min, KMP_MAX_STKSIZE]
+      value = __kmp_sys_min_stksize;
+    else if (value > KMP_MAX_STKSIZE)
+      value = KMP_MAX_STKSIZE;
+
+    __kmp_stksize = value;
+
+    __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
+  }
+
+  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
+}
+
+/* Set the behaviour of the runtime library (stores arg in __kmp_library). */
+/* TODO this can cause some odd behaviour with sibling parallelism... */
+void __kmp_aux_set_library(enum library_type arg) {
+  __kmp_library = arg;
+
+  switch (__kmp_library) {
+  case library_serial: {
+    KMP_INFORM(LibraryIsSerial);
+  } break;
+  case library_turnaround:
+    if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
+      __kmp_use_yield = 2; // only yield when oversubscribed
+    break;
+  case library_throughput:
+    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) // replace the max blocktime
+      __kmp_dflt_blocktime = 200;
+    break;
+  default:
+    KMP_FATAL(UnknownLibraryType, arg);
+  }
+}
+
+/* Look up the team information shared by all of the team API queries. */
+// Returns NULL if not in teams construct; teams_serialized is an out-parameter
+static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
+  kmp_info_t *thr = __kmp_entry_thread();
+  teams_serialized = 0;
+  if (thr->th.th_teams_microtask) {
+    kmp_team_t *team = thr->th.th_team;
+    int tlevel = thr->th.th_teams_level; // the level of the teams construct
+    int ii = team->t.t_level; // level currently being examined
+    teams_serialized = team->t.t_serialized;
+    int level = tlevel + 1; // target level: one past the teams construct
+    KMP_DEBUG_ASSERT(ii >= tlevel);
+    while (ii > level) { // walk up until the target level is reached
+      for (teams_serialized = team->t.t_serialized; // consume serialized levels
+           (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
+      }
+      if (team->t.t_serialized && (!teams_serialized)) {
+        team = team->t.t_parent; // all serialized levels consumed: go to parent
+        continue;
+      }
+      if (ii > level) {
+        team = team->t.t_parent;
+        ii--;
+      }
+    }
+    return team;
+  }
+  return NULL;
+}
+
+// Team number of the caller within the enclosing teams construct; 0 when
+// there is no teams construct or the teams region is serialized.
+int __kmp_aux_get_team_num() {
+  int serial_depth;
+  kmp_team_t *league_team = __kmp_aux_get_team_info(serial_depth);
+
+  // No teams construct, or a serialized one ( 1 team of 1 thread ).
+  if (league_team == NULL || serial_depth > 1) {
+    return 0;
+  }
+  return league_team->t.t_master_tid;
+}
+
+// Number of teams in the enclosing teams construct; 1 when there is no teams
+// construct or the teams region is serialized.
+int __kmp_aux_get_num_teams() {
+  int serial_depth;
+  kmp_team_t *league_team = __kmp_aux_get_team_info(serial_depth);
+
+  // Outside of teams, or serialized teams: exactly one team.
+  if (league_team == NULL || serial_depth > 1) {
+    return 1;
+  }
+  return league_team->t.t_parent->t.t_nproc;
+}
+
+/* ------------------------------------------------------------------------ */
+
+/*
+ * Affinity Format Parser
+ *
+ * Field is in form of: %[[[0].]size]type
+ * % and type are required (%% means print a literal '%')
+ * type is either single char or long name surrounded by {},
+ * e.g., N or {num_threads}
+ * 0 => leading zeros
+ * . => right justified when size is specified
+ * by default output is left justified
+ * size is the *minimum* field length
+ * All other characters are printed as is
+ *
+ * Available field types:
+ * t {team_num}         - omp_get_team_num()
+ * T {num_teams}        - omp_get_num_teams()
+ * L {nesting_level}    - omp_get_level()
+ * n {thread_num}       - omp_get_thread_num()
+ * N {num_threads}      - omp_get_num_threads()
+ * a {ancestor_tnum}    - omp_get_ancestor_thread_num(omp_get_level()-1)
+ * H {host}             - name of host machine
+ * P {process_id}       - process id (integer)
+ * i {native_thread_id} - native thread identifier (integer)
+ * A {thread_affinity}  - comma separated integers or integer ranges (mask values)
+ * Implementation-specific field types can be added
+ * If a type is unknown, print "undefined"
+*/
+
+// Describes one affinity-format field type: its single-character short name,
+// its long name (written inside {} in a format string), and the snprintf
+// conversion character used to print its value.
+typedef struct kmp_affinity_format_field_t {
+  char short_name; // from spec e.g., L -> nesting level
+  const char *long_name; // from spec e.g., nesting_level -> nesting level
+  char field_format; // data type for snprintf (typically 'd' or 's'
+  // for integer or string)
+} kmp_affinity_format_field_t;
+
+static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
+#if KMP_AFFINITY_SUPPORTED
+    {'A', "thread_affinity", 's'}, // only present when affinity is supported
+#endif
+    {'t', "team_num", 'd'}, // entries: {short name, long name, snprintf type}
+    {'T', "num_teams", 'd'},
+    {'L', "nesting_level", 'd'},
+    {'n', "thread_num", 'd'},
+    {'N', "num_threads", 'd'},
+    {'a', "ancestor_tnum", 'd'},
+    {'H', "host", 's'},
+    {'P', "process_id", 'd'},
+    {'i', "native_thread_id", 'd'}};
+
+// Expand the %-field at *ptr into field_buffer; return the characters needed.
+static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
+                                            const char **ptr, // in/out cursor
+                                            kmp_str_buf_t *field_buffer) {
+  int rc, format_index, field_value;
+  const char *width_left, *width_right;
+  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
+  static const int FORMAT_SIZE = 20;
+  char format[FORMAT_SIZE] = {0}; // printf-style format built from the field
+  char absolute_short_name = 0; // canonical short name; 0 means unknown
+
+  KMP_DEBUG_ASSERT(gtid >= 0);
+  KMP_DEBUG_ASSERT(th);
+  KMP_DEBUG_ASSERT(**ptr == '%');
+  KMP_DEBUG_ASSERT(field_buffer);
+
+  __kmp_str_buf_clear(field_buffer);
+
+  // Skip the initial %
+  (*ptr)++;
+
+  // Check for %% first
+  if (**ptr == '%') {
+    __kmp_str_buf_cat(field_buffer, "%", 1);
+    (*ptr)++; // skip over the second %
+    return 1;
+  }
+
+  // Parse field modifiers if they are present
+  pad_zeros = false;
+  if (**ptr == '0') {
+    pad_zeros = true;
+    (*ptr)++; // skip over 0
+  }
+  right_justify = false;
+  if (**ptr == '.') {
+    right_justify = true;
+    (*ptr)++; // skip over .
+  }
+  // Parse width of field: [width_left, width_right)
+  width_left = width_right = NULL;
+  if (**ptr >= '0' && **ptr <= '9') {
+    width_left = *ptr;
+    SKIP_DIGITS(*ptr);
+    width_right = *ptr;
+  }
+
+  // Create the format for KMP_SNPRINTF based on flags parsed above
+  format_index = 0;
+  format[format_index++] = '%';
+  if (!right_justify)
+    format[format_index++] = '-';
+  if (pad_zeros)
+    format[format_index++] = '0';
+  if (width_left && width_right) {
+    int i = 0;
+    // Only allow 8 digit number widths.
+    // This also prevents overflowing format variable
+    while (i < 8 && width_left < width_right) {
+      format[format_index++] = *width_left;
+      width_left++;
+      i++;
+    }
+  }
+
+  // Parse a name (long or short)
+  // Canonicalize the name into absolute_short_name
+  found_valid_name = false;
+  parse_long_name = (**ptr == '{');
+  if (parse_long_name)
+    (*ptr)++; // skip initial left brace
+  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
+                             sizeof(__kmp_affinity_format_table[0]);
+       ++i) {
+    char short_name = __kmp_affinity_format_table[i].short_name;
+    const char *long_name = __kmp_affinity_format_table[i].long_name;
+    char field_format = __kmp_affinity_format_table[i].field_format;
+    if (parse_long_name) {
+      size_t length = KMP_STRLEN(long_name); // size_t matches strncmp's count
+      if (strncmp(*ptr, long_name, length) == 0) {
+        found_valid_name = true;
+        (*ptr) += length; // skip the long name
+      }
+    } else if (**ptr == short_name) {
+      found_valid_name = true;
+      (*ptr)++; // skip the short name
+    }
+    if (found_valid_name) {
+      format[format_index++] = field_format;
+      format[format_index++] = '\0';
+      absolute_short_name = short_name;
+      break;
+    }
+  }
+  if (parse_long_name) {
+    if (**ptr != '}') { // prefix matched but no closing brace: treat as unknown
+      absolute_short_name = 0;
+    } else {
+      (*ptr)++; // skip over the right brace
+    }
+  }
+
+  // Attempt to fill the buffer with the requested
+  // value using snprintf within __kmp_str_buf_print()
+  switch (absolute_short_name) {
+  case 't':
+    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
+    break;
+  case 'T':
+    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
+    break;
+  case 'L':
+    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
+    break;
+  case 'n':
+    rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
+    break;
+  case 'H': {
+    static const int BUFFER_SIZE = 256;
+    char buf[BUFFER_SIZE];
+    __kmp_expand_host_name(buf, BUFFER_SIZE);
+    rc = __kmp_str_buf_print(field_buffer, format, buf);
+  } break;
+  case 'P':
+    rc = __kmp_str_buf_print(field_buffer, format, getpid());
+    break;
+  case 'i':
+    rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
+    break;
+  case 'N':
+    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
+    break;
+  case 'a':
+    field_value =
+        __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
+    rc = __kmp_str_buf_print(field_buffer, format, field_value);
+    break;
+#if KMP_AFFINITY_SUPPORTED
+  case 'A': {
+    kmp_str_buf_t buf;
+    __kmp_str_buf_init(&buf);
+    __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
+    rc = __kmp_str_buf_print(field_buffer, format, buf.str);
+    __kmp_str_buf_free(&buf);
+  } break;
+#endif
+  default:
+    // According to spec, If an implementation does not have info for field
+    // type, then "undefined" is printed
+    rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
+    // Skip the field
+    if (parse_long_name) {
+      SKIP_TOKEN(*ptr);
+      if (**ptr == '}')
+        (*ptr)++;
+    } else if (**ptr != '\0') { // never step past the terminating NUL
+      (*ptr)++;
+    }
+  }
+
+  KMP_ASSERT(format_index <= FORMAT_SIZE);
+  return rc;
+}
+
+/*
+ * Return number of characters needed to hold the affinity string
+ * (not including null byte character)
+ * The resultant string is printed to buffer, which the caller can then
+ * handle afterwards
+*/
+size_t __kmp_aux_capture_affinity(int gtid, const char *format,
+                                  kmp_str_buf_t *buffer) {
+  const char *parse_ptr; // cursor into the format string being expanded
+  size_t retval; // running total of characters produced
+  const kmp_info_t *th;
+  kmp_str_buf_t field; // scratch buffer reused for each expanded %-field
+
+  KMP_DEBUG_ASSERT(buffer);
+  KMP_DEBUG_ASSERT(gtid >= 0);
+
+  __kmp_str_buf_init(&field);
+  __kmp_str_buf_clear(buffer);
+
+  th = __kmp_threads[gtid];
+  retval = 0;
+
+  // If format is NULL or zero-length string, then we use
+  // affinity-format-var ICV
+  parse_ptr = format;
+  if (parse_ptr == NULL || *parse_ptr == '\0') {
+    parse_ptr = __kmp_affinity_format;
+  }
+  KMP_DEBUG_ASSERT(parse_ptr);
+
+  while (*parse_ptr != '\0') {
+    // Parse a field
+    if (*parse_ptr == '%') {
+      // Put field in the buffer
+      int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
+      __kmp_str_buf_catbuf(buffer, &field);
+      retval += rc; // the field parser has already advanced parse_ptr
+    } else {
+      // Put literal character in buffer
+      __kmp_str_buf_cat(buffer, parse_ptr, 1);
+      retval++;
+      parse_ptr++;
+    }
+  }
+  __kmp_str_buf_free(&field);
+  return retval;
+}
+
+// Prints the affinity string for gtid, followed by a newline, to kmp_out
+void __kmp_aux_display_affinity(int gtid, const char *format) {
+  kmp_str_buf_t buf; // temporary buffer holding the expanded string
+  __kmp_str_buf_init(&buf);
+  __kmp_aux_capture_affinity(gtid, format, &buf);
+  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
+  __kmp_str_buf_free(&buf);
+}
+
+/* ------------------------------------------------------------------------ */
+
+void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
+  int blocktime = arg; /* argument is in milliseconds */
+#if KMP_USE_MONITOR
+  int bt_intervals; // blocktime converted to monitor wakeup intervals
+#endif
+  int bt_set;
+  // Applies the clamped blocktime to both th_team[tid] and th_serial_team[0].
+  __kmp_save_internal_controls(thread);
+
+  /* Normalize and set blocktime for the teams */
+  if (blocktime < KMP_MIN_BLOCKTIME)
+    blocktime = KMP_MIN_BLOCKTIME;
+  else if (blocktime > KMP_MAX_BLOCKTIME)
+    blocktime = KMP_MAX_BLOCKTIME;
+
+  set__blocktime_team(thread->th.th_team, tid, blocktime);
+  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
+
+#if KMP_USE_MONITOR
+  /* Calculate and set blocktime intervals for the teams */
+  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
+
+  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
+  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
+#endif
+
+  /* Set whether blocktime has been set to "TRUE" */
+  bt_set = TRUE;
+
+  set__bt_set_team(thread->th.th_team, tid, bt_set);
+  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
+#if KMP_USE_MONITOR
+  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
+                "bt_intervals=%d, monitor_updates=%d\n",
+                __kmp_gtid_from_tid(tid, thread->th.th_team),
+                thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
+                __kmp_monitor_wakeups));
+#else
+  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
+                __kmp_gtid_from_tid(tid, thread->th.th_team),
+                thread->th.th_team->t.t_id, tid, blocktime));
+#endif
+}
+
+void __kmp_aux_set_defaults(char const *str, int len) { // len is not used here
+  if (!__kmp_init_serial) { // make sure serial initialization has run first
+    __kmp_serial_initialize();
+  }
+  __kmp_env_initialize(str); // hand the settings string to the env parser
+
+  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
+    __kmp_env_print();
+  }
+} // __kmp_aux_set_defaults
+
+/* ------------------------------------------------------------------------ */
+/* internal fast reduction routines */
+
+PACKED_REDUCTION_METHOD_T
+__kmp_determine_reduction_method(
+    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
+    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
+    kmp_critical_name *lck) {
+  // Pick the reduction method (empty, critical, atomic or tree) for this team.
+  // Default reduction method: critical construct ( lck != NULL, like in current
+  // PAROPT )
+  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
+  // can be selected by RTL
+  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
+  // can be selected by RTL
+  // Finally, it's up to OpenMP RTL to make a decision on which method to select
+  // among generated by PAROPT.
+
+  PACKED_REDUCTION_METHOD_T retval;
+
+  int team_size;
+
+  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
+  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
+
+#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
+  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
+#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
+
+  retval = critical_reduce_block;
+
+  // another choice of getting a team size (with 1 dynamic dereference) is slower
+  team_size = __kmp_get_team_num_threads(global_tid);
+  if (team_size == 1) {
+
+    retval = empty_reduce_block;
+
+  } else {
+
+    int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
+
+#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
+
+#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
+    KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
+
+    int teamsize_cutoff = 4; // teams up to this size prefer atomic over tree
+
+#if KMP_MIC_SUPPORTED
+    if (__kmp_mic_type != non_mic) {
+      teamsize_cutoff = 8;
+    }
+#endif
+    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
+    if (tree_available) {
+      if (team_size <= teamsize_cutoff) {
+        if (atomic_available) {
+          retval = atomic_reduce_block;
+        }
+      } else {
+        retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
+      }
+    } else if (atomic_available) {
+      retval = atomic_reduce_block;
+    }
+#else
+#error "Unknown or unsupported OS"
+#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
+       // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
+
+#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS // AArch64 handled above
+
+#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
+
+    // basic tuning
+
+    if (atomic_available) {
+      if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
+        retval = atomic_reduce_block;
+      }
+    } // otherwise: use critical section
+
+#elif KMP_OS_DARWIN
+
+    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
+    if (atomic_available && (num_vars <= 3)) {
+      retval = atomic_reduce_block;
+    } else if (tree_available) {
+      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
+          (reduce_size < (2000 * sizeof(kmp_real64)))) {
+        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
+      }
+    } // otherwise: use critical section
+
+#else
+#error "Unknown or unsupported OS"
+#endif
+
+#else
+#error "Unknown or unsupported architecture"
+#endif
+  }
+
+  // KMP_FORCE_REDUCTION
+
+  // If the team is serialized (team_size == 1), ignore the forced reduction
+  // method and stay with the unsynchronized method (empty_reduce_block)
+  if (__kmp_force_reduction_method != reduction_method_not_defined &&
+      team_size != 1) {
+
+    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
+
+    int atomic_available, tree_available;
+
+    switch ((forced_retval = __kmp_force_reduction_method)) {
+    case critical_reduce_block:
+      KMP_ASSERT(lck); // lck should be != 0
+      break;
+
+    case atomic_reduce_block:
+      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
+      if (!atomic_available) { // forced method unavailable: warn, use critical
+        KMP_WARNING(RedMethodNotSupported, "atomic");
+        forced_retval = critical_reduce_block;
+      }
+      break;
+
+    case tree_reduce_block:
+      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
+      if (!tree_available) { // forced method unavailable: warn, use critical
+        KMP_WARNING(RedMethodNotSupported, "tree");
+        forced_retval = critical_reduce_block;
+      } else {
+#if KMP_FAST_REDUCTION_BARRIER
+        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
+#endif
+      }
+      break;
+
+    default:
+      KMP_ASSERT(0); // "unsupported method specified"
+    }
+
+    retval = forced_retval;
+  }
+
+  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
+
+#undef FAST_REDUCTION_TREE_METHOD_GENERATED
+#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
+
+  return (retval);
+}
+
+// For testing set/get/determine reduce method: packed_reduction_method >> 8
+kmp_int32 __kmp_get_reduce_method(void) {
+  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
+}
+
+// Soft pause sets up threads to ignore blocktime and just go to sleep.
+// This only sets __kmp_pause_status; spin-wait code checks it and reacts.
+void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
+
+// Hard pause shuts down the runtime completely.  Resume happens naturally when
+// OpenMP is used subsequently.
+void __kmp_hard_pause() {
+  __kmp_pause_status = kmp_hard_paused; // record the state before shutting down
+  __kmp_internal_end_thread(-1);
+}
+
+// Soft resume sets __kmp_pause_status, and wakes up all threads.
+void __kmp_resume_if_soft_paused() {
+  if (__kmp_pause_status == kmp_soft_paused) { // otherwise this is a no-op
+    __kmp_pause_status = kmp_not_paused;
+
+    for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) { // skips gtid 0
+      kmp_info_t *thread = __kmp_threads[gtid];
+      if (thread) { // Wake it if sleeping
+        kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
+        if (fl.is_sleeping())
+          fl.resume(gtid);
+        else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
+          __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
+        } else { // thread holds the lock and may sleep soon
+          do { // until either the thread sleeps, or we can get the lock
+            if (fl.is_sleeping()) {
+              fl.resume(gtid);
+              break;
+            } else if (__kmp_try_suspend_mx(thread)) {
+              __kmp_unlock_suspend_mx(thread);
+              break;
+            }
+          } while (1);
+        }
+      }
+    }
+  }
+}
+
+// This function is called via __kmpc_pause_resource. Returns 0 if successful.
+// A resume request (kmp_not_paused) succeeds only while the runtime is
+// paused; a soft or hard pause request succeeds only while it is not paused.
+// Every rejected request, including an unrecognized level, returns 1.
+// TODO: add warning messages
+int __kmp_pause_resource(kmp_pause_status_t level) {
+  switch (level) {
+  case kmp_not_paused: // requesting resume
+    if (__kmp_pause_status == kmp_not_paused) {
+      return 1; // runtime is not paused, so it cannot be resumed
+    }
+    KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
+                     __kmp_pause_status == kmp_hard_paused);
+    __kmp_pause_status = kmp_not_paused;
+    return 0;
+
+  case kmp_soft_paused: // requesting soft pause
+    if (__kmp_pause_status != kmp_not_paused) {
+      return 1; // already paused
+    }
+    __kmp_soft_pause();
+    return 0;
+
+  case kmp_hard_paused: // requesting hard pause
+    if (__kmp_pause_status != kmp_not_paused) {
+      return 1; // already paused
+    }
+    __kmp_hard_pause();
+    return 0;
+
+  default:
+    return 1; // invalid level
+  }
+}
diff --git a/final/runtime/src/kmp_safe_c_api.h b/final/runtime/src/kmp_safe_c_api.h
new file mode 100644
index 0000000..f839f73
--- /dev/null
+++ b/final/runtime/src/kmp_safe_c_api.h
@@ -0,0 +1,74 @@
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_SAFE_C_API_H
+#define KMP_SAFE_C_API_H
+
+#include "kmp_platform.h"
+#include <string.h>
+
+// Replacements for banned (unsafe) C library calls.
+
+// Not every unsafe call listed here is handled now, but keeping everything
+// in one place should be handy for future maintenance.
+#if KMP_OS_WINDOWS && KMP_MSVC_COMPAT
+
+#define RSIZE_MAX_STR (4UL << 10) // 4KB; upper bound passed by KMP_STRLEN
+
+// _malloca was suggested, but it is not a drop-in replacement for _alloca
+#define KMP_ALLOCA _alloca
+
+#define KMP_MEMCPY_S memcpy_s
+#define KMP_SNPRINTF sprintf_s
+#define KMP_SSCANF sscanf_s
+#define KMP_STRCPY_S strcpy_s
+#define KMP_STRNCPY_S strncpy_s
+
+// Use this only when buffer size is unknown (cnt doubles as the buffer size)
+#define KMP_MEMCPY(dst, src, cnt) memcpy_s(dst, cnt, src, cnt)
+
+#define KMP_STRLEN(str) strnlen_s(str, RSIZE_MAX_STR)
+
+// Use this only when buffer size is unknown (cnt doubles as the buffer size)
+#define KMP_STRNCPY(dst, src, cnt) strncpy_s(dst, cnt, src, cnt)
+
+// _TRUNCATE makes vsnprintf_s cut off output that does not fit in cnt bytes.
+#define KMP_VSNPRINTF(dst, cnt, fmt, arg)                                      \
+  vsnprintf_s(dst, cnt, _TRUNCATE, fmt, arg)
+
+#else // !(KMP_OS_WINDOWS && KMP_MSVC_COMPAT)
+
+// For now, these macros use the existing API.
+// The *_S forms accept the buffer size (bsz) but do not pass it on.
+#define KMP_ALLOCA alloca
+#define KMP_MEMCPY_S(dst, bsz, src, cnt) memcpy(dst, src, cnt)
+#define KMP_SNPRINTF snprintf
+#define KMP_SSCANF sscanf
+#define KMP_STRCPY_S(dst, bsz, src) strcpy(dst, src)
+#define KMP_STRNCPY_S(dst, bsz, src, cnt) strncpy(dst, src, cnt)
+#define KMP_VSNPRINTF vsnprintf
+#define KMP_STRNCPY strncpy
+#define KMP_STRLEN strlen
+#define KMP_MEMCPY memcpy
+
+#endif // KMP_OS_WINDOWS && KMP_MSVC_COMPAT
+
+// Copy up to src_size chars of src into buffer; always truncate and terminate
+static inline void __kmp_strncpy_truncate(char *buffer, size_t buf_size,
+                                          char const *src, size_t src_size) {
+  if (buf_size == 0)
+    return; // no room even for the terminator; buf_size - 1 would wrap
+  if (src_size >= buf_size)
+    src_size = buf_size - 1; // truncate, keeping one byte for the terminator
+  KMP_STRNCPY_S(buffer, buf_size, src, src_size);
+  // strncpy() leaves the result unterminated when src fills all src_size bytes
+  buffer[src_size] = '\0';
+}
+
+#endif // KMP_SAFE_C_API_H
diff --git a/final/runtime/src/kmp_sched.cpp b/final/runtime/src/kmp_sched.cpp
new file mode 100644
index 0000000..17c1498
--- /dev/null
+++ b/final/runtime/src/kmp_sched.cpp
@@ -0,0 +1,1004 @@
+/*
+ * kmp_sched.cpp -- static scheduling -- iteration initialization
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+/* Static scheduling initialization.
+
+  NOTE: team->t.t_nproc is a constant inside of any dispatch loop, however
+        it may change values between parallel regions.  __kmp_max_nth
+        is the largest value __kmp_nth may take, 1 is the smallest. */
+
+#include "kmp.h"
+#include "kmp_error.h"
+#include "kmp_i18n.h"
+#include "kmp_itt.h"
+#include "kmp_stats.h"
+#include "kmp_str.h"
+
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
+#ifdef KMP_DEBUG
+// printf conversion suffix per loop-index type ( d, u, lld, llu, ld ); the
+// debug traces in this file splice them into formats via __kmp_str_format
+char const *traits_t<int>::spec = "d";
+char const *traits_t<unsigned int>::spec = "u";
+char const *traits_t<long long>::spec = "lld";
+char const *traits_t<unsigned long long>::spec = "llu";
+char const *traits_t<long>::spec = "ld";
+//-------------------------------------------------------------------------
+#endif // KMP_DEBUG
+
+#if KMP_STATS_ENABLED // macro below reads the caller's pupper, plower, incr
+#define KMP_STATS_LOOP_END(stat)                                               \
+  {                                                                            \
+    kmp_int64 t;                                                               \
+    kmp_int64 u = (kmp_int64)(*pupper);                                        \
+    kmp_int64 l = (kmp_int64)(*plower);                                        \
+    kmp_int64 i = (kmp_int64)incr;                                             \
+    if (i == 1) {                                                              \
+      t = u - l + 1;                                                           \
+    } else if (i == -1) {                                                      \
+      t = l - u + 1;                                                           \
+    } else if (i > 0) {                                                        \
+      t = (u - l) / i + 1;                                                     \
+    } else {                                                                   \
+      t = (l - u) / (-i) + 1;                                                  \
+    }                                                                          \
+    KMP_COUNT_VALUE(stat, t);                                                  \
+    KMP_POP_PARTITIONED_TIMER();                                               \
+  }
+#else // !KMP_STATS_ENABLED
+#define KMP_STATS_LOOP_END(stat) /* Nothing */
+#endif // KMP_STATS_ENABLED
+
+template <typename T>
+static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
+                                  kmp_int32 schedtype, kmp_int32 *plastiter,
+                                  T *plower, T *pupper,
+                                  typename traits_t<T>::signed_t *pstride,
+                                  typename traits_t<T>::signed_t incr,
+                                  typename traits_t<T>::signed_t chunk
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+                                  ,
+                                  void *codeptr
+#endif
+                                  ) {
+  KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
+  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_static);
+  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_static_scheduling);
+  // Computes this thread's bounds, stride and last-iteration flag in place.
+  typedef typename traits_t<T>::unsigned_t UT;
+  typedef typename traits_t<T>::signed_t ST;
+  /*  this all has to be changed back to TID and such.. */
+  kmp_int32 gtid = global_tid;
+  kmp_uint32 tid; // index of this participant in the split
+  kmp_uint32 nth; // number of participants the iterations are split across
+  UT trip_count; // iteration count of the whole loop
+  kmp_team_t *team;
+  kmp_info_t *th = __kmp_threads[gtid];
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  ompt_team_info_t *team_info = NULL;
+  ompt_task_info_t *task_info = NULL;
+  ompt_work_t ompt_work_type = ompt_work_loop;
+
+  static kmp_int8 warn = 0; // set by the first thread that warns below
+
+  if (ompt_enabled.ompt_callback_work) {
+    // Only fully initialize variables needed by OMPT if OMPT is enabled.
+    team_info = __ompt_get_teaminfo(0, NULL);
+    task_info = __ompt_get_task_info_object(0);
+    // Determine workshare type
+    if (loc != NULL) {
+      if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
+        ompt_work_type = ompt_work_loop;
+      } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
+        ompt_work_type = ompt_work_sections;
+      } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
+        ompt_work_type = ompt_work_distribute;
+      } else {
+        kmp_int8 bool_res =
+            KMP_COMPARE_AND_STORE_ACQ8(&warn, (kmp_int8)0, (kmp_int8)1);
+        if (bool_res)
+          KMP_WARNING(OmptOutdatedWorkshare);
+      }
+      KMP_DEBUG_ASSERT(ompt_work_type);
+    }
+  }
+#endif
+
+  KMP_DEBUG_ASSERT(plastiter && plower && pupper && pstride);
+  KE_TRACE(10, ("__kmpc_for_static_init called (%d)\n", global_tid));
+#ifdef KMP_DEBUG
+  {
+    char *buff;
+    // create format specifiers before the debug output
+    buff = __kmp_str_format(
+        "__kmpc_for_static_init: T#%%d sched=%%d liter=%%d iter=(%%%s,"
+        " %%%s, %%%s) incr=%%%s chunk=%%%s signed?<%s>\n",
+        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec,
+        traits_t<ST>::spec, traits_t<ST>::spec, traits_t<T>::spec);
+    KD_TRACE(100, (buff, global_tid, schedtype, *plastiter, *plower, *pupper,
+                   *pstride, incr, chunk));
+    __kmp_str_free(&buff);
+  }
+#endif
+
+  if (__kmp_env_consistency_check) {
+    __kmp_push_workshare(global_tid, ct_pdo, loc);
+    if (incr == 0) {
+      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
+                            loc);
+    }
+  }
+  /* special handling for zero-trip loops */
+  if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
+    if (plastiter != NULL)
+      *plastiter = FALSE;
+    /* leave pupper and plower set to entire iteration space */
+    *pstride = incr; /* value should never be used */
+// *plower = *pupper - incr;
+// let compiler bypass the illegal loop (like for(i=1;i<10;i--))
+// THE LINE COMMENTED ABOVE CAUSED shape2F/h_tests_1.f TO HAVE A FAILURE
+// ON A ZERO-TRIP LOOP (lower=1, upper=0,stride=1) - JPH June 23, 2009.
+#ifdef KMP_DEBUG
+    {
+      char *buff;
+      // create format specifiers before the debug output
+      buff = __kmp_str_format("__kmpc_for_static_init:(ZERO TRIP) liter=%%d "
+                              "lower=%%%s upper=%%%s stride = %%%s "
+                              "signed?<%s>, loc = %%s\n",
+                              traits_t<T>::spec, traits_t<T>::spec,
+                              traits_t<ST>::spec, traits_t<T>::spec);
+      KD_TRACE(100,
+               (buff, *plastiter, *plower, *pupper, *pstride, loc->psource));
+      __kmp_str_free(&buff);
+    }
+#endif
+    KE_TRACE(10, ("__kmpc_for_static_init: T#%d return\n", global_tid));
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.ompt_callback_work) {
+      ompt_callbacks.ompt_callback(ompt_callback_work)(
+          ompt_work_type, ompt_scope_begin, &(team_info->parallel_data),
+          &(task_info->task_data), 0, codeptr);
+    }
+#endif
+    KMP_STATS_LOOP_END(OMP_loop_static_iterations);
+    return;
+  }
+
+  // Although there are schedule enumerations above kmp_ord_upper which are not
+  // schedules for "distribute", the only ones which are useful are dynamic, so
+  // cannot be seen here, since this codepath is only executed for static
+  // schedules.
+  if (schedtype > kmp_ord_upper) {
+    // we are in DISTRIBUTE construct
+    schedtype += kmp_sch_static -
+                 kmp_distribute_static; // AC: convert to usual schedule type
+    tid = th->th.th_team->t.t_master_tid;
+    team = th->th.th_team->t.t_parent; // the split is over the parent team
+  } else {
+    tid = __kmp_tid_from_gtid(global_tid);
+    team = th->th.th_team;
+  }
+
+  /* determine if "for" loop is an active worksharing construct */
+  if (team->t.t_serialized) {
+    /* serialized parallel, each thread executes whole iteration space */
+    if (plastiter != NULL)
+      *plastiter = TRUE;
+    /* leave pupper and plower set to entire iteration space */
+    *pstride =
+        (incr > 0) ? (*pupper - *plower + 1) : (-(*plower - *pupper + 1));
+
+#ifdef KMP_DEBUG
+    {
+      char *buff;
+      // create format specifiers before the debug output
+      buff = __kmp_str_format("__kmpc_for_static_init: (serial) liter=%%d "
+                              "lower=%%%s upper=%%%s stride = %%%s\n",
+                              traits_t<T>::spec, traits_t<T>::spec,
+                              traits_t<ST>::spec);
+      KD_TRACE(100, (buff, *plastiter, *plower, *pupper, *pstride));
+      __kmp_str_free(&buff);
+    }
+#endif
+    KE_TRACE(10, ("__kmpc_for_static_init: T#%d return\n", global_tid));
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.ompt_callback_work) {
+      ompt_callbacks.ompt_callback(ompt_callback_work)(
+          ompt_work_type, ompt_scope_begin, &(team_info->parallel_data),
+          &(task_info->task_data), *pstride, codeptr);
+    }
+#endif
+    KMP_STATS_LOOP_END(OMP_loop_static_iterations);
+    return;
+  }
+  nth = team->t.t_nproc;
+  if (nth == 1) { // team of one: same outcome as the serialized case above
+    if (plastiter != NULL)
+      *plastiter = TRUE;
+    *pstride =
+        (incr > 0) ? (*pupper - *plower + 1) : (-(*plower - *pupper + 1));
+#ifdef KMP_DEBUG
+    {
+      char *buff;
+      // create format specifiers before the debug output
+      buff = __kmp_str_format("__kmpc_for_static_init: (serial) liter=%%d "
+                              "lower=%%%s upper=%%%s stride = %%%s\n",
+                              traits_t<T>::spec, traits_t<T>::spec,
+                              traits_t<ST>::spec);
+      KD_TRACE(100, (buff, *plastiter, *plower, *pupper, *pstride));
+      __kmp_str_free(&buff);
+    }
+#endif
+    KE_TRACE(10, ("__kmpc_for_static_init: T#%d return\n", global_tid));
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.ompt_callback_work) {
+      ompt_callbacks.ompt_callback(ompt_callback_work)(
+          ompt_work_type, ompt_scope_begin, &(team_info->parallel_data),
+          &(task_info->task_data), *pstride, codeptr);
+    }
+#endif
+    KMP_STATS_LOOP_END(OMP_loop_static_iterations);
+    return;
+  }
+
+  /* compute trip count */
+  if (incr == 1) {
+    trip_count = *pupper - *plower + 1;
+  } else if (incr == -1) {
+    trip_count = *plower - *pupper + 1;
+  } else if (incr > 0) {
+    // upper-lower can exceed the limit of signed type
+    trip_count = (UT)(*pupper - *plower) / incr + 1;
+  } else {
+    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
+  }
+
+#if KMP_STATS_ENABLED
+  if (KMP_MASTER_GTID(gtid)) {
+    KMP_COUNT_VALUE(OMP_loop_static_total_iterations, trip_count);
+  }
+#endif
+
+  if (__kmp_env_consistency_check) {
+    /* tripcount overflow? (the unsigned count wrapped around to 0) */
+    if (trip_count == 0 && *pupper != *plower) {
+      __kmp_error_construct(kmp_i18n_msg_CnsIterationRangeTooLarge, ct_pdo,
+                            loc);
+    }
+  }
+
+  /* compute remaining parameters */
+  switch (schedtype) {
+  case kmp_sch_static: {
+    if (trip_count < nth) { // fewer iterations than threads: one each
+      KMP_DEBUG_ASSERT(
+          __kmp_static == kmp_sch_static_greedy ||
+          __kmp_static ==
+              kmp_sch_static_balanced); // Unknown static scheduling type.
+      if (tid < trip_count) {
+        *pupper = *plower = *plower + tid * incr;
+      } else {
+        *plower = *pupper + incr; // lower past upper: nothing to execute
+      }
+      if (plastiter != NULL)
+        *plastiter = (tid == trip_count - 1);
+    } else {
+      if (__kmp_static == kmp_sch_static_balanced) {
+        UT small_chunk = trip_count / nth;
+        UT extras = trip_count % nth; // first 'extras' tids get one more
+        *plower += incr * (tid * small_chunk + (tid < extras ? tid : extras));
+        *pupper = *plower + small_chunk * incr - (tid < extras ? 0 : incr);
+        if (plastiter != NULL)
+          *plastiter = (tid == nth - 1);
+      } else {
+        T big_chunk_inc_count =
+            (trip_count / nth + ((trip_count % nth) ? 1 : 0)) * incr;
+        T old_upper = *pupper;
+
+        KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
+        // Unknown static scheduling type.
+
+        *plower += tid * big_chunk_inc_count;
+        *pupper = *plower + big_chunk_inc_count - incr;
+        if (incr > 0) {
+          if (*pupper < *plower) // upper wrapped past the type's maximum
+            *pupper = traits_t<T>::max_value;
+          if (plastiter != NULL)
+            *plastiter = *plower <= old_upper && *pupper > old_upper - incr;
+          if (*pupper > old_upper)
+            *pupper = old_upper; // tracker C73258
+        } else {
+          if (*pupper > *plower) // upper wrapped past the type's minimum
+            *pupper = traits_t<T>::min_value;
+          if (plastiter != NULL)
+            *plastiter = *plower >= old_upper && *pupper < old_upper - incr;
+          if (*pupper < old_upper)
+            *pupper = old_upper; // tracker C73258
+        }
+      }
+    }
+    *pstride = trip_count;
+    break;
+  }
+  case kmp_sch_static_chunked: {
+    ST span;
+    if (chunk < 1) {
+      chunk = 1;
+    }
+    span = chunk * incr; // distance covered by one chunk
+    *pstride = span * nth; // distance between chunks of the same thread
+    *plower = *plower + (span * tid);
+    *pupper = *plower + span - incr;
+    if (plastiter != NULL) // true for the tid that owns the final chunk
+      *plastiter = (tid == ((trip_count - 1) / (UT)chunk) % nth);
+    break;
+  }
+  case kmp_sch_static_balanced_chunked: {
+    T old_upper = *pupper;
+    // round up to make sure the chunk is enough to cover all iterations
+    UT span = (trip_count + nth - 1) / nth;
+    // perform chunk adjustment: round span up to a multiple of chunk
+    // NOTE(review): the mask form presumes chunk is a power of two - confirm
+    chunk = (span + chunk - 1) & ~(chunk - 1);
+
+    span = chunk * incr;
+    *plower = *plower + (span * tid);
+    *pupper = *plower + span - incr;
+    if (incr > 0) {
+      if (*pupper > old_upper)
+        *pupper = old_upper;
+    } else if (*pupper < old_upper)
+      *pupper = old_upper;
+    // NOTE(review): *pstride is not written by this case - confirm intended
+    if (plastiter != NULL)
+      *plastiter = (tid == ((trip_count - 1) / (UT)chunk));
+    break;
+  }
+  default:
+    KMP_ASSERT2(0, "__kmpc_for_static_init: unknown scheduling type");
+    break;
+  }
+
+#if USE_ITT_BUILD
+  // Report loop metadata
+  if (KMP_MASTER_TID(tid) && __itt_metadata_add_ptr &&
+      __kmp_forkjoin_frames_mode == 3 && th->th.th_teams_microtask == NULL &&
+      team->t.t_active_level == 1) {
+    kmp_uint64 cur_chunk = chunk;
+    // Calculate chunk in case it was not specified; it is specified for
+    // kmp_sch_static_chunked
+    if (schedtype == kmp_sch_static) {
+      cur_chunk = trip_count / nth + ((trip_count % nth) ? 1 : 0);
+    }
+    // 0 - "static" schedule
+    __kmp_itt_metadata_loop(loc, 0, trip_count, cur_chunk);
+  }
+#endif
+#ifdef KMP_DEBUG
+  {
+    char *buff;
+    // create format specifiers before the debug output
+    buff = __kmp_str_format("__kmpc_for_static_init: liter=%%d lower=%%%s "
+                            "upper=%%%s stride = %%%s signed?<%s>\n",
+                            traits_t<T>::spec, traits_t<T>::spec,
+                            traits_t<ST>::spec, traits_t<T>::spec);
+    KD_TRACE(100, (buff, *plastiter, *plower, *pupper, *pstride));
+    __kmp_str_free(&buff);
+  }
+#endif
+  KE_TRACE(10, ("__kmpc_for_static_init: T#%d return\n", global_tid));
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.ompt_callback_work) {
+    ompt_callbacks.ompt_callback(ompt_callback_work)(
+        ompt_work_type, ompt_scope_begin, &(team_info->parallel_data),
+        &(task_info->task_data), trip_count, codeptr);
+  }
+#endif
+
+  KMP_STATS_LOOP_END(OMP_loop_static_iterations);
+  return;
+}
+
+template <typename T>
+static void __kmp_dist_for_static_init(ident_t *loc, kmp_int32 gtid,
+                                       kmp_int32 schedule, kmp_int32 *plastiter,
+                                       T *plower, T *pupper, T *pupperDist,
+                                       typename traits_t<T>::signed_t *pstride,
+                                       typename traits_t<T>::signed_t incr,
+                                       typename traits_t<T>::signed_t chunk) {
+  KMP_COUNT_BLOCK(OMP_DISTRIBUTE);
+  KMP_PUSH_PARTITIONED_TIMER(OMP_distribute);
+  KMP_PUSH_PARTITIONED_TIMER(OMP_distribute_scheduling);
+  typedef typename traits_t<T>::unsigned_t UT;
+  typedef typename traits_t<T>::signed_t ST;
+  kmp_uint32 tid;
+  kmp_uint32 nth;
+  kmp_uint32 team_id;
+  kmp_uint32 nteams;
+  UT trip_count;
+  kmp_team_t *team;
+  kmp_info_t *th;
+  // Two-level split: the team's chunk (*pupperDist) first, then the thread's.
+  KMP_DEBUG_ASSERT(plastiter && plower && pupper && pupperDist && pstride);
+  KE_TRACE(10, ("__kmpc_dist_for_static_init called (%d)\n", gtid));
+#ifdef KMP_DEBUG
+  {
+    char *buff;
+    // create format specifiers before the debug output
+    buff = __kmp_str_format(
+        "__kmpc_dist_for_static_init: T#%%d schedLoop=%%d liter=%%d "
+        "iter=(%%%s, %%%s, %%%s) chunk=%%%s signed?<%s>\n",
+        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec,
+        traits_t<ST>::spec, traits_t<T>::spec);
+    KD_TRACE(100,
+             (buff, gtid, schedule, *plastiter, *plower, *pupper, incr, chunk));
+    __kmp_str_free(&buff);
+  }
+#endif
+
+  if (__kmp_env_consistency_check) {
+    __kmp_push_workshare(gtid, ct_pdo, loc);
+    if (incr == 0) {
+      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
+                            loc);
+    }
+    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
+      // The loop is illegal.
+      // Some zero-trip loops maintained by compiler, e.g.:
+      //   for(i=10;i<0;++i) // lower >= upper - run-time check
+      //   for(i=0;i>10;--i) // lower <= upper - run-time check
+      //   for(i=0;i>10;++i) // incr > 0       - compile-time check
+      //   for(i=10;i<0;--i) // incr < 0       - compile-time check
+      // Compiler does not check the following illegal loops:
+      //   for(i=0;i<10;i+=incr) // where incr<0
+      //   for(i=10;i>0;i-=incr) // where incr<0
+      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
+    }
+  }
+  tid = __kmp_tid_from_gtid(gtid);
+  th = __kmp_threads[gtid];
+  nth = th->th.th_team_nproc;
+  team = th->th.th_team;
+  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
+  nteams = th->th.th_teams_size.nteams;
+  team_id = team->t.t_master_tid;
+  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
+
+  // compute global trip count
+  if (incr == 1) {
+    trip_count = *pupper - *plower + 1;
+  } else if (incr == -1) {
+    trip_count = *plower - *pupper + 1;
+  } else if (incr > 0) {
+    // upper-lower can exceed the limit of signed type
+    trip_count = (UT)(*pupper - *plower) / incr + 1;
+  } else {
+    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
+  }
+
+  *pstride = *pupper - *plower; // just in case (can be unused)
+  if (trip_count <= nteams) {
+    KMP_DEBUG_ASSERT(
+        __kmp_static == kmp_sch_static_greedy ||
+        __kmp_static ==
+            kmp_sch_static_balanced); // Unknown static scheduling type.
+    // only masters of some teams get single iteration, other threads get
+    // nothing
+    if (team_id < trip_count && tid == 0) {
+      *pupper = *pupperDist = *plower = *plower + team_id * incr;
+    } else {
+      *pupperDist = *pupper;
+      *plower = *pupper + incr; // compiler should skip loop body
+    }
+    if (plastiter != NULL)
+      *plastiter = (tid == 0 && team_id == trip_count - 1);
+  } else {
+    // Get the team's chunk first (each team gets at most one chunk)
+    if (__kmp_static == kmp_sch_static_balanced) {
+      UT chunkD = trip_count / nteams;
+      UT extras = trip_count % nteams; // first 'extras' teams get one more
+      *plower +=
+          incr * (team_id * chunkD + (team_id < extras ? team_id : extras));
+      *pupperDist = *plower + chunkD * incr - (team_id < extras ? 0 : incr);
+      if (plastiter != NULL)
+        *plastiter = (team_id == nteams - 1);
+    } else {
+      T chunk_inc_count =
+          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
+      T upper = *pupper;
+      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
+      // Unknown static scheduling type.
+      *plower += team_id * chunk_inc_count;
+      *pupperDist = *plower + chunk_inc_count - incr;
+      // Check/correct bounds if needed
+      if (incr > 0) {
+        if (*pupperDist < *plower) // wrapped past the type's maximum
+          *pupperDist = traits_t<T>::max_value;
+        if (plastiter != NULL)
+          *plastiter = *plower <= upper && *pupperDist > upper - incr;
+        if (*pupperDist > upper)
+          *pupperDist = upper; // tracker C73258
+        if (*plower > *pupperDist) {
+          *pupper = *pupperDist; // no iterations available for the team
+          goto end;
+        }
+      } else {
+        if (*pupperDist > *plower) // wrapped past the type's minimum
+          *pupperDist = traits_t<T>::min_value;
+        if (plastiter != NULL)
+          *plastiter = *plower >= upper && *pupperDist < upper - incr;
+        if (*pupperDist < upper)
+          *pupperDist = upper; // tracker C73258
+        if (*plower < *pupperDist) {
+          *pupper = *pupperDist; // no iterations available for the team
+          goto end;
+        }
+      }
+    }
+    // Get the parallel loop chunk now (for thread)
+    // compute trip count for team's chunk
+    if (incr == 1) {
+      trip_count = *pupperDist - *plower + 1;
+    } else if (incr == -1) {
+      trip_count = *plower - *pupperDist + 1;
+    } else if (incr > 1) { // incr == 1 was handled above; same as incr > 0
+      // upper-lower can exceed the limit of signed type
+      trip_count = (UT)(*pupperDist - *plower) / incr + 1;
+    } else {
+      trip_count = (UT)(*plower - *pupperDist) / (-incr) + 1;
+    }
+    KMP_DEBUG_ASSERT(trip_count);
+    switch (schedule) {
+    case kmp_sch_static: {
+      if (trip_count <= nth) {
+        KMP_DEBUG_ASSERT(
+            __kmp_static == kmp_sch_static_greedy ||
+            __kmp_static ==
+                kmp_sch_static_balanced); // Unknown static scheduling type.
+        if (tid < trip_count)
+          *pupper = *plower = *plower + tid * incr;
+        else
+          *plower = *pupper + incr; // no iterations available
+        if (plastiter != NULL) // only ever clears the team-level flag
+          if (*plastiter != 0 && !(tid == trip_count - 1))
+            *plastiter = 0;
+      } else {
+        if (__kmp_static == kmp_sch_static_balanced) {
+          UT chunkL = trip_count / nth;
+          UT extras = trip_count % nth; // first 'extras' tids get one more
+          *plower += incr * (tid * chunkL + (tid < extras ? tid : extras));
+          *pupper = *plower + chunkL * incr - (tid < extras ? 0 : incr);
+          if (plastiter != NULL)
+            if (*plastiter != 0 && !(tid == nth - 1))
+              *plastiter = 0;
+        } else {
+          T chunk_inc_count =
+              (trip_count / nth + ((trip_count % nth) ? 1 : 0)) * incr;
+          T upper = *pupperDist;
+          KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
+          // Unknown static scheduling type.
+          *plower += tid * chunk_inc_count;
+          *pupper = *plower + chunk_inc_count - incr;
+          if (incr > 0) {
+            if (*pupper < *plower) // wrapped past the type's maximum
+              *pupper = traits_t<T>::max_value;
+            if (plastiter != NULL)
+              if (*plastiter != 0 &&
+                  !(*plower <= upper && *pupper > upper - incr))
+                *plastiter = 0;
+            if (*pupper > upper)
+              *pupper = upper; // tracker C73258
+          } else {
+            if (*pupper > *plower) // wrapped past the type's minimum
+              *pupper = traits_t<T>::min_value;
+            if (plastiter != NULL)
+              if (*plastiter != 0 &&
+                  !(*plower >= upper && *pupper < upper - incr))
+                *plastiter = 0;
+            if (*pupper < upper)
+              *pupper = upper; // tracker C73258
+          }
+        }
+      }
+      break;
+    }
+    case kmp_sch_static_chunked: {
+      ST span;
+      if (chunk < 1)
+        chunk = 1;
+      span = chunk * incr; // distance covered by one chunk
+      *pstride = span * nth; // distance between chunks of the same thread
+      *plower = *plower + (span * tid);
+      *pupper = *plower + span - incr;
+      if (plastiter != NULL)
+        if (*plastiter != 0 && !(tid == ((trip_count - 1) / (UT)chunk) % nth))
+          *plastiter = 0;
+      break;
+    }
+    default:
+      KMP_ASSERT2(0,
+                  "__kmpc_dist_for_static_init: unknown loop scheduling type");
+      break;
+    }
+  }
+end:; // teams left without iterations jump straight here
+#ifdef KMP_DEBUG
+  {
+    char *buff;
+    // create format specifiers before the debug output
+    buff = __kmp_str_format(
+        "__kmpc_dist_for_static_init: last=%%d lo=%%%s up=%%%s upDist=%%%s "
+        "stride=%%%s signed?<%s>\n",
+        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec,
+        traits_t<ST>::spec, traits_t<T>::spec);
+    KD_TRACE(100, (buff, *plastiter, *plower, *pupper, *pupperDist, *pstride));
+    __kmp_str_free(&buff);
+  }
+#endif
+  KE_TRACE(10, ("__kmpc_dist_for_static_init: T#%d return\n", gtid));
+  KMP_STATS_LOOP_END(OMP_distribute_iterations);
+  return;
+}
+
+template <typename T>
+static void __kmp_team_static_init(ident_t *loc, kmp_int32 gtid,
+                                   kmp_int32 *p_last, T *p_lb, T *p_ub,
+                                   typename traits_t<T>::signed_t *p_st,
+                                   typename traits_t<T>::signed_t incr,
+                                   typename traits_t<T>::signed_t chunk) {
+  // The routine returns the first chunk distributed to the team (*p_lb,
+  // *p_ub) and the stride for next chunks calculation (*p_st).
+  // The last iteration flag (*p_last) is set for the team that will execute
+  // the last iteration of the loop.
+  // The routine is called for dist_schedule(static,chunk) only.
+  typedef typename traits_t<T>::unsigned_t UT;
+  typedef typename traits_t<T>::signed_t ST;
+  kmp_uint32 team_id;
+  kmp_uint32 nteams;
+  UT trip_count;
+  T lower; // loop bounds as passed in, before *p_lb / *p_ub are rewritten
+  T upper;
+  ST span;
+  kmp_team_t *team;
+  kmp_info_t *th;
+
+  KMP_DEBUG_ASSERT(p_last && p_lb && p_ub && p_st);
+  KE_TRACE(10, ("__kmp_team_static_init called (%d)\n", gtid));
+#ifdef KMP_DEBUG
+  {
+    char *buff;
+    // create format specifiers before the debug output
+    buff = __kmp_str_format("__kmp_team_static_init enter: T#%%d liter=%%d "
+                            "iter=(%%%s, %%%s, %%%s) chunk %%%s; signed?<%s>\n",
+                            traits_t<T>::spec, traits_t<T>::spec,
+                            traits_t<ST>::spec, traits_t<ST>::spec,
+                            traits_t<T>::spec);
+    KD_TRACE(100, (buff, gtid, *p_last, *p_lb, *p_ub, *p_st, chunk));
+    __kmp_str_free(&buff);
+  }
+#endif
+
+  lower = *p_lb;
+  upper = *p_ub;
+  if (__kmp_env_consistency_check) {
+    if (incr == 0) {
+      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
+                            loc);
+    }
+    if (incr > 0 ? (upper < lower) : (lower < upper)) {
+      // The loop is illegal.
+      // Some zero-trip loops maintained by compiler, e.g.:
+      //   for(i=10;i<0;++i) // lower >= upper - run-time check
+      //   for(i=0;i>10;--i) // lower <= upper - run-time check
+      //   for(i=0;i>10;++i) // incr > 0       - compile-time check
+      //   for(i=10;i<0;--i) // incr < 0       - compile-time check
+      // Compiler does not check the following illegal loops:
+      //   for(i=0;i<10;i+=incr) // where incr<0
+      //   for(i=10;i>0;i-=incr) // where incr<0
+      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
+    }
+  }
+  th = __kmp_threads[gtid];
+  team = th->th.th_team;
+  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
+  nteams = th->th.th_teams_size.nteams;
+  team_id = team->t.t_master_tid;
+  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
+
+  // compute trip count
+  if (incr == 1) {
+    trip_count = upper - lower + 1;
+  } else if (incr == -1) {
+    trip_count = lower - upper + 1;
+  } else if (incr > 0) {
+    // upper-lower can exceed the limit of signed type
+    trip_count = (UT)(upper - lower) / incr + 1;
+  } else {
+    trip_count = (UT)(lower - upper) / (-incr) + 1;
+  }
+  if (chunk < 1)
+    chunk = 1;
+  span = chunk * incr; // distance covered by one chunk
+  *p_st = span * nteams; // distance between chunks of the same team
+  *p_lb = lower + (span * team_id);
+  *p_ub = *p_lb + span - incr;
+  if (p_last != NULL) // true for the team that owns the final chunk
+    *p_last = (team_id == ((trip_count - 1) / (UT)chunk) % nteams);
+  // Correct upper bound if needed
+  if (incr > 0) {
+    if (*p_ub < *p_lb) // overflow?
+      *p_ub = traits_t<T>::max_value;
+    if (*p_ub > upper)
+      *p_ub = upper; // tracker C73258
+  } else { // incr < 0
+    if (*p_ub > *p_lb) // overflow? (mirror of the incr > 0 case)
+      *p_ub = traits_t<T>::min_value;
+    if (*p_ub < upper)
+      *p_ub = upper; // tracker C73258
+  }
+#ifdef KMP_DEBUG
+  {
+    char *buff;
+    // create format specifiers before the debug output
+    buff =
+        __kmp_str_format("__kmp_team_static_init exit: T#%%d team%%u liter=%%d "
+                         "iter=(%%%s, %%%s, %%%s) chunk %%%s\n",
+                         traits_t<T>::spec, traits_t<T>::spec,
+                         traits_t<ST>::spec, traits_t<ST>::spec);
+    KD_TRACE(100, (buff, gtid, team_id, *p_last, *p_lb, *p_ub, *p_st, chunk));
+    __kmp_str_free(&buff);
+  }
+#endif
+}
+
+//------------------------------------------------------------------------------
+extern "C" {
+/*!
+@ingroup WORK_SHARING
+@param    loc       Source code location
+@param    gtid      Global thread id of this thread
+@param    schedtype  Scheduling type
+@param    plastiter Pointer to the "last iteration" flag
+@param    plower    Pointer to the lower bound
+@param    pupper    Pointer to the upper bound
+@param    pstride   Pointer to the stride
+@param    incr      Loop increment
+@param    chunk     The chunk size
+
+Each of the four functions here is identical apart from the argument types.
+
+The functions compute the upper and lower bounds and stride to be used for the
+set of iterations to be executed by the current thread from the statically
+scheduled loop that is described by the initial values of the bounds, stride,
+increment and chunk size.
+
+This variant takes signed 32-bit bounds and forwards every argument unchanged
+to the __kmp_for_static_init<kmp_int32> template.
+
+@{
+*/
+void __kmpc_for_static_init_4(ident_t *loc, kmp_int32 gtid, kmp_int32 schedtype,
+                              kmp_int32 *plastiter, kmp_int32 *plower,
+                              kmp_int32 *pupper, kmp_int32 *pstride,
+                              kmp_int32 incr, kmp_int32 chunk) {
+  // When OMPT_SUPPORT && OMPT_OPTIONAL is enabled, the value of
+  // OMPT_GET_RETURN_ADDRESS(0) is appended as one extra trailing argument.
+  __kmp_for_static_init<kmp_int32>(loc, gtid, schedtype, plastiter, plower,
+                                   pupper, pstride, incr, chunk
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+                                   ,
+                                   OMPT_GET_RETURN_ADDRESS(0)
+#endif
+                                       );
+}
+
+/*!
+ See @ref __kmpc_for_static_init_4
+
+ Unsigned 32-bit variant: the bounds are kmp_uint32 while the stride,
+ increment and chunk stay signed (kmp_int32). Forwards to
+ __kmp_for_static_init<kmp_uint32>.
+ */
+void __kmpc_for_static_init_4u(ident_t *loc, kmp_int32 gtid,
+                               kmp_int32 schedtype, kmp_int32 *plastiter,
+                               kmp_uint32 *plower, kmp_uint32 *pupper,
+                               kmp_int32 *pstride, kmp_int32 incr,
+                               kmp_int32 chunk) {
+  // Extra trailing argument is only passed when OMPT_SUPPORT && OMPT_OPTIONAL.
+  __kmp_for_static_init<kmp_uint32>(loc, gtid, schedtype, plastiter, plower,
+                                    pupper, pstride, incr, chunk
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+                                    ,
+                                    OMPT_GET_RETURN_ADDRESS(0)
+#endif
+                                        );
+}
+
+/*!
+ See @ref __kmpc_for_static_init_4
+
+ Signed 64-bit variant: bounds, stride, increment and chunk are all
+ kmp_int64. Forwards to __kmp_for_static_init<kmp_int64>.
+ */
+void __kmpc_for_static_init_8(ident_t *loc, kmp_int32 gtid, kmp_int32 schedtype,
+                              kmp_int32 *plastiter, kmp_int64 *plower,
+                              kmp_int64 *pupper, kmp_int64 *pstride,
+                              kmp_int64 incr, kmp_int64 chunk) {
+  // Extra trailing argument is only passed when OMPT_SUPPORT && OMPT_OPTIONAL.
+  __kmp_for_static_init<kmp_int64>(loc, gtid, schedtype, plastiter, plower,
+                                   pupper, pstride, incr, chunk
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+                                   ,
+                                   OMPT_GET_RETURN_ADDRESS(0)
+#endif
+                                       );
+}
+
+/*!
+ See @ref __kmpc_for_static_init_4
+
+ Unsigned 64-bit variant: the bounds are kmp_uint64 while the stride,
+ increment and chunk stay signed (kmp_int64). Forwards to
+ __kmp_for_static_init<kmp_uint64>.
+ */
+void __kmpc_for_static_init_8u(ident_t *loc, kmp_int32 gtid,
+                               kmp_int32 schedtype, kmp_int32 *plastiter,
+                               kmp_uint64 *plower, kmp_uint64 *pupper,
+                               kmp_int64 *pstride, kmp_int64 incr,
+                               kmp_int64 chunk) {
+  // Extra trailing argument is only passed when OMPT_SUPPORT && OMPT_OPTIONAL.
+  __kmp_for_static_init<kmp_uint64>(loc, gtid, schedtype, plastiter, plower,
+                                    pupper, pstride, incr, chunk
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+                                    ,
+                                    OMPT_GET_RETURN_ADDRESS(0)
+#endif
+                                        );
+}
+/*!
+@}
+*/
+
+/*!
+@ingroup WORK_SHARING
+@param    loc       Source code location
+@param    gtid      Global thread id of this thread
+@param    schedule  Scheduling type for the parallel loop
+@param    plastiter Pointer to the "last iteration" flag
+@param    plower    Pointer to the lower bound
+@param    pupper    Pointer to the upper bound of loop chunk
+@param    pupperD   Pointer to the upper bound of dist_chunk
+@param    pstride   Pointer to the stride for parallel loop
+@param    incr      Loop increment
+@param    chunk     The chunk size for the parallel loop
+
+Each of the four functions here is identical apart from the argument types.
+
+The functions compute the upper and lower bounds and strides to be used for the
+set of iterations to be executed by the current thread from the statically
+scheduled loop that is described by the initial values of the bounds, strides,
+increment and chunks for parallel loop and distribute constructs.
+
+This variant takes signed 32-bit bounds and forwards every argument unchanged
+to the __kmp_dist_for_static_init<kmp_int32> template.
+
+@{
+*/
+void __kmpc_dist_for_static_init_4(ident_t *loc, kmp_int32 gtid,
+                                   kmp_int32 schedule, kmp_int32 *plastiter,
+                                   kmp_int32 *plower, kmp_int32 *pupper,
+                                   kmp_int32 *pupperD, kmp_int32 *pstride,
+                                   kmp_int32 incr, kmp_int32 chunk) {
+  __kmp_dist_for_static_init<kmp_int32>(loc, gtid, schedule, plastiter, plower,
+                                        pupper, pupperD, pstride, incr, chunk);
+}
+
+/*!
+ See @ref __kmpc_dist_for_static_init_4
+
+ Unsigned 32-bit variant: the three bounds are kmp_uint32 while the stride,
+ increment and chunk stay signed (kmp_int32). Forwards to
+ __kmp_dist_for_static_init<kmp_uint32>.
+ */
+void __kmpc_dist_for_static_init_4u(ident_t *loc, kmp_int32 gtid,
+                                    kmp_int32 schedule, kmp_int32 *plastiter,
+                                    kmp_uint32 *plower, kmp_uint32 *pupper,
+                                    kmp_uint32 *pupperD, kmp_int32 *pstride,
+                                    kmp_int32 incr, kmp_int32 chunk) {
+  __kmp_dist_for_static_init<kmp_uint32>(loc, gtid, schedule, plastiter, plower,
+                                         pupper, pupperD, pstride, incr, chunk);
+}
+
+/*!
+ See @ref __kmpc_dist_for_static_init_4
+
+ Signed 64-bit variant: bounds, stride, increment and chunk are all
+ kmp_int64. Forwards to __kmp_dist_for_static_init<kmp_int64>.
+ */
+void __kmpc_dist_for_static_init_8(ident_t *loc, kmp_int32 gtid,
+                                   kmp_int32 schedule, kmp_int32 *plastiter,
+                                   kmp_int64 *plower, kmp_int64 *pupper,
+                                   kmp_int64 *pupperD, kmp_int64 *pstride,
+                                   kmp_int64 incr, kmp_int64 chunk) {
+  __kmp_dist_for_static_init<kmp_int64>(loc, gtid, schedule, plastiter, plower,
+                                        pupper, pupperD, pstride, incr, chunk);
+}
+
+/*!
+ See @ref __kmpc_dist_for_static_init_4
+
+ Unsigned 64-bit variant: the three bounds are kmp_uint64 while the stride,
+ increment and chunk stay signed (kmp_int64). Forwards to
+ __kmp_dist_for_static_init<kmp_uint64>.
+ */
+void __kmpc_dist_for_static_init_8u(ident_t *loc, kmp_int32 gtid,
+                                    kmp_int32 schedule, kmp_int32 *plastiter,
+                                    kmp_uint64 *plower, kmp_uint64 *pupper,
+                                    kmp_uint64 *pupperD, kmp_int64 *pstride,
+                                    kmp_int64 incr, kmp_int64 chunk) {
+  __kmp_dist_for_static_init<kmp_uint64>(loc, gtid, schedule, plastiter, plower,
+                                         pupper, pupperD, pstride, incr, chunk);
+}
+/*!
+@}
+*/
+
+//------------------------------------------------------------------------------
+// Auxiliary routines for Distribute Parallel Loop construct implementation
+//    Transfer call to template< type T >
+//    __kmp_team_static_init( ident_t *loc, int gtid,
+//        int *p_last, T *lb, T *ub, ST *st, ST incr, ST chunk )
+
+/*!
+@ingroup WORK_SHARING
+@{
+@param loc Source location
+@param gtid Global thread id
+@param p_last pointer to last iteration flag
+@param p_lb  pointer to Lower bound
+@param p_ub  pointer to Upper bound
+@param p_st  pointer to the stride
+@param incr  Loop increment
+@param chunk The chunk size to block with
+
+The functions compute the upper and lower bounds and stride to be used for the
+set of iterations to be executed by the current team from the statically
+scheduled loop that is described by the initial values of the bounds, stride,
+increment and chunk for the distribute construct as part of composite distribute
+parallel loop construct. These functions are all identical apart from the types
+of the arguments.
+
+This variant takes signed 32-bit bounds and forwards every argument unchanged
+to the __kmp_team_static_init<kmp_int32> template.
+*/
+
+void __kmpc_team_static_init_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
+                               kmp_int32 *p_lb, kmp_int32 *p_ub,
+                               kmp_int32 *p_st, kmp_int32 incr,
+                               kmp_int32 chunk) {
+  // __kmp_init_serial is checked through KMP_DEBUG_ASSERT before forwarding.
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+  __kmp_team_static_init<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st, incr,
+                                    chunk);
+}
+
+/*!
+ See @ref __kmpc_team_static_init_4
+
+ Unsigned 32-bit variant: the bounds are kmp_uint32 while the stride,
+ increment and chunk stay signed (kmp_int32). Forwards to
+ __kmp_team_static_init<kmp_uint32>.
+ */
+void __kmpc_team_static_init_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
+                                kmp_uint32 *p_lb, kmp_uint32 *p_ub,
+                                kmp_int32 *p_st, kmp_int32 incr,
+                                kmp_int32 chunk) {
+  // __kmp_init_serial is checked through KMP_DEBUG_ASSERT before forwarding.
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+  __kmp_team_static_init<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st, incr,
+                                     chunk);
+}
+
+/*!
+ See @ref __kmpc_team_static_init_4
+
+ Signed 64-bit variant: bounds, stride, increment and chunk are all
+ kmp_int64. Forwards to __kmp_team_static_init<kmp_int64>.
+ */
+void __kmpc_team_static_init_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
+                               kmp_int64 *p_lb, kmp_int64 *p_ub,
+                               kmp_int64 *p_st, kmp_int64 incr,
+                               kmp_int64 chunk) {
+  // __kmp_init_serial is checked through KMP_DEBUG_ASSERT before forwarding.
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+  __kmp_team_static_init<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st, incr,
+                                    chunk);
+}
+
+/*!
+ See @ref __kmpc_team_static_init_4
+
+ Unsigned 64-bit variant: the bounds are kmp_uint64 while the stride,
+ increment and chunk stay signed (kmp_int64). Forwards to
+ __kmp_team_static_init<kmp_uint64>.
+ */
+void __kmpc_team_static_init_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
+                                kmp_uint64 *p_lb, kmp_uint64 *p_ub,
+                                kmp_int64 *p_st, kmp_int64 incr,
+                                kmp_int64 chunk) {
+  // __kmp_init_serial is checked through KMP_DEBUG_ASSERT before forwarding.
+  KMP_DEBUG_ASSERT(__kmp_init_serial);
+  __kmp_team_static_init<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st, incr,
+                                     chunk);
+}
+/*!
+@}
+*/
+
+} // extern "C"
diff --git a/final/runtime/src/kmp_settings.cpp b/final/runtime/src/kmp_settings.cpp
new file mode 100644
index 0000000..692ca26
--- /dev/null
+++ b/final/runtime/src/kmp_settings.cpp
@@ -0,0 +1,5760 @@
+/*
+ * kmp_settings.cpp -- Initialize environment variables
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+#include "kmp_affinity.h"
+#include "kmp_atomic.h"
+#if KMP_USE_HIER_SCHED
+#include "kmp_dispatch_hier.h"
+#endif
+#include "kmp_environment.h"
+#include "kmp_i18n.h"
+#include "kmp_io.h"
+#include "kmp_itt.h"
+#include "kmp_lock.h"
+#include "kmp_settings.h"
+#include "kmp_str.h"
+#include "kmp_wrapper_getpid.h"
+#include <ctype.h> // toupper()
+
+// Forward declaration; the function is static, so its definition has to
+// appear later in this translation unit.
+static int __kmp_env_toPrint(char const *name, int flag);
+
+// Selects which of two output layouts the __kmp_stg_print_* helpers in this
+// file emit.
+bool __kmp_env_format = 0; // 0 - old format; 1 - new format
+
+// -----------------------------------------------------------------------------
+// Helper string functions. Subject to move to kmp_str.
+
+#ifdef USE_LOAD_BALANCE
+// Scans a floating-point number from the start of `s` with the "%lf"
+// conversion. Yields 0.0 when the scan does not produce a value.
+static double __kmp_convert_to_double(char const *s) {
+  double parsed;
+
+  if (KMP_SSCANF(s, "%lf", &parsed) >= 1) {
+    return parsed;
+  }
+  return 0.0;
+}
+#endif
+
+#ifdef KMP_DEBUG
+// Copies characters from `src` to `dest` until `len` characters have been
+// copied, the end of `src` is reached, or `sentinel` is seen (the sentinel is
+// not copied). `dest` is always NUL-terminated afterwards, so it must have
+// room for len + 1 characters. Returns the number of characters copied.
+static unsigned int __kmp_readstr_with_sentinel(char *dest, char const *src,
+                                                size_t len, char sentinel) {
+  unsigned int copied = 0;
+  while (copied < len && *src != '\0' && *src != sentinel) {
+    *dest = *src;
+    ++dest;
+    ++src;
+    ++copied;
+  }
+  *dest = '\0';
+  return copied;
+}
+#endif
+
+// Compares `a` and `b` character by character, ignoring the case of ASCII
+// letters, until either string ends or `b` reaches `sentinel`. A NULL pointer
+// is treated as an empty string. Returns FALSE on the first mismatch;
+// otherwise returns whether at least `len` characters were matched.
+static int __kmp_match_with_sentinel(char const *a, char const *b, size_t len,
+                                     char sentinel) {
+  char const *pa = (a != NULL) ? a : "";
+  char const *pb = (b != NULL) ? b : "";
+  size_t matched = 0;
+
+  for (; *pa && *pb && *pb != sentinel; ++pa, ++pb, ++matched) {
+    char ua = *pa;
+    char ub = *pb;
+
+    // Fold ASCII lower-case letters to upper case before comparing.
+    if ('a' <= ua && ua <= 'z')
+      ua -= 'a' - 'A';
+    if ('a' <= ub && ub <= 'z')
+      ub -= 'a' - 'A';
+    if (ua != ub)
+      return FALSE;
+  }
+  return matched >= len;
+}
+
+// Checks whether `buf` starts with `token`, ignoring the case of ASCII
+// letters.
+//
+//     token  the token to check for; it must be matched in full.
+//     buf    the string being parsed.
+//     end    on a match, *end receives the position of the char after the end
+//            of the token; it is not modified unless a match occurs.
+//
+// Returns TRUE on a match and FALSE otherwise.
+//
+// Example 1:
+//
+//     char const *end;
+//     if (__kmp_match_str("token", buf, &end)) {
+//         <do something>
+//         buf = end;
+//     }
+//
+// Example 2 (only applicable when the parsed buffer is writable):
+//
+//     if (__kmp_match_str("token", buf, &end)) {
+//         char save = *end;
+//         *end = sentinel;
+//         <use any of the __kmp*_with_sentinel() functions>
+//         *end = save;
+//         buf = end;
+//     }
+
+static int __kmp_match_str(char const *token, char const *buf,
+                           const char **end) {
+
+  KMP_ASSERT(token != NULL);
+  KMP_ASSERT(buf != NULL);
+  KMP_ASSERT(end != NULL);
+
+  for (; *token && *buf; ++token, ++buf) {
+    char ut = *token;
+    char ub = *buf;
+
+    // Fold ASCII lower-case letters to upper case before comparing.
+    if ('a' <= ut && ut <= 'z')
+      ut -= 'a' - 'A';
+    if ('a' <= ub && ub <= 'z')
+      ub -= 'a' - 'A';
+    if (ut != ub)
+      return FALSE;
+  }
+  // buf ran out before the whole token was seen.
+  if (*token != '\0') {
+    return FALSE;
+  }
+  *end = buf;
+  return TRUE;
+}
+
+#if KMP_OS_DARWIN
+// Rounds `size` up to the next multiple of 4 KiB. Values that are already a
+// multiple are returned unchanged. If adding 4 KiB would exceed
+// KMP_SIZE_T_MAX, the value is rounded down instead.
+static size_t __kmp_round4k(size_t size) {
+  const size_t page = 4 * 1024;
+  const size_t remainder = size & (page - 1);
+  if (remainder == 0) {
+    return size;
+  }
+  size_t rounded = size & ~(page - 1);
+  if (rounded <= KMP_SIZE_T_MAX - page) {
+    rounded += page; // Round up if there is no overflow.
+  }
+  return rounded;
+} // __kmp_round4k
+#endif
+
+/* Converts a time specification to milliseconds. The string is a
+   floating-point number optionally followed by one unit character:
+   s/S (seconds), m/M (minutes), h/H (hours) or d/D (days); with no unit the
+   number is taken as milliseconds. Returns INT_MAX only if the string matches
+   "infinit*". Returns -1 if the string is NULL, negative, has an unknown
+   unit or has trailing characters after the unit. Large finite values are
+   capped at INT_MAX - 1. */
+int __kmp_convert_to_milliseconds(char const *data) {
+  double amount = (double)0.0;
+  char unit = '\0';
+  char trailing;
+  int scale;
+
+  if (data == NULL)
+    return -1;
+  if (__kmp_str_match("infinit", -1, data))
+    return INT_MAX;
+
+  const int fields = KMP_SSCANF(data, "%lf%c%c", &amount, &unit, &trailing);
+  // Reject input with no number, or with anything after the unit character.
+  if (fields < 1 || fields == 3)
+    return -1;
+  if (fields == 1)
+    unit = '\0';
+  if (amount < 0)
+    return -1;
+
+  switch (unit) {
+  case '\0':
+    scale = 1; /*  default is milliseconds  */
+    break;
+  case 's':
+  case 'S':
+    scale = 1000;
+    break;
+  case 'm':
+  case 'M':
+    scale = 1000 * 60;
+    break;
+  case 'h':
+  case 'H':
+    scale = 1000 * 60 * 60;
+    break;
+  case 'd':
+  case 'D':
+    scale = 1000 * 24 * 60 * 60;
+    break;
+  default:
+    return -1;
+  }
+
+  /* Don't allow infinite value here */
+  if (amount >= ((INT_MAX - 1) / scale))
+    return INT_MAX - 1;
+  return (int)(amount * (double)scale); /* truncate to int  */
+}
+
+// Compares `a` against the part of `b` that precedes `sentinel` (or the end
+// of `b`), ignoring the case of ASCII letters. A NULL pointer is treated as
+// an empty string. Returns 0 when they are equal, 1 when `a` is longer, -1
+// when `b` is longer, and on a mismatch the difference between the two
+// original (unfolded) characters as unsigned char values.
+static int __kmp_strcasecmp_with_sentinel(char const *a, char const *b,
+                                          char sentinel) {
+  char const *pa = (a != NULL) ? a : "";
+  char const *pb = (b != NULL) ? b : "";
+
+  for (; *pa && *pb && *pb != sentinel; ++pa, ++pb) {
+    char ua = *pa;
+    char ub = *pb;
+
+    // Fold ASCII lower-case letters to upper case before comparing.
+    if ('a' <= ua && ua <= 'z')
+      ua -= 'a' - 'A';
+    if ('a' <= ub && ub <= 'z')
+      ub -= 'a' - 'A';
+    if (ua != ub)
+      return (int)(unsigned char)*pa - (int)(unsigned char)*pb;
+  }
+
+  const bool b_has_more = (*pb != '\0') && (*pb != sentinel);
+  if (*pa != '\0') {
+    if (b_has_more)
+      return (int)(unsigned char)*pa - (int)(unsigned char)*pb;
+    return 1;
+  }
+  if (b_has_more)
+    return -1;
+  return 0;
+}
+
+// =============================================================================
+// Table structures and helper functions.
+
+// Short names for the settings-table record and for the per-setting data
+// records defined below.
+typedef struct __kmp_setting kmp_setting_t;
+typedef struct __kmp_stg_ss_data kmp_stg_ss_data_t;
+typedef struct __kmp_stg_wp_data kmp_stg_wp_data_t;
+typedef struct __kmp_stg_fr_data kmp_stg_fr_data_t;
+
+// Parser callback: receives the variable name, its value string and the
+// setting's `data` pointer.
+typedef void (*kmp_stg_parse_func_t)(char const *name, char const *value,
+                                     void *data);
+// Printer callback: receives the output buffer, the variable name and the
+// setting's `data` pointer.
+typedef void (*kmp_stg_print_func_t)(kmp_str_buf_t *buffer, char const *name,
+                                     void *data);
+
+// One setting record: an environment variable name together with its parse
+// and print callbacks, the opaque data handed to both, and two flags tracking
+// whether the variable has been set.
+struct __kmp_setting {
+  char const *name; // Name of setting (environment variable).
+  kmp_stg_parse_func_t parse; // Parser function.
+  kmp_stg_print_func_t print; // Print function.
+  void *data; // Data passed to parser and printer.
+  int set; // Variable set during this "session"
+  //     (__kmp_env_initialize() or kmp_set_defaults() call).
+  int defined; // Variable set in any "session".
+}; // struct __kmp_setting
+
+// Data record pairing a default size factor with the list of rival settings;
+// the `factor` comment names KMP_STACKSIZE as the factor-1 case.
+struct __kmp_stg_ss_data {
+  size_t factor; // Default factor: 1 for KMP_STACKSIZE, 1024 for others.
+  kmp_setting_t **rivals; // Array of pointers to rivals (including itself).
+}; // struct __kmp_stg_ss_data
+
+// Data record that tells KMP_LIBRARY apart from OMP_WAIT_POLICY (see `omp`)
+// and lists the rival settings.
+struct __kmp_stg_wp_data {
+  int omp; // 0 -- KMP_LIBRARY, 1 -- OMP_WAIT_POLICY.
+  kmp_setting_t **rivals; // Array of pointers to rivals (including itself).
+}; // struct __kmp_stg_wp_data
+
+// Data record that tells KMP_DETERMINISTIC_REDUCTION apart from
+// KMP_FORCE_REDUCTION (see `force`) and lists the rival settings.
+struct __kmp_stg_fr_data {
+  int force; // 0 -- KMP_DETERMINISTIC_REDUCTION, 1 -- KMP_FORCE_REDUCTION.
+  kmp_setting_t **rivals; // Array of pointers to rivals (including itself).
+}; // struct __kmp_stg_fr_data
+
+// Forward declaration; the function is static, so its definition has to
+// appear later in this translation unit. Callers in this file stop parsing a
+// setting when the result is non-zero.
+static int __kmp_stg_check_rivals( // 0 -- Ok, 1 -- errors found.
+    char const *name, // Name of variable.
+    char const *value, // Value of the variable.
+    kmp_setting_t **rivals // List of rival settings (must include current one).
+    );
+
+// -----------------------------------------------------------------------------
+// Helper parse functions.
+
+// Parses `value` as a boolean into `*out`. A value recognized by
+// __kmp_str_match_true() stores TRUE, one recognized by
+// __kmp_str_match_false() stores FALSE; anything else leaves `*out`
+// untouched and issues a BadBoolValue warning for `name`.
+static void __kmp_stg_parse_bool(char const *name, char const *value,
+                                 int *out) {
+  if (__kmp_str_match_true(value)) {
+    *out = TRUE;
+    return;
+  }
+  if (__kmp_str_match_false(value)) {
+    *out = FALSE;
+    return;
+  }
+  __kmp_msg(kmp_ms_warning, KMP_MSG(BadBoolValue, name, value),
+            KMP_HNT(ValidBoolValues), __kmp_msg_null);
+} // __kmp_stg_parse_bool
+
+// Clamps the stack size in `*val` in place. Defined in this file so that it
+// can call the static helper __kmp_round4k.
+void __kmp_check_stksize(size_t *val) {
+  size_t stksize = *val;
+
+  // Cap an overly large system stack size for worker threads; the factor of
+  // 16 is just a heuristic.
+  if (stksize > KMP_DEFAULT_STKSIZE * 16)
+    stksize = KMP_DEFAULT_STKSIZE * 16;
+  if (stksize < KMP_MIN_STKSIZE)
+    stksize = KMP_MIN_STKSIZE;
+  // Dead code currently, but may work in future.
+  if (stksize > KMP_MAX_STKSIZE)
+    stksize = KMP_MAX_STKSIZE;
+#if KMP_OS_DARWIN
+  stksize = __kmp_round4k(stksize);
+#endif // KMP_OS_DARWIN
+  *val = stksize;
+}
+
+// Parses a size setting.
+//
+//   name          variable name, used in the warning/inform messages.
+//   value         string to parse; nothing happens when it is NULL.
+//   size_min/max  allowed range (rounded with __kmp_round4k on Darwin).
+//   is_specified  optional; set to 1 whenever `value` is non-NULL.
+//   out           receives the parsed, range-limited size.
+//   factor        passed through to __kmp_str_to_size().
+//
+// When the conversion or the range check produces a message, a
+// ParseSizeIntWarn warning and a Using_str_Value note showing the value that
+// will be used are emitted.
+static void __kmp_stg_parse_size(char const *name, char const *value,
+                                 size_t size_min, size_t size_max,
+                                 int *is_specified, size_t *out,
+                                 size_t factor) {
+  char const *msg = NULL;
+#if KMP_OS_DARWIN
+  size_min = __kmp_round4k(size_min);
+  size_max = __kmp_round4k(size_max);
+#endif // KMP_OS_DARWIN
+  if (value) {
+    if (is_specified != NULL) {
+      *is_specified = 1;
+    }
+    __kmp_str_to_size(value, out, factor, &msg);
+    if (msg == NULL) {
+      // Conversion reported no problem: enforce the allowed range.
+      if (*out > size_max) {
+        *out = size_max;
+        msg = KMP_I18N_STR(ValueTooLarge);
+      } else if (*out < size_min) {
+        *out = size_min;
+        msg = KMP_I18N_STR(ValueTooSmall);
+      } else {
+#if KMP_OS_DARWIN
+        // In-range values must additionally be a multiple of 4K on Darwin.
+        size_t round4k = __kmp_round4k(*out);
+        if (*out != round4k) {
+          *out = round4k;
+          msg = KMP_I18N_STR(NotMultiple4K);
+        }
+#endif
+      }
+    } else {
+      // If integer overflow occurred, * out == KMP_SIZE_T_MAX. Cut it to
+      // size_max without replacing the conversion message.
+      // NOTE(review): a value below size_min is also set to size_max here,
+      // not size_min -- confirm this is intended. The warning below is still
+      // printed on this path because msg is non-NULL.
+      if (*out < size_min) {
+        *out = size_max;
+      } else if (*out > size_max) {
+        *out = size_max;
+      }
+    }
+    if (msg != NULL) {
+      // Message is not empty. Print warning.
+      kmp_str_buf_t buf;
+      __kmp_str_buf_init(&buf);
+      __kmp_str_buf_print_size(&buf, *out);
+      KMP_WARNING(ParseSizeIntWarn, name, value, msg);
+      KMP_INFORM(Using_str_Value, name, buf.str);
+      __kmp_str_buf_free(&buf);
+    }
+  }
+} // __kmp_stg_parse_size
+
+// Replaces the string held in `*out` with a freshly formatted copy of
+// `value`. The previous string is released first; `name` is unused.
+static void __kmp_stg_parse_str(char const *name, char const *value,
+                                char **out) {
+  __kmp_str_free(out);
+  char *duplicate = __kmp_str_format("%s", value);
+  *out = duplicate;
+} // __kmp_stg_parse_str
+
+// Parses `value` as an unsigned integer and stores it, limited to
+// [min, max], in `*out`. A warning plus a note showing the value actually
+// used is printed when the conversion reports a problem or the parsed value
+// falls outside the range.
+static void __kmp_stg_parse_int(
+    char const
+        *name, // I: Name of environment variable (used in warning messages).
+    char const *value, // I: Value of environment variable to parse.
+    int min, // I: Minimal allowed value.
+    int max, // I: Maximum allowed value.
+    int *out // O: Output (parsed) value.
+    ) {
+  char const *problem = NULL;
+  kmp_uint64 parsed = *out;
+  __kmp_str_to_uint(value, &parsed, &problem);
+
+  // A range message is only attached when the conversion itself was clean;
+  // otherwise the conversion's own message is kept and the value (which is
+  // very big after an overflow) is just cut back into range.
+  const bool converted_cleanly = (problem == NULL);
+  if (parsed < (unsigned int)min) {
+    if (converted_cleanly)
+      problem = KMP_I18N_STR(ValueTooSmall);
+    parsed = min;
+  } else if (parsed > (unsigned int)max) {
+    if (converted_cleanly)
+      problem = KMP_I18N_STR(ValueTooLarge);
+    parsed = max;
+  }
+
+  if (problem != NULL) {
+    // There is something to report. Print warning.
+    kmp_str_buf_t shown;
+    KMP_WARNING(ParseSizeIntWarn, name, value, problem);
+    __kmp_str_buf_init(&shown);
+    __kmp_str_buf_print(&shown, "%" KMP_UINT64_SPEC "", parsed);
+    KMP_INFORM(Using_uint64_Value, name, shown.str);
+    __kmp_str_buf_free(&shown);
+  }
+  *out = parsed;
+} // __kmp_stg_parse_int
+
+#if KMP_DEBUG_ADAPTIVE_LOCKS
+// Builds a file name from `value` and stores a newly formatted copy in
+// `*out` (the previous string is released first). `suffix` is appended
+// unless the text from the last '.' in `value` already compares equal to it
+// via __kmp_str_eqf(). The result is passed through
+// __kmp_expand_file_name() with a 256-byte buffer. `name` is unused.
+static void __kmp_stg_parse_file(char const *name, char const *value,
+                                 const char *suffix, char **out) {
+  char expanded[256];
+  __kmp_str_free(out);
+
+  char *last_dot = (char *)strrchr(value, '.');
+  int already_suffixed = last_dot && __kmp_str_eqf(last_dot, suffix);
+
+  char *with_suffix =
+      __kmp_str_format("%s%s", value, already_suffixed ? "" : suffix);
+  __kmp_expand_file_name(expanded, sizeof(expanded), with_suffix);
+  __kmp_str_free(&with_suffix);
+
+  *out = __kmp_str_format("%s", expanded);
+} // __kmp_stg_parse_file
+#endif
+
+#ifdef KMP_DEBUG
+// Heap copy of the raw value string most recently handed to
+// __kmp_stg_parse_par_range(); presumably read by the matching print routine
+// (not visible in this part of the file).
+static char *par_range_to_print = NULL;
+
+// Parses a comma-separated list of key=value items from `value`.
+//
+// Recognized keys (compared case-insensitively):
+//   routine=<text>      copied to out_routine (at most
+//                       KMP_PAR_RANGE_ROUTINE_LEN - 1 characters)
+//   filename=<text>     copied to out_file (at most
+//                       KMP_PAR_RANGE_FILENAME_LEN - 1 characters)
+//   range=<lb>:<ub>     } store the bounds in *out_lb / *out_ub and set
+//   incl_range=<lb>:<ub>} *out_range to +1
+//   excl_range=<lb>:<ub>  same bounds, *out_range set to -1
+//
+// Before parsing, __kmp_par_range, __kmp_par_range_lb and __kmp_par_range_ub
+// are reset to +1, 0 and INT_MAX. On any syntax error (unknown key, missing
+// '=', empty text, malformed bounds) a ParRangeSyntax warning is issued for
+// `name`, __kmp_par_range is set to 0 and parsing stops.
+static void __kmp_stg_parse_par_range(char const *name, char const *value,
+                                      int *out_range, char *out_routine,
+                                      char *out_file, int *out_lb,
+                                      int *out_ub) {
+  size_t len = KMP_STRLEN(value) + 1;
+  par_range_to_print = (char *)KMP_INTERNAL_MALLOC(len + 1);
+  KMP_STRNCPY_S(par_range_to_print, len + 1, value, len + 1);
+  __kmp_par_range = +1;
+  __kmp_par_range_lb = 0;
+  __kmp_par_range_ub = INT_MAX;
+  for (;;) {
+    unsigned int nread;
+    // `value` is NULL once the final item (the one with no trailing ',') has
+    // been consumed, so it must be checked before it is dereferenced.
+    if (value == NULL || *value == '\0') {
+      break;
+    }
+    if (!__kmp_strcasecmp_with_sentinel("routine", value, '=')) {
+      // The key also matches when the string ends right after it, in which
+      // case there is no '=' to skip.
+      value = strchr(value, '=');
+      if (value == NULL) {
+        goto par_range_error;
+      }
+      value++;
+      nread = __kmp_readstr_with_sentinel(out_routine, value,
+                                          KMP_PAR_RANGE_ROUTINE_LEN - 1, ',');
+      if (nread == 0) {
+        goto par_range_error;
+      }
+      value = strchr(value, ',');
+      if (value != NULL) {
+        value++;
+      }
+      continue;
+    }
+    if (!__kmp_strcasecmp_with_sentinel("filename", value, '=')) {
+      value = strchr(value, '=');
+      if (value == NULL) {
+        goto par_range_error;
+      }
+      value++;
+      nread = __kmp_readstr_with_sentinel(out_file, value,
+                                          KMP_PAR_RANGE_FILENAME_LEN - 1, ',');
+      if (nread == 0) {
+        goto par_range_error;
+      }
+      value = strchr(value, ',');
+      if (value != NULL) {
+        value++;
+      }
+      continue;
+    }
+    if ((!__kmp_strcasecmp_with_sentinel("range", value, '=')) ||
+        (!__kmp_strcasecmp_with_sentinel("incl_range", value, '='))) {
+      value = strchr(value, '=');
+      if (value == NULL) {
+        goto par_range_error;
+      }
+      value++;
+      if (KMP_SSCANF(value, "%d:%d", out_lb, out_ub) != 2) {
+        goto par_range_error;
+      }
+      *out_range = +1;
+      value = strchr(value, ',');
+      if (value != NULL) {
+        value++;
+      }
+      continue;
+    }
+    if (!__kmp_strcasecmp_with_sentinel("excl_range", value, '=')) {
+      value = strchr(value, '=');
+      if (value == NULL) {
+        goto par_range_error;
+      }
+      value++;
+      if (KMP_SSCANF(value, "%d:%d", out_lb, out_ub) != 2) {
+        goto par_range_error;
+      }
+      *out_range = -1;
+      value = strchr(value, ',');
+      if (value != NULL) {
+        value++;
+      }
+      continue;
+    }
+  // Reached by an explicit goto, or by falling through when no key matched.
+  par_range_error:
+    KMP_WARNING(ParRangeSyntax, name);
+    __kmp_par_range = 0;
+    break;
+  }
+} // __kmp_stg_parse_par_range
+#endif
+
+/* Returns MIN( MAX( 32, 4 * req_nproc, 4 * __kmp_xproc ), __kmp_max_nth ).
+ * The original comment describes the two scaled terms as
+ * 4 * $OMP_NUM_THREADS and 4 * omp_get_num_procs(). */
+int __kmp_initial_threads_capacity(int req_nproc) {
+  int capacity = 32;
+  int candidate;
+
+  candidate = 4 * req_nproc;
+  if (capacity < candidate)
+    capacity = candidate;
+
+  candidate = 4 * __kmp_xproc;
+  if (capacity < candidate)
+    capacity = candidate;
+
+  return (capacity > __kmp_max_nth) ? __kmp_max_nth : capacity;
+}
+
+/* Returns `max_nth` directly when `all_threads_specified` is non-zero.
+ * Otherwise returns
+ * MIN( MAX( 128, 4 * req_nproc, 4 * __kmp_xproc ), __kmp_max_nth ); note that
+ * this upper limit is the global __kmp_max_nth, not the `max_nth` argument.
+ * The original comment describes the two scaled terms as
+ * 4 * $OMP_NUM_THREADS and 4 * omp_get_num_procs(). */
+int __kmp_default_tp_capacity(int req_nproc, int max_nth,
+                              int all_threads_specified) {
+  if (all_threads_specified)
+    return max_nth;
+
+  int capacity = 128;
+  int candidate;
+
+  candidate = 4 * req_nproc;
+  if (capacity < candidate)
+    capacity = candidate;
+
+  candidate = 4 * __kmp_xproc;
+  if (capacity < candidate)
+    capacity = candidate;
+
+  return (capacity > __kmp_max_nth) ? __kmp_max_nth : capacity;
+}
+
+// -----------------------------------------------------------------------------
+// Helper print functions.
+
+// Appends a boolean setting to `buffer`. The old format writes
+// "   name=true" or "   name=false"; the new format (__kmp_env_format set)
+// is produced by the KMP_STR_BUF_PRINT_BOOL macro.
+static void __kmp_stg_print_bool(kmp_str_buf_t *buffer, char const *name,
+                                 int value) {
+  if (!__kmp_env_format) {
+    char const *text = value ? "true" : "false";
+    __kmp_str_buf_print(buffer, "   %s=%s\n", name, text);
+  } else {
+    KMP_STR_BUF_PRINT_BOOL;
+  }
+} // __kmp_stg_print_bool
+
+// Appends an integer setting to `buffer`. The old format writes
+// "   name=<value>"; the new format (__kmp_env_format set) is produced by the
+// KMP_STR_BUF_PRINT_INT macro.
+static void __kmp_stg_print_int(kmp_str_buf_t *buffer, char const *name,
+                                int value) {
+  if (!__kmp_env_format) {
+    __kmp_str_buf_print(buffer, "   %s=%d\n", name, value);
+  } else {
+    KMP_STR_BUF_PRINT_INT;
+  }
+} // __kmp_stg_print_int
+
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+// Appends a 64-bit unsigned setting to `buffer`. The old format writes
+// "   name=<value>"; the new format (__kmp_env_format set) is produced by the
+// KMP_STR_BUF_PRINT_UINT64 macro.
+static void __kmp_stg_print_uint64(kmp_str_buf_t *buffer, char const *name,
+                                   kmp_uint64 value) {
+  if (!__kmp_env_format) {
+    __kmp_str_buf_print(buffer, "   %s=%" KMP_UINT64_SPEC "\n", name, value);
+  } else {
+    KMP_STR_BUF_PRINT_UINT64;
+  }
+} // __kmp_stg_print_uint64
+#endif
+
+// Appends a string setting to `buffer`. The old format writes
+// "   name=<value>"; the new format (__kmp_env_format set) is produced by the
+// KMP_STR_BUF_PRINT_STR macro.
+static void __kmp_stg_print_str(kmp_str_buf_t *buffer, char const *name,
+                                char const *value) {
+  if (!__kmp_env_format) {
+    __kmp_str_buf_print(buffer, "   %s=%s\n", name, value);
+  } else {
+    KMP_STR_BUF_PRINT_STR;
+  }
+} // __kmp_stg_print_str
+
+// Appends a size setting to `buffer`; the size itself is rendered by
+// __kmp_str_buf_print_size(). The old format writes "   name=<size>"; the
+// new format (__kmp_env_format set) starts with KMP_STR_BUF_PRINT_NAME_EX
+// and closes the value with a single quote.
+static void __kmp_stg_print_size(kmp_str_buf_t *buffer, char const *name,
+                                 size_t value) {
+  if (!__kmp_env_format) {
+    __kmp_str_buf_print(buffer, "   %s=", name);
+    __kmp_str_buf_print_size(buffer, value);
+    __kmp_str_buf_print(buffer, "\n");
+  } else {
+    KMP_STR_BUF_PRINT_NAME_EX(name);
+    __kmp_str_buf_print_size(buffer, value);
+    __kmp_str_buf_print(buffer, "'\n");
+  }
+} // __kmp_stg_print_size
+
+// =============================================================================
+// Parse and print functions.
+
+// -----------------------------------------------------------------------------
+// KMP_DEVICE_THREAD_LIMIT, KMP_ALL_THREADS
+
+// Parser shared by KMP_DEVICE_THREAD_LIMIT and KMP_ALL_THREADS; `data` is
+// the list of rival settings. The value "all" (any case) sets __kmp_max_nth
+// to __kmp_xproc and __kmp_allThreadsSpecified to 1; any other value is
+// parsed as an integer in [1, __kmp_sys_max_nth] and clears
+// __kmp_allThreadsSpecified.
+static void __kmp_stg_parse_device_thread_limit(char const *name,
+                                                char const *value, void *data) {
+  // KMP_ALL_THREADS is reported as deprecated in favor of the new name.
+  if (!strcmp(name, "KMP_ALL_THREADS")) {
+    KMP_INFORM(EnvVarDeprecated, name, "KMP_DEVICE_THREAD_LIMIT");
+  }
+  // A non-zero result from the rival check means errors were found.
+  if (__kmp_stg_check_rivals(name, value, (kmp_setting_t **)data)) {
+    return;
+  }
+  const bool use_all = !__kmp_strcasecmp_with_sentinel("all", value, 0);
+  if (use_all) {
+    __kmp_max_nth = __kmp_xproc;
+  } else {
+    __kmp_stg_parse_int(name, value, 1, __kmp_sys_max_nth, &__kmp_max_nth);
+  }
+  __kmp_allThreadsSpecified = use_all ? 1 : 0;
+  K_DIAG(1, ("__kmp_max_nth == %d\n", __kmp_max_nth));
+
+} // __kmp_stg_parse_device_thread_limit
+
+// Prints the current value of __kmp_max_nth under `name`; `data` is unused.
+static void __kmp_stg_print_device_thread_limit(kmp_str_buf_t *buffer,
+                                                char const *name, void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_max_nth);
+} // __kmp_stg_print_device_thread_limit
+
+// -----------------------------------------------------------------------------
+// OMP_THREAD_LIMIT
+// Parses `value` as an integer limited to [1, __kmp_sys_max_nth] and stores
+// it in __kmp_cg_max_nth; `data` is unused.
+static void __kmp_stg_parse_thread_limit(char const *name, char const *value,
+                                         void *data) {
+  __kmp_stg_parse_int(name, value, 1, __kmp_sys_max_nth, &__kmp_cg_max_nth);
+  K_DIAG(1, ("__kmp_cg_max_nth == %d\n", __kmp_cg_max_nth));
+
+} // __kmp_stg_parse_thread_limit
+
+// Prints the current value of __kmp_cg_max_nth under `name`; `data` is
+// unused.
+static void __kmp_stg_print_thread_limit(kmp_str_buf_t *buffer,
+                                         char const *name, void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_cg_max_nth);
+} // __kmp_stg_print_thread_limit
+
+// -----------------------------------------------------------------------------
+// KMP_TEAMS_THREAD_LIMIT
+// Parses `value` as an integer limited to [1, __kmp_sys_max_nth] and stores
+// it in __kmp_teams_max_nth; `data` is unused.
+static void __kmp_stg_parse_teams_thread_limit(char const *name,
+                                               char const *value, void *data) {
+  __kmp_stg_parse_int(name, value, 1, __kmp_sys_max_nth, &__kmp_teams_max_nth);
+} // __kmp_stg_parse_teams_thread_limit
+
+// Prints the current value of __kmp_teams_max_nth under `name`; `data` is
+// unused.
+static void __kmp_stg_print_teams_thread_limit(kmp_str_buf_t *buffer,
+                                               char const *name, void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_teams_max_nth);
+} // __kmp_stg_print_teams_thread_limit
+
+// -----------------------------------------------------------------------------
+// KMP_USE_YIELD
+// Parses `value` as an integer limited to [0, 2] into __kmp_use_yield and
+// records, via __kmp_use_yield_exp_set, that the variable was given
+// explicitly; `data` is unused.
+static void __kmp_stg_parse_use_yield(char const *name, char const *value,
+                                      void *data) {
+  __kmp_stg_parse_int(name, value, 0, 2, &__kmp_use_yield);
+  __kmp_use_yield_exp_set = 1;
+} // __kmp_stg_parse_use_yield
+
+// Prints the current value of __kmp_use_yield under `name`; `data` is unused.
+static void __kmp_stg_print_use_yield(kmp_str_buf_t *buffer, char const *name,
+                                      void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_use_yield);
+} // __kmp_stg_print_use_yield
+
+// -----------------------------------------------------------------------------
+// KMP_BLOCKTIME
+
+// Parses the blocktime setting. `value` is converted with
+// __kmp_convert_to_milliseconds() and stored in __kmp_dflt_blocktime. A valid
+// result is limited to [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME] (with a warning
+// when it had to be adjusted) and __kmp_env_blocktime is set to TRUE. An
+// invalid value is replaced by KMP_DEFAULT_BLOCKTIME, a warning is issued and
+// __kmp_env_blocktime is set to FALSE. `data` is unused.
+static void __kmp_stg_parse_blocktime(char const *name, char const *value,
+                                      void *data) {
+  __kmp_dflt_blocktime = __kmp_convert_to_milliseconds(value);
+  if (__kmp_dflt_blocktime >= 0) {
+    if (__kmp_dflt_blocktime < KMP_MIN_BLOCKTIME) {
+      __kmp_dflt_blocktime = KMP_MIN_BLOCKTIME;
+      __kmp_msg(kmp_ms_warning, KMP_MSG(SmallValue, name, value),
+                __kmp_msg_null);
+      KMP_INFORM(MinValueUsing, name, __kmp_dflt_blocktime);
+    } else if (__kmp_dflt_blocktime > KMP_MAX_BLOCKTIME) {
+      __kmp_dflt_blocktime = KMP_MAX_BLOCKTIME;
+      __kmp_msg(kmp_ms_warning, KMP_MSG(LargeValue, name, value),
+                __kmp_msg_null);
+      KMP_INFORM(MaxValueUsing, name, __kmp_dflt_blocktime);
+    }
+    __kmp_env_blocktime = TRUE; // KMP_BLOCKTIME was specified.
+  } else {
+    // The conversion rejected the string.
+    __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
+    __kmp_msg(kmp_ms_warning, KMP_MSG(InvalidValue, name, value),
+              __kmp_msg_null);
+    KMP_INFORM(Using_int_Value, name, __kmp_dflt_blocktime);
+    __kmp_env_blocktime = FALSE; // Revert to default as if var not set.
+  }
+#if KMP_USE_MONITOR
+  // Recompute the monitor thread wakeup count and the number of wakeup
+  // intervals that correspond to the blocktime.
+  __kmp_monitor_wakeups =
+      KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
+  __kmp_bt_intervals =
+      KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
+#endif
+  K_DIAG(1, ("__kmp_env_blocktime == %d\n", __kmp_env_blocktime));
+  if (__kmp_env_blocktime) {
+    K_DIAG(1, ("__kmp_dflt_blocktime == %d\n", __kmp_dflt_blocktime));
+  }
+} // __kmp_stg_parse_blocktime
+
+// Print handler: writes the current __kmp_dflt_blocktime value under `name`
+// into `buffer`. `data` is unused.
+static void __kmp_stg_print_blocktime(kmp_str_buf_t *buffer, char const *name,
+                                      void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_dflt_blocktime);
+} // __kmp_stg_print_blocktime
+
+// -----------------------------------------------------------------------------
+// KMP_DUPLICATE_LIB_OK
+
+// Parse handler for KMP_DUPLICATE_LIB_OK: parses `value` as a boolean into
+// __kmp_duplicate_library_ok. `data` is unused.
+static void __kmp_stg_parse_duplicate_lib_ok(char const *name,
+                                             char const *value, void *data) {
+  /* actually this variable is not supported, put here for compatibility with
+     earlier builds and for static/dynamic combination */
+  __kmp_stg_parse_bool(name, value, &__kmp_duplicate_library_ok);
+} // __kmp_stg_parse_duplicate_lib_ok
+
+// Print handler: writes __kmp_duplicate_library_ok as a boolean under `name`
+// into `buffer`. `data` is unused.
+static void __kmp_stg_print_duplicate_lib_ok(kmp_str_buf_t *buffer,
+                                             char const *name, void *data) {
+  __kmp_stg_print_bool(buffer, name, __kmp_duplicate_library_ok);
+} // __kmp_stg_print_duplicate_lib_ok
+
+// -----------------------------------------------------------------------------
+// KMP_INHERIT_FP_CONTROL
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+// Parse handler for KMP_INHERIT_FP_CONTROL (only compiled for x86/x86_64):
+// parses `value` as a boolean into __kmp_inherit_fp_control. `data` is unused.
+static void __kmp_stg_parse_inherit_fp_control(char const *name,
+                                               char const *value, void *data) {
+  __kmp_stg_parse_bool(name, value, &__kmp_inherit_fp_control);
+} // __kmp_stg_parse_inherit_fp_control
+
+// Print handler for KMP_INHERIT_FP_CONTROL. The value is only printed when
+// KMP_DEBUG is enabled; otherwise this function emits nothing.
+static void __kmp_stg_print_inherit_fp_control(kmp_str_buf_t *buffer,
+                                               char const *name, void *data) {
+#if KMP_DEBUG
+  __kmp_stg_print_bool(buffer, name, __kmp_inherit_fp_control);
+#endif /* KMP_DEBUG */
+} // __kmp_stg_print_inherit_fp_control
+
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+// Used for OMP_WAIT_POLICY / KMP_LIBRARY parsing: while this is NULL the
+// wait-policy parser treats KMP_BLOCKTIME as "not specified" and picks a
+// blocktime default itself. It is assigned outside the code shown here.
+static char const *blocktime_str = NULL;
+
+// -----------------------------------------------------------------------------
+// KMP_LIBRARY, OMP_WAIT_POLICY
+
+// Parse handler shared by KMP_LIBRARY and OMP_WAIT_POLICY. `data` points to a
+// kmp_stg_wp_data_t whose `omp` flag selects the OMP_WAIT_POLICY vocabulary
+// (ACTIVE / PASSIVE) over the KMP_LIBRARY one (serial / throughput /
+// turnaround / dedicated / multiuser) and whose `rivals` list is checked
+// first. Sets __kmp_library and, when KMP_BLOCKTIME was not specified
+// (blocktime_str == NULL), adjusts __kmp_dflt_blocktime for some policies.
+// Unrecognized values only produce a warning.
+static void __kmp_stg_parse_wait_policy(char const *name, char const *value,
+                                        void *data) {
+  kmp_stg_wp_data_t *policy = (kmp_stg_wp_data_t *)data;
+
+  // Nothing to do when the rival check returns non-zero.
+  if (__kmp_stg_check_rivals(name, value, policy->rivals)) {
+    return;
+  }
+
+  if (policy->omp) {
+    // OMP_WAIT_POLICY vocabulary.
+    if (__kmp_str_match("ACTIVE", 1, value)) {
+      __kmp_library = library_turnaround;
+      if (blocktime_str == NULL) {
+        // KMP_BLOCKTIME not specified, so set default to "infinite".
+        __kmp_dflt_blocktime = KMP_MAX_BLOCKTIME;
+      }
+      return;
+    }
+    if (__kmp_str_match("PASSIVE", 1, value)) {
+      __kmp_library = library_throughput;
+      if (blocktime_str == NULL) {
+        // KMP_BLOCKTIME not specified, so set default to 0.
+        __kmp_dflt_blocktime = 0;
+      }
+      return;
+    }
+    KMP_WARNING(StgInvalidValue, name, value);
+    return;
+  }
+
+  // KMP_LIBRARY vocabulary; the number passed to __kmp_str_match differs per
+  // keyword, as noted by the abbreviations in the trailing comments.
+  if (__kmp_str_match("serial", 1, value)) { /* S */
+    __kmp_library = library_serial;
+    return;
+  }
+  if (__kmp_str_match("throughput", 2, value)) { /* TH */
+    __kmp_library = library_throughput;
+    if (blocktime_str == NULL) {
+      // KMP_BLOCKTIME not specified, so set default to 0.
+      __kmp_dflt_blocktime = 0;
+    }
+    return;
+  }
+  if (__kmp_str_match("turnaround", 2, value)) { /* TU */
+    __kmp_library = library_turnaround;
+    return;
+  }
+  if (__kmp_str_match("dedicated", 1, value)) { /* D */
+    __kmp_library = library_turnaround;
+    return;
+  }
+  if (__kmp_str_match("multiuser", 1, value)) { /* M */
+    __kmp_library = library_throughput;
+    if (blocktime_str == NULL) {
+      // KMP_BLOCKTIME not specified, so set default to 0.
+      __kmp_dflt_blocktime = 0;
+    }
+    return;
+  }
+  KMP_WARNING(StgInvalidValue, name, value);
+} // __kmp_stg_parse_wait_policy
+
+// Print handler shared by KMP_LIBRARY and OMP_WAIT_POLICY. Maps __kmp_library
+// back to the keyword of the vocabulary selected by `data->omp` and prints it.
+// Nothing is printed when the current library mode has no keyword in that
+// vocabulary (e.g. library_serial for OMP_WAIT_POLICY).
+static void __kmp_stg_print_wait_policy(kmp_str_buf_t *buffer, char const *name,
+                                        void *data) {
+  kmp_stg_wp_data_t *policy = (kmp_stg_wp_data_t *)data;
+  char const *keyword = NULL;
+
+  if (__kmp_library == library_turnaround) {
+    keyword = policy->omp ? "ACTIVE" : "turnaround";
+  } else if (__kmp_library == library_throughput) {
+    keyword = policy->omp ? "PASSIVE" : "throughput";
+  } else if (__kmp_library == library_serial) {
+    // Only the KMP_LIBRARY vocabulary has a word for the serial mode.
+    if (!policy->omp) {
+      keyword = "serial";
+    }
+  }
+
+  if (keyword == NULL) {
+    return;
+  }
+  __kmp_stg_print_str(buffer, name, keyword);
+} // __kmp_stg_print_wait_policy
+
+#if KMP_USE_MONITOR
+// -----------------------------------------------------------------------------
+// KMP_MONITOR_STACKSIZE
+
+// Parse handler for KMP_MONITOR_STACKSIZE: parses `value` as a size bounded by
+// __kmp_sys_min_stksize and KMP_MAX_STKSIZE into __kmp_monitor_stksize, with
+// a factor of 1. `data` is unused.
+static void __kmp_stg_parse_monitor_stacksize(char const *name,
+                                              char const *value, void *data) {
+  __kmp_stg_parse_size(name, value, __kmp_sys_min_stksize, KMP_MAX_STKSIZE,
+                       NULL, &__kmp_monitor_stksize, 1);
+} // __kmp_stg_parse_monitor_stacksize
+
+// Print handler for KMP_MONITOR_STACKSIZE.
+//
+// When a positive monitor stack size is set, the output follows the same
+// layout as __kmp_stg_print_stacksize: "<name>='<size>'" in env format and
+// "   <name>=<size>" otherwise, each terminated by a newline. (Previously the
+// plain format emitted the name immediately followed by the size, with no '='
+// and no trailing newline, and the closing quote was keyed on "!= 0" while the
+// opening quote was keyed on "> 0".)
+// When no size is set, "<name>: <NotDefined>" is printed. `data` is unused.
+static void __kmp_stg_print_monitor_stacksize(kmp_str_buf_t *buffer,
+                                              char const *name, void *data) {
+  if (__kmp_monitor_stksize > 0) {
+    if (__kmp_env_format) {
+      KMP_STR_BUF_PRINT_NAME_EX(name);
+    } else {
+      __kmp_str_buf_print(buffer, "   %s=", name);
+    }
+    __kmp_str_buf_print_size(buffer, __kmp_monitor_stksize);
+    if (__kmp_env_format) {
+      __kmp_str_buf_print(buffer, "'\n"); // close the quote opened above
+    } else {
+      __kmp_str_buf_print(buffer, "\n");
+    }
+  } else {
+    if (__kmp_env_format) {
+      KMP_STR_BUF_PRINT_NAME;
+    } else {
+      __kmp_str_buf_print(buffer, "   %s", name);
+    }
+    __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined));
+  }
+} // __kmp_stg_print_monitor_stacksize
+#endif // KMP_USE_MONITOR
+
+// -----------------------------------------------------------------------------
+// KMP_SETTINGS
+
+// Parse handler for KMP_SETTINGS: parses `value` as a boolean into
+// __kmp_settings. `data` is unused.
+static void __kmp_stg_parse_settings(char const *name, char const *value,
+                                     void *data) {
+  __kmp_stg_parse_bool(name, value, &__kmp_settings);
+} // __kmp_stg_parse_settings
+
+// Print handler: writes __kmp_settings as a boolean under `name` into
+// `buffer`. `data` is unused.
+static void __kmp_stg_print_settings(kmp_str_buf_t *buffer, char const *name,
+                                     void *data) {
+  __kmp_stg_print_bool(buffer, name, __kmp_settings);
+} // __kmp_stg_print_settings
+
+// -----------------------------------------------------------------------------
+// KMP_STACKPAD
+
+// Parse handler for KMP_STACKPAD: parses `value` as an integer bounded by
+// KMP_MIN_STKPADDING and KMP_MAX_STKPADDING into __kmp_stkpadding.
+// `data` is unused.
+static void __kmp_stg_parse_stackpad(char const *name, char const *value,
+                                     void *data) {
+  __kmp_stg_parse_int(name, // Env var name
+                      value, // Env var value
+                      KMP_MIN_STKPADDING, // Min value
+                      KMP_MAX_STKPADDING, // Max value
+                      &__kmp_stkpadding // Var to initialize
+                      );
+} // __kmp_stg_parse_stackpad
+
+// Print handler: writes the current __kmp_stkpadding value under `name` into
+// `buffer`. `data` is unused.
+static void __kmp_stg_print_stackpad(kmp_str_buf_t *buffer, char const *name,
+                                     void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_stkpadding);
+} // __kmp_stg_print_stackpad
+
+// -----------------------------------------------------------------------------
+// KMP_STACKOFFSET
+
+// Parse handler for KMP_STACKOFFSET: parses `value` as a size bounded by
+// KMP_MIN_STKOFFSET and KMP_MAX_STKOFFSET into __kmp_stkoffset, with a factor
+// of 1. `data` is unused.
+static void __kmp_stg_parse_stackoffset(char const *name, char const *value,
+                                        void *data) {
+  __kmp_stg_parse_size(name, // Env var name
+                       value, // Env var value
+                       KMP_MIN_STKOFFSET, // Min value
+                       KMP_MAX_STKOFFSET, // Max value
+                       NULL, //
+                       &__kmp_stkoffset, // Var to initialize
+                       1);
+} // __kmp_stg_parse_stackoffset
+
+// Print handler: writes the current __kmp_stkoffset as a size under `name`
+// into `buffer`. `data` is unused.
+static void __kmp_stg_print_stackoffset(kmp_str_buf_t *buffer, char const *name,
+                                        void *data) {
+  __kmp_stg_print_size(buffer, name, __kmp_stkoffset);
+} // __kmp_stg_print_stackoffset
+
+// -----------------------------------------------------------------------------
+// KMP_STACKSIZE, OMP_STACKSIZE, GOMP_STACKSIZE
+
+// Parse handler shared by KMP_STACKSIZE, OMP_STACKSIZE and GOMP_STACKSIZE.
+// `data` points to a kmp_stg_ss_data_t that supplies the rival list and the
+// factor passed to the size parser. The parsed size, bounded by
+// __kmp_sys_min_stksize and KMP_MAX_STKSIZE, is stored in __kmp_stksize;
+// __kmp_env_stksize is handed to the parser as well.
+static void __kmp_stg_parse_stacksize(char const *name, char const *value,
+                                      void *data) {
+  kmp_stg_ss_data_t *ss = (kmp_stg_ss_data_t *)data;
+
+  // Leave the setting alone when the rival check returns non-zero.
+  if (__kmp_stg_check_rivals(name, value, ss->rivals)) {
+    return;
+  }
+
+  __kmp_stg_parse_size(name, value, __kmp_sys_min_stksize, KMP_MAX_STKSIZE,
+                       &__kmp_env_stksize, &__kmp_stksize, ss->factor);
+} // __kmp_stg_parse_stacksize
+
+// This function is called for printing both KMP_STACKSIZE (factor is 1) and
+// OMP_STACKSIZE (factor is 1024). Currently it is not possible to print
+// OMP_STACKSIZE value in bytes. We can consider adding this possibility by a
+// customer request in future.
+// Print handler for the stack size variables. `data` points to the
+// kmp_stg_ss_data_t carrying the variable's factor. The printed size is
+// __kmp_stksize itself when it is a multiple of 1024, otherwise
+// __kmp_stksize divided by the factor. In env format the value is wrapped in
+// single quotes; otherwise it is printed as "   <name>=<size>".
+static void __kmp_stg_print_stacksize(kmp_str_buf_t *buffer, char const *name,
+                                      void *data) {
+  kmp_stg_ss_data_t *ss = (kmp_stg_ss_data_t *)data;
+
+  // Prefix: name plus opening quote (env format) or plain "name=".
+  if (__kmp_env_format) {
+    KMP_STR_BUF_PRINT_NAME_EX(name);
+  } else {
+    __kmp_str_buf_print(buffer, "   %s=", name);
+  }
+
+  // The size itself is formatted identically in both modes.
+  __kmp_str_buf_print_size(buffer, (__kmp_stksize % 1024)
+                                       ? __kmp_stksize / ss->factor
+                                       : __kmp_stksize);
+
+  // Suffix: closing quote (env format only) and newline.
+  if (__kmp_env_format) {
+    __kmp_str_buf_print(buffer, "'\n");
+  } else {
+    __kmp_str_buf_print(buffer, "\n");
+  }
+} // __kmp_stg_print_stacksize
+
+// -----------------------------------------------------------------------------
+// KMP_VERSION
+
+// Parse handler for KMP_VERSION: parses `value` as a boolean into
+// __kmp_version. `data` is unused.
+static void __kmp_stg_parse_version(char const *name, char const *value,
+                                    void *data) {
+  __kmp_stg_parse_bool(name, value, &__kmp_version);
+} // __kmp_stg_parse_version
+
+// Print handler: writes __kmp_version as a boolean under `name` into
+// `buffer`. `data` is unused.
+static void __kmp_stg_print_version(kmp_str_buf_t *buffer, char const *name,
+                                    void *data) {
+  __kmp_stg_print_bool(buffer, name, __kmp_version);
+} // __kmp_stg_print_version
+
+// -----------------------------------------------------------------------------
+// KMP_WARNINGS
+
+// Parse handler for KMP_WARNINGS: parses `value` as a boolean into
+// __kmp_generate_warnings, then rewrites any result other than
+// kmp_warnings_off to kmp_warnings_explicit. `data` is unused.
+static void __kmp_stg_parse_warnings(char const *name, char const *value,
+                                     void *data) {
+  __kmp_stg_parse_bool(name, value, &__kmp_generate_warnings);
+  if (__kmp_generate_warnings != kmp_warnings_off) {
+    // AC: only 0/1 values documented, so reset to explicit to distinguish from
+    // default setting
+    __kmp_generate_warnings = kmp_warnings_explicit;
+  }
+} // __kmp_stg_parse_warnings
+
+// Print handler: writes __kmp_generate_warnings as a boolean under `name`
+// into `buffer`. `data` is unused.
+static void __kmp_stg_print_warnings(kmp_str_buf_t *buffer, char const *name,
+                                     void *data) {
+  // AC: TODO: change to print_int? (needs documentation change)
+  __kmp_stg_print_bool(buffer, name, __kmp_generate_warnings);
+} // __kmp_stg_print_warnings
+
+// -----------------------------------------------------------------------------
+// OMP_NESTED, OMP_NUM_THREADS
+
+// Parse handler for the deprecated OMP_NESTED variable. Always emits the
+// deprecation notice pointing at OMP_MAX_ACTIVE_LEVELS, then:
+//   - true:  raises __kmp_dflt_max_active_levels to
+//            KMP_MAX_ACTIVE_LEVELS_LIMIT unless max-active-levels was already
+//            set explicitly;
+//   - false: forces __kmp_dflt_max_active_levels to 1 and marks it as set.
+// If the boolean parser does not produce a value (the local keeps its
+// sentinel), the setting is ignored instead of acting on an indeterminate
+// local. `data` is unused.
+static void __kmp_stg_parse_nested(char const *name, char const *value,
+                                   void *data) {
+  int nested = -1; // sentinel: stays negative if the parser writes nothing
+  KMP_INFORM(EnvVarDeprecated, name, "OMP_MAX_ACTIVE_LEVELS");
+  __kmp_stg_parse_bool(name, value, &nested);
+  if (nested < 0) {
+    // No boolean was parsed; leave max-active-levels untouched.
+    return;
+  }
+  if (nested) {
+    if (!__kmp_dflt_max_active_levels_set)
+      __kmp_dflt_max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
+  } else { // nesting explicitly turned off
+    __kmp_dflt_max_active_levels = 1;
+    __kmp_dflt_max_active_levels_set = true;
+  }
+} // __kmp_stg_parse_nested
+
+// Print handler for OMP_NESTED: prints the name followed by a note that the
+// variable is deprecated, together with the current
+// __kmp_dflt_max_active_levels value. `data` is unused.
+static void __kmp_stg_print_nested(kmp_str_buf_t *buffer, char const *name,
+                                   void *data) {
+  if (__kmp_env_format) {
+    KMP_STR_BUF_PRINT_NAME;
+  } else {
+    __kmp_str_buf_print(buffer, "   %s", name);
+  }
+  __kmp_str_buf_print(buffer, ": deprecated; max-active-levels-var=%d\n",
+                      __kmp_dflt_max_active_levels);
+} // __kmp_stg_print_nested
+
+// Parses a comma-separated list of thread counts, one per nesting level, from
+// `env` (the value of the variable named `var`) into `nth_array`.
+//
+// The string is walked twice:
+//   1. validate the syntax and count the entries, so the array can be sized;
+//   2. convert the entries and store them.
+// Empty positions are allowed: a leading comma stores a placeholder 0 for the
+// first level, and consecutive commas repeat the previous level's value.
+// Each number is clamped to [KMP_MIN_NTH, __kmp_sys_max_nth] with a warning.
+// On a syntax error a warning is issued and `nth_array` is left untouched.
+// A list with more than one entry also raises __kmp_dflt_max_active_levels to
+// KMP_MAX_ACTIVE_LEVELS_LIMIT unless max-active-levels was set explicitly.
+static void __kmp_parse_nested_num_threads(const char *var, const char *env,
+                                           kmp_nested_nthreads_t *nth_array) {
+  const char *next = env;
+  const char *scan = next;
+
+  int total = 0; // Count elements that were set. It'll be used as an array size
+  int prev_comma = FALSE; // For correct processing sequential commas
+
+  // Pass 1: count the number of values in the env. var string
+  for (;;) {
+    SKIP_WS(next);
+
+    if (*next == '\0') {
+      break;
+    }
+    // Next character is not an integer or not a comma => end of list
+    if (((*next < '0') || (*next > '9')) && (*next != ',')) {
+      KMP_WARNING(NthSyntaxError, var, env);
+      return;
+    }
+    // The next character is ','
+    if (*next == ',') {
+      // ',' is the first character, or it directly follows another ','
+      if (total == 0 || prev_comma) {
+        total++;
+      }
+      prev_comma = TRUE;
+      next++; // skip ','
+      SKIP_WS(next);
+    }
+    // Next character is a digit
+    if (*next >= '0' && *next <= '9') {
+      prev_comma = FALSE;
+      SKIP_DIGITS(next);
+      total++;
+      // Two numbers separated only by whitespace ("4 5") are rejected.
+      const char *tmp = next;
+      SKIP_WS(tmp);
+      if ((*next == ' ' || *next == '\t') && (*tmp >= '0' && *tmp <= '9')) {
+        KMP_WARNING(NthSpacesNotAllowed, var, env);
+        return;
+      }
+    }
+  }
+  // An empty or whitespace-only value yields no entries. Report it with a
+  // warning in every build type; a debug assertion used to precede this check
+  // and made debug builds abort where release builds only warned.
+  if (total <= 0) {
+    KMP_WARNING(NthSyntaxError, var, env);
+    return;
+  }
+  if (!__kmp_dflt_max_active_levels_set && total > 1)
+    __kmp_dflt_max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
+
+  // Check if the nested nthreads array exists
+  if (!nth_array->nth) {
+    // Allocate an array of double size
+    nth_array->nth = (int *)KMP_INTERNAL_MALLOC(sizeof(int) * total * 2);
+    if (nth_array->nth == NULL) {
+      KMP_FATAL(MemoryAllocFailed);
+    }
+    nth_array->size = total * 2;
+  } else {
+    if (nth_array->size < total) {
+      // Increase the array size
+      do {
+        nth_array->size *= 2;
+      } while (nth_array->size < total);
+
+      nth_array->nth = (int *)KMP_INTERNAL_REALLOC(
+          nth_array->nth, sizeof(int) * nth_array->size);
+      if (nth_array->nth == NULL) {
+        KMP_FATAL(MemoryAllocFailed);
+      }
+    }
+  }
+  nth_array->used = total;
+  int i = 0;
+
+  prev_comma = FALSE;
+  total = 0;
+  // Pass 2: save values in the array
+  for (;;) {
+    SKIP_WS(scan);
+    if (*scan == '\0') {
+      break;
+    }
+    // The next character is ','
+    if (*scan == ',') {
+      // ',' in the beginning of the list
+      if (total == 0) {
+        // The value is supposed to be equal to __kmp_avail_proc but it is
+        // unknown at the moment.
+        // So let's put a placeholder (#threads = 0) to correct it later.
+        nth_array->nth[i++] = 0;
+        total++;
+      } else if (prev_comma) {
+        // Num threads is inherited from the previous level
+        nth_array->nth[i] = nth_array->nth[i - 1];
+        i++;
+        total++;
+      }
+      prev_comma = TRUE;
+      scan++; // skip ','
+      SKIP_WS(scan);
+    }
+    // Next character is a digit
+    if (*scan >= '0' && *scan <= '9') {
+      int num;
+      const char *buf = scan;
+      char const *msg = NULL;
+      prev_comma = FALSE;
+      SKIP_DIGITS(scan);
+      total++;
+
+      // Convert the digits in [buf, scan); *scan is passed as the terminator.
+      num = __kmp_str_to_int(buf, *scan);
+      if (num < KMP_MIN_NTH) {
+        msg = KMP_I18N_STR(ValueTooSmall);
+        num = KMP_MIN_NTH;
+      } else if (num > __kmp_sys_max_nth) {
+        msg = KMP_I18N_STR(ValueTooLarge);
+        num = __kmp_sys_max_nth;
+      }
+      if (msg != NULL) {
+        // Message is not empty. Print warning.
+        KMP_WARNING(ParseSizeIntWarn, var, env, msg);
+        KMP_INFORM(Using_int_Value, var, num);
+      }
+      nth_array->nth[i++] = num;
+    }
+  }
+}
+
+// Parse handler for OMP_NUM_THREADS.
+//   - "all" (case-insensitive): a one-element list holding __kmp_xproc, which
+//     is also stored in __kmp_dflt_team_nth and __kmp_dflt_team_nth_ub.
+//   - otherwise: the value is parsed as a nested thread-count list into
+//     __kmp_nested_nth; its first entry becomes __kmp_dflt_team_nth and the
+//     upper bound is raised to match if it was smaller.
+// `data` is unused.
+static void __kmp_stg_parse_num_threads(char const *name, char const *value,
+                                        void *data) {
+  // TODO: Remove this option. OMP_NUM_THREADS is a list of positive integers!
+  if (!__kmp_strcasecmp_with_sentinel("all", value, 0)) {
+    // The array of 1 element. Storage left by an earlier parse is reused
+    // instead of being overwritten (which leaked it); a fresh allocation is
+    // checked the same way __kmp_parse_nested_num_threads checks its own.
+    if (__kmp_nested_nth.nth == NULL) {
+      __kmp_nested_nth.nth = (int *)KMP_INTERNAL_MALLOC(sizeof(int));
+      if (__kmp_nested_nth.nth == NULL) {
+        KMP_FATAL(MemoryAllocFailed);
+      }
+      __kmp_nested_nth.size = 1;
+    }
+    __kmp_nested_nth.used = 1;
+    __kmp_nested_nth.nth[0] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
+        __kmp_xproc;
+  } else {
+    __kmp_parse_nested_num_threads(name, value, &__kmp_nested_nth);
+    if (__kmp_nested_nth.nth) {
+      __kmp_dflt_team_nth = __kmp_nested_nth.nth[0];
+      if (__kmp_dflt_team_nth_ub < __kmp_dflt_team_nth) {
+        __kmp_dflt_team_nth_ub = __kmp_dflt_team_nth;
+      }
+    }
+  }
+  K_DIAG(1, ("__kmp_dflt_team_nth == %d\n", __kmp_dflt_team_nth));
+} // __kmp_stg_parse_num_threads
+
+// Print handler for OMP_NUM_THREADS: prints the name followed by the nested
+// thread-count list as a quoted, comma-separated string, or by the
+// "NotDefined" text when the list is empty. `data` is unused.
+static void __kmp_stg_print_num_threads(kmp_str_buf_t *buffer, char const *name,
+                                        void *data) {
+  if (__kmp_env_format) {
+    KMP_STR_BUF_PRINT_NAME;
+  } else {
+    __kmp_str_buf_print(buffer, "   %s", name);
+  }
+
+  if (!__kmp_nested_nth.used) {
+    __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined));
+    return;
+  }
+
+  // Assemble "n0,n1,..." in a scratch buffer, emitting the separator before
+  // every element except the first.
+  kmp_str_buf_t list;
+  __kmp_str_buf_init(&list);
+  for (int level = 0; level < __kmp_nested_nth.used; level++) {
+    if (level > 0) {
+      __kmp_str_buf_print(&list, ",");
+    }
+    __kmp_str_buf_print(&list, "%d", __kmp_nested_nth.nth[level]);
+  }
+  __kmp_str_buf_print(buffer, "='%s'\n", list.str);
+  __kmp_str_buf_free(&list);
+} // __kmp_stg_print_num_threads
+
+// -----------------------------------------------------------------------------
+// OpenMP 3.0: KMP_TASKING, OMP_MAX_ACTIVE_LEVELS,
+
+// Parse handler for KMP_TASKING: parses `value` as an integer bounded by 0
+// and tskm_max into __kmp_tasking_mode (written through an int pointer cast).
+// `data` is unused.
+static void __kmp_stg_parse_tasking(char const *name, char const *value,
+                                    void *data) {
+  __kmp_stg_parse_int(name, value, 0, (int)tskm_max,
+                      (int *)&__kmp_tasking_mode);
+} // __kmp_stg_parse_tasking
+
+// Print handler: writes the current __kmp_tasking_mode as an integer under
+// `name` into `buffer`. `data` is unused.
+static void __kmp_stg_print_tasking(kmp_str_buf_t *buffer, char const *name,
+                                    void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_tasking_mode);
+} // __kmp_stg_print_tasking
+
+// Parse handler for the task-stealing constraint setting: parses `value` as
+// an integer bounded by 0 and 1 into __kmp_task_stealing_constraint (written
+// through an int pointer cast). `data` is unused.
+static void __kmp_stg_parse_task_stealing(char const *name, char const *value,
+                                          void *data) {
+  __kmp_stg_parse_int(name, value, 0, 1,
+                      (int *)&__kmp_task_stealing_constraint);
+} // __kmp_stg_parse_task_stealing
+
+// Print handler: writes the current __kmp_task_stealing_constraint as an
+// integer under `name` into `buffer`. `data` is unused.
+static void __kmp_stg_print_task_stealing(kmp_str_buf_t *buffer,
+                                          char const *name, void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_task_stealing_constraint);
+} // __kmp_stg_print_task_stealing
+
+// Parse handler for OMP_MAX_ACTIVE_LEVELS. Does nothing if max-active-levels
+// has already been set. Otherwise the value is parsed as an unsigned integer;
+// a parse error or a value above KMP_MAX_ACTIVE_LEVELS_LIMIT produces a
+// warning and leaves __kmp_dflt_max_active_levels unchanged. A valid value is
+// stored and the "set" flag is raised. `data` is unused.
+static void __kmp_stg_parse_max_active_levels(char const *name,
+                                              char const *value, void *data) {
+  kmp_uint64 requested = 0;
+  char const *problem = NULL;
+
+  if (__kmp_dflt_max_active_levels_set) {
+    return; // an earlier setting wins
+  }
+
+  // Parse into a temporary so an invalid setting never overwrites the default.
+  __kmp_str_to_uint(value, &requested, &problem);
+  if (problem != NULL) {
+    // invalid setting; print warning and ignore
+    KMP_WARNING(ParseSizeIntWarn, name, value, problem);
+    return;
+  }
+  if (requested > KMP_MAX_ACTIVE_LEVELS_LIMIT) {
+    // invalid setting; print warning and ignore
+    problem = KMP_I18N_STR(ValueTooLarge);
+    KMP_WARNING(ParseSizeIntWarn, name, value, problem);
+    return;
+  }
+
+  // valid setting
+  __kmp_dflt_max_active_levels = requested;
+  __kmp_dflt_max_active_levels_set = true;
+} // __kmp_stg_parse_max_active_levels
+
+// Print handler: writes the current __kmp_dflt_max_active_levels value under
+// `name` into `buffer`. `data` is unused.
+static void __kmp_stg_print_max_active_levels(kmp_str_buf_t *buffer,
+                                              char const *name, void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_dflt_max_active_levels);
+} // __kmp_stg_print_max_active_levels
+
+// -----------------------------------------------------------------------------
+// OpenMP 4.0: OMP_DEFAULT_DEVICE
+// Parse handler for OMP_DEFAULT_DEVICE: parses `value` as an integer bounded
+// by 0 and KMP_MAX_DEFAULT_DEVICE_LIMIT into __kmp_default_device.
+// `data` is unused.
+static void __kmp_stg_parse_default_device(char const *name, char const *value,
+                                           void *data) {
+  __kmp_stg_parse_int(name, value, 0, KMP_MAX_DEFAULT_DEVICE_LIMIT,
+                      &__kmp_default_device);
+} // __kmp_stg_parse_default_device
+
+// Print handler: writes the current __kmp_default_device value under `name`
+// into `buffer`. `data` is unused.
+static void __kmp_stg_print_default_device(kmp_str_buf_t *buffer,
+                                           char const *name, void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_default_device);
+} // __kmp_stg_print_default_device
+
+// -----------------------------------------------------------------------------
+// OpenMP 5.0: OMP_TARGET_OFFLOAD
+// Parse handler for OMP_TARGET_OFFLOAD. Resets __kmp_target_offload to
+// tgt_default, skips leading whitespace and then matches the remainder
+// case-insensitively against "mandatory", "disabled" and "default". An empty
+// value keeps the default silently; any other text keeps the default and
+// issues a warning. `data` is unused.
+static void __kmp_stg_parse_target_offload(char const *name, char const *value,
+                                           void *data) {
+  const char *text = value;
+
+  __kmp_target_offload = tgt_default;
+
+  SKIP_WS(text);
+  if (*text == '\0') {
+    return;
+  }
+
+  if (!__kmp_strcasecmp_with_sentinel("mandatory", text, 0)) {
+    __kmp_target_offload = tgt_mandatory;
+    return;
+  }
+  if (!__kmp_strcasecmp_with_sentinel("disabled", text, 0)) {
+    __kmp_target_offload = tgt_disabled;
+    return;
+  }
+  if (!__kmp_strcasecmp_with_sentinel("default", text, 0)) {
+    __kmp_target_offload = tgt_default;
+    return;
+  }
+  KMP_WARNING(SyntaxErrorUsing, name, "DEFAULT");
+} // __kmp_stg_parse_target_offload
+
+// Print handler for OMP_TARGET_OFFLOAD: maps __kmp_target_offload to
+// "DEFAULT", "MANDATORY" or "DISABLED" and prints "<name>=<text>".
+// `data` is unused.
+static void __kmp_stg_print_target_offload(kmp_str_buf_t *buffer,
+                                           char const *name, void *data) {
+  const char *value = NULL;
+  if (__kmp_target_offload == tgt_default)
+    value = "DEFAULT";
+  else if (__kmp_target_offload == tgt_mandatory)
+    value = "MANDATORY";
+  else if (__kmp_target_offload == tgt_disabled)
+    value = "DISABLED";
+  // One of the three branches above is expected to have matched.
+  KMP_DEBUG_ASSERT(value);
+  if (__kmp_env_format) {
+    KMP_STR_BUF_PRINT_NAME;
+  } else {
+    __kmp_str_buf_print(buffer, "   %s", name);
+  }
+  __kmp_str_buf_print(buffer, "=%s\n", value);
+} // __kmp_stg_print_target_offload
+
+// -----------------------------------------------------------------------------
+// OpenMP 4.5: OMP_MAX_TASK_PRIORITY
+// Parse handler for OMP_MAX_TASK_PRIORITY: parses `value` as an integer
+// bounded by 0 and KMP_MAX_TASK_PRIORITY_LIMIT into __kmp_max_task_priority.
+// `data` is unused.
+static void __kmp_stg_parse_max_task_priority(char const *name,
+                                              char const *value, void *data) {
+  __kmp_stg_parse_int(name, value, 0, KMP_MAX_TASK_PRIORITY_LIMIT,
+                      &__kmp_max_task_priority);
+} // __kmp_stg_parse_max_task_priority
+
+// Print handler: writes the current __kmp_max_task_priority value under
+// `name` into `buffer`. `data` is unused.
+static void __kmp_stg_print_max_task_priority(kmp_str_buf_t *buffer,
+                                              char const *name, void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_max_task_priority);
+} // __kmp_stg_print_max_task_priority
+
+// KMP_TASKLOOP_MIN_TASKS
+// Taskloop threshold at which task creation switches from recursive to linear.
+// Parse handler for KMP_TASKLOOP_MIN_TASKS: parses `value` as an integer
+// bounded by 0 and INT_MAX and stores it in __kmp_taskloop_min_tasks.
+// The temporary starts at 0 (as __kmp_stg_parse_debug does with its local) so
+// that an indeterminate value is never passed to the parser or copied into
+// the global if the parser leaves it unwritten. `data` is unused.
+static void __kmp_stg_parse_taskloop_min_tasks(char const *name,
+                                               char const *value, void *data) {
+  int tmp = 0;
+  __kmp_stg_parse_int(name, value, 0, INT_MAX, &tmp);
+  __kmp_taskloop_min_tasks = tmp;
+} // __kmp_stg_parse_taskloop_min_tasks
+
+// Print handler: writes the current __kmp_taskloop_min_tasks value as an
+// integer under `name` into `buffer`. `data` is unused.
+static void __kmp_stg_print_taskloop_min_tasks(kmp_str_buf_t *buffer,
+                                               char const *name, void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_taskloop_min_tasks);
+} // __kmp_stg_print_taskloop_min_tasks
+
+// -----------------------------------------------------------------------------
+// KMP_DISP_NUM_BUFFERS
+// Parse handler for KMP_DISP_NUM_BUFFERS. If __kmp_init_serial is already set
+// the value is rejected with a warning; otherwise it is parsed as an integer
+// bounded by 1 and KMP_MAX_NTH into __kmp_dispatch_num_buffers.
+// `data` is unused.
+static void __kmp_stg_parse_disp_buffers(char const *name, char const *value,
+                                         void *data) {
+  if (TCR_4(__kmp_init_serial)) {
+    KMP_WARNING(EnvSerialWarn, name);
+    return;
+  } // read value before serial initialization only
+  __kmp_stg_parse_int(name, value, 1, KMP_MAX_NTH, &__kmp_dispatch_num_buffers);
+} // __kmp_stg_parse_disp_buffers
+
+// Print handler: writes the current __kmp_dispatch_num_buffers value under
+// `name` into `buffer`. `data` is unused.
+static void __kmp_stg_print_disp_buffers(kmp_str_buf_t *buffer,
+                                         char const *name, void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_dispatch_num_buffers);
+} // __kmp_stg_print_disp_buffers
+
+#if KMP_NESTED_HOT_TEAMS
+// -----------------------------------------------------------------------------
+// KMP_HOT_TEAMS_MAX_LEVEL, KMP_HOT_TEAMS_MODE
+
+// Parse handler for KMP_HOT_TEAMS_MAX_LEVEL. If __kmp_init_parallel is already
+// set the value is rejected with a warning; otherwise it is parsed as an
+// integer bounded by 0 and KMP_MAX_ACTIVE_LEVELS_LIMIT into
+// __kmp_hot_teams_max_level. `data` is unused.
+static void __kmp_stg_parse_hot_teams_level(char const *name, char const *value,
+                                            void *data) {
+  if (TCR_4(__kmp_init_parallel)) {
+    KMP_WARNING(EnvParallelWarn, name);
+    return;
+  } // read value before first parallel only
+  __kmp_stg_parse_int(name, value, 0, KMP_MAX_ACTIVE_LEVELS_LIMIT,
+                      &__kmp_hot_teams_max_level);
+} // __kmp_stg_parse_hot_teams_level
+
+// Print handler: writes the current __kmp_hot_teams_max_level value under
+// `name` into `buffer`. `data` is unused.
+static void __kmp_stg_print_hot_teams_level(kmp_str_buf_t *buffer,
+                                            char const *name, void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_hot_teams_max_level);
+} // __kmp_stg_print_hot_teams_level
+
+// Parse handler for KMP_HOT_TEAMS_MODE. If __kmp_init_parallel is already set
+// the value is rejected with a warning; otherwise it is parsed as an integer
+// into __kmp_hot_teams_mode.
+// NOTE(review): the upper bound is KMP_MAX_ACTIVE_LEVELS_LIMIT, the same one
+// used for the hot-teams level -- confirm this is the intended range for the
+// mode.
+static void __kmp_stg_parse_hot_teams_mode(char const *name, char const *value,
+                                           void *data) {
+  if (TCR_4(__kmp_init_parallel)) {
+    KMP_WARNING(EnvParallelWarn, name);
+    return;
+  } // read value before first parallel only
+  __kmp_stg_parse_int(name, value, 0, KMP_MAX_ACTIVE_LEVELS_LIMIT,
+                      &__kmp_hot_teams_mode);
+} // __kmp_stg_parse_hot_teams_mode
+
+// Print handler: writes the current __kmp_hot_teams_mode value under `name`
+// into `buffer`. `data` is unused.
+static void __kmp_stg_print_hot_teams_mode(kmp_str_buf_t *buffer,
+                                           char const *name, void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_hot_teams_mode);
+} // __kmp_stg_print_hot_teams_mode
+
+#endif // KMP_NESTED_HOT_TEAMS
+
+// -----------------------------------------------------------------------------
+// KMP_HANDLE_SIGNALS
+
+#if KMP_HANDLE_SIGNALS
+
+// Parse handler for KMP_HANDLE_SIGNALS: parses `value` as a boolean into
+// __kmp_handle_signals. `data` is unused.
+static void __kmp_stg_parse_handle_signals(char const *name, char const *value,
+                                           void *data) {
+  __kmp_stg_parse_bool(name, value, &__kmp_handle_signals);
+} // __kmp_stg_parse_handle_signals
+
+// Print handler: writes __kmp_handle_signals as a boolean under `name` into
+// `buffer`. `data` is unused.
+static void __kmp_stg_print_handle_signals(kmp_str_buf_t *buffer,
+                                           char const *name, void *data) {
+  __kmp_stg_print_bool(buffer, name, __kmp_handle_signals);
+} // __kmp_stg_print_handle_signals
+
+#endif // KMP_HANDLE_SIGNALS
+
+// -----------------------------------------------------------------------------
+// KMP_X_DEBUG, KMP_DEBUG, KMP_DEBUG_BUF_*, KMP_DIAG
+
+#ifdef KMP_DEBUG
+
+// KMP_STG_X_DEBUG(x) generates the parse/print handler pair for one debug
+// level variable kmp_<x>_debug:
+//   __kmp_stg_parse_<x>_debug parses an integer bounded by 0 and INT_MAX into
+//   it, and __kmp_stg_print_<x>_debug prints its current value.
+// The macro is instantiated for x = a..f below and then undefined.
+#define KMP_STG_X_DEBUG(x)                                                     \
+  static void __kmp_stg_parse_##x##_debug(char const *name, char const *value, \
+                                          void *data) {                        \
+    __kmp_stg_parse_int(name, value, 0, INT_MAX, &kmp_##x##_debug);            \
+  } /* __kmp_stg_parse_x_debug */                                              \
+  static void __kmp_stg_print_##x##_debug(kmp_str_buf_t *buffer,               \
+                                          char const *name, void *data) {      \
+    __kmp_stg_print_int(buffer, name, kmp_##x##_debug);                        \
+  } /* __kmp_stg_print_x_debug */
+
+KMP_STG_X_DEBUG(a)
+KMP_STG_X_DEBUG(b)
+KMP_STG_X_DEBUG(c)
+KMP_STG_X_DEBUG(d)
+KMP_STG_X_DEBUG(e)
+KMP_STG_X_DEBUG(f)
+
+#undef KMP_STG_X_DEBUG
+
+// Parse handler for KMP_DEBUG: parses `value` as an integer bounded by 0 and
+// INT_MAX and raises each of kmp_a_debug .. kmp_f_debug to that level. A
+// variable already above the parsed level keeps its value. `data` is unused.
+static void __kmp_stg_parse_debug(char const *name, char const *value,
+                                  void *data) {
+  int level = 0;
+  __kmp_stg_parse_int(name, value, 0, INT_MAX, &level);
+
+  // Apply the same "raise to at least `level`" rule to every debug class.
+  int *const per_class[] = {&kmp_a_debug, &kmp_b_debug, &kmp_c_debug,
+                            &kmp_d_debug, &kmp_e_debug, &kmp_f_debug};
+  for (unsigned k = 0; k < sizeof(per_class) / sizeof(per_class[0]); ++k) {
+    if (*per_class[k] < level) {
+      *per_class[k] = level;
+    }
+  }
+} // __kmp_stg_parse_debug
+
+// Parse handler for KMP_DEBUG_BUF: parses `value` as a boolean into
+// __kmp_debug_buf. When enabled, it allocates the debug buffer of
+// __kmp_debug_buf_lines * __kmp_debug_buf_chars bytes, marks every line empty
+// by writing a NUL at the start of each line, and resets __kmp_debug_count.
+// The buffer size is computed in size_t: both factors may be configured as
+// large as INT_MAX, so their product can exceed the range of int.
+// `data` is unused.
+static void __kmp_stg_parse_debug_buf(char const *name, char const *value,
+                                      void *data) {
+  __kmp_stg_parse_bool(name, value, &__kmp_debug_buf);
+  // !!! TODO: Move buffer initialization out of this file! It may work
+  // incorrectly if KMP_DEBUG_BUF is parsed before KMP_DEBUG_BUF_LINES or
+  // KMP_DEBUG_BUF_CHARS.
+  if (__kmp_debug_buf) {
+    size_t const line_len = (size_t)__kmp_debug_buf_chars;
+    size_t const elements = (size_t)__kmp_debug_buf_lines * line_len;
+
+    /* allocate and initialize all entries in debug buffer to empty */
+    __kmp_debug_buffer = (char *)__kmp_page_allocate(elements * sizeof(char));
+    for (size_t i = 0; i < elements; i += line_len)
+      __kmp_debug_buffer[i] = '\0';
+
+    __kmp_debug_count = 0;
+  }
+  K_DIAG(1, ("__kmp_debug_buf = %d\n", __kmp_debug_buf));
+} // __kmp_stg_parse_debug_buf
+
+// Print handler: writes __kmp_debug_buf as a boolean under `name` into
+// `buffer`. `data` is unused.
+static void __kmp_stg_print_debug_buf(kmp_str_buf_t *buffer, char const *name,
+                                      void *data) {
+  __kmp_stg_print_bool(buffer, name, __kmp_debug_buf);
+} // __kmp_stg_print_debug_buf
+
+// Parse handler for KMP_DEBUG_BUF_ATOMIC: parses `value` as a boolean into
+// __kmp_debug_buf_atomic. `data` is unused.
+static void __kmp_stg_parse_debug_buf_atomic(char const *name,
+                                             char const *value, void *data) {
+  __kmp_stg_parse_bool(name, value, &__kmp_debug_buf_atomic);
+} // __kmp_stg_parse_debug_buf_atomic
+
+// Print handler: writes __kmp_debug_buf_atomic as a boolean under `name` into
+// `buffer`. `data` is unused.
+static void __kmp_stg_print_debug_buf_atomic(kmp_str_buf_t *buffer,
+                                             char const *name, void *data) {
+  __kmp_stg_print_bool(buffer, name, __kmp_debug_buf_atomic);
+} // __kmp_stg_print_debug_buf_atomic
+
+// Parse handler for KMP_DEBUG_BUF_CHARS: parses `value` as an integer bounded
+// by KMP_DEBUG_BUF_CHARS_MIN and INT_MAX into __kmp_debug_buf_chars.
+// `data` is unused.
+static void __kmp_stg_parse_debug_buf_chars(char const *name, char const *value,
+                                            void *data) {
+  __kmp_stg_parse_int(name, value, KMP_DEBUG_BUF_CHARS_MIN, INT_MAX,
+                      &__kmp_debug_buf_chars);
+} // __kmp_stg_parse_debug_buf_chars
+
+// Print handler: writes the current __kmp_debug_buf_chars value under `name`
+// into `buffer`. `data` is unused.
+static void __kmp_stg_print_debug_buf_chars(kmp_str_buf_t *buffer,
+                                            char const *name, void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_debug_buf_chars);
+} // __kmp_stg_print_debug_buf_chars
+
+// Parse handler for KMP_DEBUG_BUF_LINES: parses `value` as an integer bounded
+// by KMP_DEBUG_BUF_LINES_MIN and INT_MAX into __kmp_debug_buf_lines.
+// `data` is unused.
+static void __kmp_stg_parse_debug_buf_lines(char const *name, char const *value,
+                                            void *data) {
+  __kmp_stg_parse_int(name, value, KMP_DEBUG_BUF_LINES_MIN, INT_MAX,
+                      &__kmp_debug_buf_lines);
+} // __kmp_stg_parse_debug_buf_lines
+
+// Print handler: writes the current __kmp_debug_buf_lines value under `name`
+// into `buffer`. `data` is unused.
+static void __kmp_stg_print_debug_buf_lines(kmp_str_buf_t *buffer,
+                                            char const *name, void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_debug_buf_lines);
+} // __kmp_stg_print_debug_buf_lines
+
+// Parse handler for KMP_DIAG: parses `value` as an integer bounded by 0 and
+// INT_MAX into kmp_diag. `data` is unused.
+static void __kmp_stg_parse_diag(char const *name, char const *value,
+                                 void *data) {
+  __kmp_stg_parse_int(name, value, 0, INT_MAX, &kmp_diag);
+} // __kmp_stg_parse_diag
+
+// Print handler: writes the current kmp_diag value under `name` into
+// `buffer`. `data` is unused.
+static void __kmp_stg_print_diag(kmp_str_buf_t *buffer, char const *name,
+                                 void *data) {
+  __kmp_stg_print_int(buffer, name, kmp_diag);
+} // __kmp_stg_print_diag
+
+#endif // KMP_DEBUG
+
+// -----------------------------------------------------------------------------
+// KMP_ALIGN_ALLOC
+
+// Parse handler for KMP_ALIGN_ALLOC: parses `value` as a size bounded by
+// CACHE_LINE and INT_MAX into __kmp_align_alloc, with a factor of 1.
+// `data` is unused.
+static void __kmp_stg_parse_align_alloc(char const *name, char const *value,
+                                        void *data) {
+  __kmp_stg_parse_size(name, value, CACHE_LINE, INT_MAX, NULL,
+                       &__kmp_align_alloc, 1);
+} // __kmp_stg_parse_align_alloc
+
+// Print handler: writes the current __kmp_align_alloc as a size under `name`
+// into `buffer`. `data` is unused.
+static void __kmp_stg_print_align_alloc(kmp_str_buf_t *buffer, char const *name,
+                                        void *data) {
+  __kmp_stg_print_size(buffer, name, __kmp_align_alloc);
+} // __kmp_stg_print_align_alloc
+
+// -----------------------------------------------------------------------------
+// KMP_PLAIN_BARRIER, KMP_FORKJOIN_BARRIER, KMP_REDUCTION_BARRIER
+
+// TODO: Remove __kmp_barrier_branch_bit_env_name variable, remove loops from
+// parse and print functions, pass required info through data argument.
+
+// Parser shared by the barrier branch-bit variables. `name` is compared with
+// every entry of __kmp_barrier_branch_bit_env_name; for the matching barrier
+// the value is read as "<gather>[,<release>]". A missing release part selects
+// __kmp_barrier_release_bb_dflt, and any part above KMP_MAX_BRANCH_BITS is
+// replaced by its default after a warning. `data` is unused.
+static void __kmp_stg_parse_barrier_branch_bit(char const *name,
+                                               char const *value, void *data) {
+  /* ---------- Barrier branch bit control ------------ */
+  for (int bar = bs_plain_barrier; bar < bs_last_barrier; ++bar) {
+    char const *env_name = __kmp_barrier_branch_bit_env_name[bar];
+
+    if ((value != NULL) && (strcmp(env_name, name) == 0)) {
+      // Text after the first ',' (if any) is the release parameter.
+      char const *release_text = strchr(value, ',');
+      if (release_text != NULL) {
+        ++release_text; // step past ','
+      }
+
+      __kmp_barrier_gather_branch_bits[bar] =
+          (kmp_uint32)__kmp_str_to_int(value, ',');
+
+      if (release_text != NULL) {
+        __kmp_barrier_release_branch_bits[bar] =
+            (kmp_uint32)__kmp_str_to_int(release_text, 0);
+
+        if (__kmp_barrier_release_branch_bits[bar] > KMP_MAX_BRANCH_BITS) {
+          __kmp_msg(kmp_ms_warning,
+                    KMP_MSG(BarrReleaseValueInvalid, name, release_text),
+                    __kmp_msg_null);
+          __kmp_barrier_release_branch_bits[bar] =
+              __kmp_barrier_release_bb_dflt;
+        }
+      } else {
+        /* no release parameter was specified */
+        __kmp_barrier_release_branch_bits[bar] = __kmp_barrier_release_bb_dflt;
+      }
+
+      // The gather value is range-checked last, after the release handling.
+      if (__kmp_barrier_gather_branch_bits[bar] > KMP_MAX_BRANCH_BITS) {
+        KMP_WARNING(BarrGatherValueInvalid, name, value);
+        KMP_INFORM(Using_uint_Value, name, __kmp_barrier_gather_bb_dflt);
+        __kmp_barrier_gather_branch_bits[bar] = __kmp_barrier_gather_bb_dflt;
+      }
+    }
+
+    // Diagnostic dump runs for every barrier, matched or not.
+    K_DIAG(1, ("%s == %d,%d\n", env_name,
+               __kmp_barrier_gather_branch_bits[bar],
+               __kmp_barrier_release_branch_bits[bar]))
+  }
+} // __kmp_stg_parse_barrier_branch_bit
+
+// Printer shared by the barrier branch-bit variables: for each barrier whose
+// environment name equals `name`, appends "<gather>,<release>" to `buffer`
+// using the prefix style selected by __kmp_env_format. `data` is unused.
+static void __kmp_stg_print_barrier_branch_bit(kmp_str_buf_t *buffer,
+                                               char const *name, void *data) {
+  for (int bar = bs_plain_barrier; bar < bs_last_barrier; ++bar) {
+    char const *env_name = __kmp_barrier_branch_bit_env_name[bar];
+    if (strcmp(env_name, name) != 0) {
+      continue; // not the variable being printed
+    }
+
+    if (!__kmp_env_format) {
+      __kmp_str_buf_print(buffer, "   %s='", env_name);
+    } else {
+      KMP_STR_BUF_PRINT_NAME_EX(env_name);
+    }
+    __kmp_str_buf_print(buffer, "%d,%d'\n",
+                        __kmp_barrier_gather_branch_bits[bar],
+                        __kmp_barrier_release_branch_bits[bar]);
+  }
+} // __kmp_stg_print_barrier_branch_bit
+
+// ----------------------------------------------------------------------------
+// KMP_PLAIN_BARRIER_PATTERN, KMP_FORKJOIN_BARRIER_PATTERN,
+// KMP_REDUCTION_BARRIER_PATTERN
+
+// TODO: Remove __kmp_barrier_pattern_name variable, remove loops from parse and
+// print functions, pass required data to functions through data argument.
+
+// Parser shared by the barrier pattern variables. `name` is compared with
+// every entry of __kmp_barrier_pattern_env_name; for the matching barrier the
+// value is read as "<gather pattern>[,<release pattern>]" and each part is
+// looked up in __kmp_barrier_pattern_name. A part that matches no pattern
+// leaves the stored pattern unchanged; a warning is issued and the pattern
+// that stays in effect is reported. `data` is unused.
+static void __kmp_stg_parse_barrier_pattern(char const *name, char const *value,
+                                            void *data) {
+  const char *var;
+  /* ---------- Barrier method control ------------ */
+
+  for (int i = bs_plain_barrier; i < bs_last_barrier; i++) {
+    var = __kmp_barrier_pattern_env_name[i];
+
+    if ((strcmp(var, name) == 0) && (value != 0)) {
+      int j;
+      char *comma = CCAST(char *, strchr(value, ','));
+
+      /* handle first parameter: gather pattern */
+      for (j = bp_linear_bar; j < bp_last_bar; j++) {
+        if (__kmp_match_with_sentinel(__kmp_barrier_pattern_name[j], value, 1,
+                                      ',')) {
+          __kmp_barrier_gather_pattern[i] = (kmp_bar_pat_e)j;
+          break;
+        }
+      }
+      if (j == bp_last_bar) {
+        // Nothing was stored above, so name the pattern that is really kept
+        // rather than unconditionally claiming the linear pattern.
+        KMP_WARNING(BarrGatherValueInvalid, name, value);
+        KMP_INFORM(Using_str_Value, name,
+                   __kmp_barrier_pattern_name[__kmp_barrier_gather_pattern[i]]);
+      }
+
+      /* handle second parameter: release pattern */
+      if (comma != NULL) {
+        for (j = bp_linear_bar; j < bp_last_bar; j++) {
+          if (__kmp_str_match(__kmp_barrier_pattern_name[j], 1, comma + 1)) {
+            __kmp_barrier_release_pattern[i] = (kmp_bar_pat_e)j;
+            break;
+          }
+        }
+        if (j == bp_last_bar) {
+          // Same as for the gather part: report the value actually retained.
+          __kmp_msg(kmp_ms_warning,
+                    KMP_MSG(BarrReleaseValueInvalid, name, comma + 1),
+                    __kmp_msg_null);
+          KMP_INFORM(
+              Using_str_Value, name,
+              __kmp_barrier_pattern_name[__kmp_barrier_release_pattern[i]]);
+        }
+      }
+    }
+  }
+} // __kmp_stg_parse_barrier_pattern
+
+// Printer shared by the barrier pattern variables: for each barrier whose
+// environment name equals `name`, appends "<gather>,<release>" pattern names
+// to `buffer` using the prefix style selected by __kmp_env_format. `data` is
+// unused.
+static void __kmp_stg_print_barrier_pattern(kmp_str_buf_t *buffer,
+                                            char const *name, void *data) {
+  for (int bar = bs_plain_barrier; bar < bs_last_barrier; ++bar) {
+    char const *env_name = __kmp_barrier_pattern_env_name[bar];
+    if (strcmp(env_name, name) != 0) {
+      continue; // not the variable being printed
+    }
+
+    int gather = __kmp_barrier_gather_pattern[bar];
+    int release = __kmp_barrier_release_pattern[bar];
+
+    if (!__kmp_env_format) {
+      __kmp_str_buf_print(buffer, "   %s='", env_name);
+    } else {
+      KMP_STR_BUF_PRINT_NAME_EX(env_name);
+    }
+    __kmp_str_buf_print(buffer, "%s,%s'\n", __kmp_barrier_pattern_name[gather],
+                        __kmp_barrier_pattern_name[release]);
+  }
+} // __kmp_stg_print_barrier_pattern
+
+// -----------------------------------------------------------------------------
+// KMP_ABORT_DELAY
+// Settings parser for KMP_ABORT_DELAY: reads seconds, stores milliseconds.
+static void __kmp_stg_parse_abort_delay(char const *name, char const *value,
+                                        void *data) {
+  // Units of KMP_ABORT_DELAY are seconds, units of __kmp_abort_delay is
+  // milliseconds.
+  int delay = __kmp_abort_delay / 1000; // current setting, in seconds
+  __kmp_stg_parse_int(name, value, 0, INT_MAX / 1000, &delay);
+  __kmp_abort_delay = delay * 1000; // back to milliseconds
+} // __kmp_stg_parse_abort_delay
+
+// Settings printer for KMP_ABORT_DELAY. `data` is unused.
+static void __kmp_stg_print_abort_delay(kmp_str_buf_t *buffer, char const *name,
+                                        void *data) {
+  // KMP_ABORT_DELAY is expressed in seconds while __kmp_abort_delay holds
+  // milliseconds (the parser multiplies the parsed value by 1000), so convert
+  // back before printing; otherwise the displayed setting is 1000x too large.
+  __kmp_stg_print_int(buffer, name, __kmp_abort_delay / 1000);
+} // __kmp_stg_print_abort_delay
+
+// -----------------------------------------------------------------------------
+// KMP_CPUINFO_FILE
+// Settings parser for KMP_CPUINFO_FILE; empty body without affinity support.
+static void __kmp_stg_parse_cpuinfo_file(char const *name, char const *value,
+                                         void *data) {
+#if KMP_AFFINITY_SUPPORTED
+  __kmp_stg_parse_str(name, value, &__kmp_cpuinfo_file); // data is unused
+  K_DIAG(1, ("__kmp_cpuinfo_file == %s\n", __kmp_cpuinfo_file));
+#endif
+} //__kmp_stg_parse_cpuinfo_file
+
+// Settings printer for KMP_CPUINFO_FILE. With affinity support it appends the
+// variable name followed by either ='<path>' or the localized "not defined"
+// text; without affinity support it prints nothing. `data` is unused.
+static void __kmp_stg_print_cpuinfo_file(kmp_str_buf_t *buffer,
+                                         char const *name, void *data) {
+#if KMP_AFFINITY_SUPPORTED
+  // Name prefix, in the style selected by __kmp_env_format.
+  if (!__kmp_env_format) {
+    __kmp_str_buf_print(buffer, "   %s", name);
+  } else {
+    KMP_STR_BUF_PRINT_NAME;
+  }
+
+  // Value part.
+  if (!__kmp_cpuinfo_file) {
+    __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined));
+    return;
+  }
+  __kmp_str_buf_print(buffer, "='%s'\n", __kmp_cpuinfo_file);
+#endif
+} //__kmp_stg_print_cpuinfo_file
+
+// -----------------------------------------------------------------------------
+// KMP_FORCE_REDUCTION, KMP_DETERMINISTIC_REDUCTION
+
+// Parser shared by KMP_FORCE_REDUCTION and KMP_DETERMINISTIC_REDUCTION.
+// `data` points to a kmp_stg_fr_data_t: its `rivals` list is checked first
+// (parsing stops if the check reports a conflict), and its `force` flag
+// selects the mode. In force mode the value must be "critical", "atomic" or
+// "tree" (anything else is fatal); otherwise the value is a boolean stored in
+// __kmp_determ_red, which maps to the tree method or to "not defined".
+static void __kmp_stg_parse_force_reduction(char const *name, char const *value,
+                                            void *data) {
+  kmp_stg_fr_data_t *reduction = (kmp_stg_fr_data_t *)data;
+
+  if (__kmp_stg_check_rivals(name, value, reduction->rivals)) {
+    return;
+  }
+
+  if (!reduction->force) {
+    // Deterministic-reduction flavour: a plain boolean.
+    __kmp_stg_parse_bool(name, value, &__kmp_determ_red);
+    if (!__kmp_determ_red) {
+      __kmp_force_reduction_method = reduction_method_not_defined;
+    } else {
+      __kmp_force_reduction_method = tree_reduce_block;
+    }
+  } else if (value != NULL) {
+    // Forced-method flavour: pick the method by keyword.
+    if (__kmp_str_match("critical", 0, value)) {
+      __kmp_force_reduction_method = critical_reduce_block;
+    } else if (__kmp_str_match("atomic", 0, value)) {
+      __kmp_force_reduction_method = atomic_reduce_block;
+    } else if (__kmp_str_match("tree", 0, value)) {
+      __kmp_force_reduction_method = tree_reduce_block;
+    } else {
+      KMP_FATAL(UnknownForceReduction, name, value);
+    }
+  }
+
+  K_DIAG(1, ("__kmp_force_reduction_method == %d\n",
+             __kmp_force_reduction_method));
+} // __kmp_stg_parse_force_reduction
+
+// Printer shared by KMP_FORCE_REDUCTION and KMP_DETERMINISTIC_REDUCTION.
+// `data` points to a kmp_stg_fr_data_t whose `force` flag selects the output:
+// the forced method keyword (or the localized "not defined" text when no known
+// method is set), or the boolean __kmp_determ_red.
+static void __kmp_stg_print_force_reduction(kmp_str_buf_t *buffer,
+                                            char const *name, void *data) {
+  kmp_stg_fr_data_t *reduction = (kmp_stg_fr_data_t *)data;
+
+  if (!reduction->force) {
+    __kmp_stg_print_bool(buffer, name, __kmp_determ_red);
+    return;
+  }
+
+  // Translate the forced method into its keyword, if it has one.
+  char const *keyword = NULL;
+  if (__kmp_force_reduction_method == critical_reduce_block) {
+    keyword = "critical";
+  } else if (__kmp_force_reduction_method == atomic_reduce_block) {
+    keyword = "atomic";
+  } else if (__kmp_force_reduction_method == tree_reduce_block) {
+    keyword = "tree";
+  }
+
+  if (keyword != NULL) {
+    __kmp_stg_print_str(buffer, name, keyword);
+    return;
+  }
+
+  // No recognised method: print the name followed by "not defined".
+  if (!__kmp_env_format) {
+    __kmp_str_buf_print(buffer, "   %s", name);
+  } else {
+    KMP_STR_BUF_PRINT_NAME;
+  }
+  __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined));
+} // __kmp_stg_print_force_reduction
+
+// -----------------------------------------------------------------------------
+// KMP_STORAGE_MAP
+
+// Settings parser for KMP_STORAGE_MAP. A value matching "verbose" turns on
+// the storage map together with both verbose flags; any other value clears
+// __kmp_storage_map_verbose and is parsed as a boolean into
+// __kmp_storage_map. `data` is unused.
+static void __kmp_stg_parse_storage_map(char const *name, char const *value,
+                                        void *data) {
+  if (!__kmp_str_match("verbose", 1, value)) {
+    // Note: __kmp_storage_map_verbose_specified is left as it was here.
+    __kmp_storage_map_verbose = FALSE;
+    __kmp_stg_parse_bool(name, value, &__kmp_storage_map);
+    return;
+  }
+
+  __kmp_storage_map = TRUE;
+  __kmp_storage_map_verbose = TRUE;
+  __kmp_storage_map_verbose_specified = TRUE;
+} // __kmp_stg_parse_storage_map
+
+// Settings printer for KMP_STORAGE_MAP: prints "verbose" when either verbose
+// flag is set, otherwise the boolean __kmp_storage_map. `data` is unused.
+static void __kmp_stg_print_storage_map(kmp_str_buf_t *buffer, char const *name,
+                                        void *data) {
+  if (!__kmp_storage_map_verbose && !__kmp_storage_map_verbose_specified) {
+    __kmp_stg_print_bool(buffer, name, __kmp_storage_map);
+    return;
+  }
+  __kmp_stg_print_str(buffer, name, "verbose");
+} // __kmp_stg_print_storage_map
+
+// -----------------------------------------------------------------------------
+// KMP_ALL_THREADPRIVATE
+// Settings parser: reads an int into __kmp_tp_capacity; data is unused.
+static void __kmp_stg_parse_all_threadprivate(char const *name,
+                                              char const *value, void *data) {
+  // Lower limit is __kmp_max_nth when __kmp_allThreadsSpecified is set, else 1;
+  // the upper limit is always __kmp_max_nth.
+  __kmp_stg_parse_int(name, value,
+                      __kmp_allThreadsSpecified ? __kmp_max_nth : 1,
+                      __kmp_max_nth, &__kmp_tp_capacity);
+} // __kmp_stg_parse_all_threadprivate
+// Settings printer: emits __kmp_tp_capacity via __kmp_stg_print_int.
+static void __kmp_stg_print_all_threadprivate(kmp_str_buf_t *buffer,
+                                              char const *name, void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_tp_capacity); // data is unused
+} // __kmp_stg_print_all_threadprivate
+
+// -----------------------------------------------------------------------------
+// KMP_FOREIGN_THREADS_THREADPRIVATE
+// Settings parser: reads a boolean into __kmp_foreign_tp; data is unused.
+static void __kmp_stg_parse_foreign_threads_threadprivate(char const *name,
+                                                          char const *value,
+                                                          void *data) {
+  __kmp_stg_parse_bool(name, value, &__kmp_foreign_tp);
+} // __kmp_stg_parse_foreign_threads_threadprivate
+// Settings printer: emits __kmp_foreign_tp via __kmp_stg_print_bool.
+static void __kmp_stg_print_foreign_threads_threadprivate(kmp_str_buf_t *buffer,
+                                                          char const *name,
+                                                          void *data) {
+  __kmp_stg_print_bool(buffer, name, __kmp_foreign_tp); // data is unused
+} // __kmp_stg_print_foreign_threads_threadprivate
+
+// -----------------------------------------------------------------------------
+// KMP_AFFINITY, GOMP_CPU_AFFINITY, KMP_TOPOLOGY_METHOD
+
+#if KMP_AFFINITY_SUPPORTED
+// Syntax-check the proc id list at env and copy it.  Return TRUE if successful, FALSE otherwise.
+static int __kmp_parse_affinity_proc_id_list(const char *var, const char *env,
+                                             const char **nextEnv,
+                                             char **proclist) {
+  const char *scan = env; // start of the item / number being read
+  const char *next = scan; // read cursor
+  int empty = TRUE; // no item accepted yet
+  // *proclist stays NULL on every FALSE return; *nextEnv is written only on success.
+  *proclist = NULL;
+  // Each pass consumes one item: a {..} set, a single id, or start-end[:stride].
+  for (;;) {
+    int start, end, stride;
+
+    SKIP_WS(scan);
+    next = scan;
+    if (*next == '\0') {
+      break;
+    }
+
+    if (*next == '{') {
+      int num;
+      next++; // skip '{'
+      SKIP_WS(next);
+      scan = next;
+
+      // Read the first integer in the set (only validated; the value is not kept).
+      if ((*next < '0') || (*next > '9')) {
+        KMP_WARNING(AffSyntaxError, var);
+        return FALSE;
+      }
+      SKIP_DIGITS(next);
+      num = __kmp_str_to_int(scan, *next);
+      KMP_ASSERT(num >= 0);
+
+      for (;;) {
+        // Check for end of set.
+        SKIP_WS(next);
+        if (*next == '}') {
+          next++; // skip '}'
+          break;
+        }
+
+        // Skip optional comma.
+        if (*next == ',') {
+          next++;
+        }
+        SKIP_WS(next);
+
+        // Read the next integer in the set.
+        scan = next;
+        if ((*next < '0') || (*next > '9')) {
+          KMP_WARNING(AffSyntaxError, var);
+          return FALSE;
+        }
+
+        SKIP_DIGITS(next);
+        num = __kmp_str_to_int(scan, *next);
+        KMP_ASSERT(num >= 0);
+      }
+      empty = FALSE;
+
+      SKIP_WS(next);
+      if (*next == ',') {
+        next++;
+      }
+      scan = next;
+      continue;
+    }
+
+    // Next character is not an integer => end of list
+    if ((*next < '0') || (*next > '9')) {
+      if (empty) {
+        KMP_WARNING(AffSyntaxError, var);
+        return FALSE;
+      }
+      break;
+    }
+
+    // Read the first integer.
+    SKIP_DIGITS(next);
+    start = __kmp_str_to_int(scan, *next);
+    KMP_ASSERT(start >= 0);
+    SKIP_WS(next);
+
+    // If this isn't a range, then go on.
+    if (*next != '-') {
+      empty = FALSE;
+
+      // Skip optional comma.
+      if (*next == ',') {
+        next++;
+      }
+      scan = next;
+      continue;
+    }
+
+    // This is a range.  Skip over the '-' and read in the 2nd int.
+    next++; // skip '-'
+    SKIP_WS(next);
+    scan = next;
+    if ((*next < '0') || (*next > '9')) {
+      KMP_WARNING(AffSyntaxError, var);
+      return FALSE;
+    }
+    SKIP_DIGITS(next);
+    end = __kmp_str_to_int(scan, *next);
+    KMP_ASSERT(end >= 0);
+
+    // Check for a stride parameter
+    stride = 1;
+    SKIP_WS(next);
+    if (*next == ':') {
+      // A stride is specified.  Skip over the ':' and read the 3rd int.
+      int sign = +1;
+      next++; // skip ':'
+      SKIP_WS(next);
+      scan = next;
+      if (*next == '-') {
+        sign = -1;
+        next++;
+        SKIP_WS(next);
+        scan = next;
+      }
+      if ((*next < '0') || (*next > '9')) {
+        KMP_WARNING(AffSyntaxError, var);
+        return FALSE;
+      }
+      SKIP_DIGITS(next);
+      stride = __kmp_str_to_int(scan, *next);
+      KMP_ASSERT(stride >= 0);
+      stride *= sign;
+    }
+
+    // Do some range checks.
+    if (stride == 0) {
+      KMP_WARNING(AffZeroStride, var);
+      return FALSE;
+    }
+    if (stride > 0) {
+      if (start > end) {
+        KMP_WARNING(AffStartGreaterEnd, var, start, end);
+        return FALSE;
+      }
+    } else {
+      if (start < end) {
+        KMP_WARNING(AffStrideLessZero, var, start, end);
+        return FALSE;
+      }
+    }
+    if ((end - start) / stride > 65536) {
+      KMP_WARNING(AffRangeTooBig, var, end, start, stride);
+      return FALSE;
+    }
+
+    empty = FALSE;
+
+    // Skip optional comma.
+    SKIP_WS(next);
+    if (*next == ',') {
+      next++;
+    }
+    scan = next;
+  }
+  // Success: report where scanning stopped, then copy env[0 .. next) out.
+  *nextEnv = next;
+
+  {
+    int len = next - env;
+    char *retlist = (char *)__kmp_allocate((len + 1) * sizeof(char));
+    KMP_MEMCPY_S(retlist, (len + 1) * sizeof(char), env, len * sizeof(char));
+    retlist[len] = '\0'; // the copy is NUL-terminated
+    *proclist = retlist;
+  }
+  return TRUE;
+}
+
+// If KMP_AFFINITY is specified without a type, then
+// __kmp_affinity_notype should point to its setting.
+static kmp_setting_t *__kmp_affinity_notype = NULL;
+
+// Parse a KMP_AFFINITY-style value into the caller-supplied outputs.
+//
+// The value is a comma-separated list of tokens: an affinity type (none,
+// scatter, compact, logical, physical, explicit, balanced, disabled), the
+// modifiers [no]verbose, [no]warnings, [no]respect, [no]duplicates/[no]dups,
+// granularity=<level> (or gran=<level>), proclist=[...], and bare integers.
+// Each out_* location is written only the first time its token appears; a
+// repeated token is reported with AffParamDefined and otherwise ignored
+// (__kmp_nested_proc_bind.bind_types[0] is still updated by every type
+// token). At most two integers are kept; after the scan they are assigned to
+// *out_compact / *out_offset according to the resulting type, and the
+// proclist/type combination is checked for consistency.
+//
+// If __kmp_init_middle is already set, a warning is issued and nothing is
+// parsed. `name` is used for messages and for __kmp_env_toPrint only.
+static void __kmp_parse_affinity_env(char const *name, char const *value,
+                                     enum affinity_type *out_type,
+                                     char **out_proclist, int *out_verbose,
+                                     int *out_warn, int *out_respect,
+                                     enum affinity_gran *out_gran,
+                                     int *out_gran_levels, int *out_dups,
+                                     int *out_compact, int *out_offset) {
+  char *buffer = NULL; // Copy of env var value.
+  char *buf = NULL; // Buffer for strtok_r() function.
+  char *next = NULL; // end of token / start of next.
+  const char *start; // start of current token (for err msgs)
+  int count = 0; // Counter of parsed integer numbers.
+  int number[2]; // Parsed numbers.
+
+  // Guards.
+  int type = 0;
+  int proclist = 0;
+  int verbose = 0;
+  int warnings = 0;
+  int respect = 0;
+  int gran = 0;
+  int dups = 0;
+
+  KMP_ASSERT(value != NULL);
+
+  if (TCR_4(__kmp_init_middle)) {
+    KMP_WARNING(EnvMiddleWarn, name);
+    __kmp_env_toPrint(name, 0);
+    return;
+  }
+  __kmp_env_toPrint(name, 1);
+
+  buffer =
+      __kmp_str_format("%s", value); // Copy env var to keep original intact.
+  buf = buffer;
+  SKIP_WS(buf);
+
+// Helper macros.
+
+// If we see a parse error, emit a warning and scan to the next ",".
+//
+// FIXME - there's got to be a better way to print an error
+// message, hopefully without overwriting pieces of buf.
+#define EMIT_WARN(skip, errlist)                                               \
+  {                                                                            \
+    char ch;                                                                   \
+    if (skip) {                                                                \
+      SKIP_TO(next, ',');                                                      \
+    }                                                                          \
+    ch = *next;                                                                \
+    *next = '\0';                                                              \
+    KMP_WARNING errlist;                                                       \
+    *next = ch;                                                                \
+    if (skip) {                                                                \
+      if (ch == ',')                                                           \
+        next++;                                                                \
+    }                                                                          \
+    buf = next;                                                                \
+  }
+
+// Store _val into _var the first time the guard is hit; warn on repeats.
+#define _set_param(_guard, _var, _val)                                         \
+  {                                                                            \
+    if (_guard == 0) {                                                         \
+      _var = _val;                                                             \
+    } else {                                                                   \
+      EMIT_WARN(FALSE, (AffParamDefined, name, start));                        \
+    }                                                                          \
+    ++_guard;                                                                  \
+  }
+
+#define set_type(val) _set_param(type, *out_type, val)
+#define set_verbose(val) _set_param(verbose, *out_verbose, val)
+#define set_warnings(val) _set_param(warnings, *out_warn, val)
+#define set_respect(val) _set_param(respect, *out_respect, val)
+#define set_dups(val) _set_param(dups, *out_dups, val)
+#define set_proclist(val) _set_param(proclist, *out_proclist, val)
+
+#define set_gran(val, levels)                                                  \
+  {                                                                            \
+    if (gran == 0) {                                                           \
+      *out_gran = val;                                                         \
+      *out_gran_levels = levels;                                               \
+    } else {                                                                   \
+      EMIT_WARN(FALSE, (AffParamDefined, name, start));                        \
+    }                                                                          \
+    ++gran;                                                                    \
+  }
+
+  KMP_DEBUG_ASSERT((__kmp_nested_proc_bind.bind_types != NULL) &&
+                   (__kmp_nested_proc_bind.used > 0));
+
+  while (*buf != '\0') {
+    start = next = buf;
+
+    if (__kmp_match_str("none", buf, CCAST(const char **, &next))) {
+      set_type(affinity_none);
+      __kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
+      buf = next;
+    } else if (__kmp_match_str("scatter", buf, CCAST(const char **, &next))) {
+      set_type(affinity_scatter);
+      __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
+      buf = next;
+    } else if (__kmp_match_str("compact", buf, CCAST(const char **, &next))) {
+      set_type(affinity_compact);
+      __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
+      buf = next;
+    } else if (__kmp_match_str("logical", buf, CCAST(const char **, &next))) {
+      set_type(affinity_logical);
+      __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
+      buf = next;
+    } else if (__kmp_match_str("physical", buf, CCAST(const char **, &next))) {
+      set_type(affinity_physical);
+      __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
+      buf = next;
+    } else if (__kmp_match_str("explicit", buf, CCAST(const char **, &next))) {
+      set_type(affinity_explicit);
+      __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
+      buf = next;
+    } else if (__kmp_match_str("balanced", buf, CCAST(const char **, &next))) {
+      set_type(affinity_balanced);
+      __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
+      buf = next;
+    } else if (__kmp_match_str("disabled", buf, CCAST(const char **, &next))) {
+      set_type(affinity_disabled);
+      __kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
+      buf = next;
+    } else if (__kmp_match_str("verbose", buf, CCAST(const char **, &next))) {
+      set_verbose(TRUE);
+      buf = next;
+    } else if (__kmp_match_str("noverbose", buf, CCAST(const char **, &next))) {
+      set_verbose(FALSE);
+      buf = next;
+    } else if (__kmp_match_str("warnings", buf, CCAST(const char **, &next))) {
+      set_warnings(TRUE);
+      buf = next;
+    } else if (__kmp_match_str("nowarnings", buf,
+                               CCAST(const char **, &next))) {
+      set_warnings(FALSE);
+      buf = next;
+    } else if (__kmp_match_str("respect", buf, CCAST(const char **, &next))) {
+      set_respect(TRUE);
+      buf = next;
+    } else if (__kmp_match_str("norespect", buf, CCAST(const char **, &next))) {
+      set_respect(FALSE);
+      buf = next;
+    } else if (__kmp_match_str("duplicates", buf,
+                               CCAST(const char **, &next)) ||
+               __kmp_match_str("dups", buf, CCAST(const char **, &next))) {
+      set_dups(TRUE);
+      buf = next;
+    } else if (__kmp_match_str("noduplicates", buf,
+                               CCAST(const char **, &next)) ||
+               __kmp_match_str("nodups", buf, CCAST(const char **, &next))) {
+      set_dups(FALSE);
+      buf = next;
+    } else if (__kmp_match_str("granularity", buf,
+                               CCAST(const char **, &next)) ||
+               __kmp_match_str("gran", buf, CCAST(const char **, &next))) {
+      SKIP_WS(next);
+      if (*next != '=') {
+        EMIT_WARN(TRUE, (AffInvalidParam, name, start));
+        continue;
+      }
+      next++; // skip '='
+      SKIP_WS(next);
+
+      buf = next;
+      if (__kmp_match_str("fine", buf, CCAST(const char **, &next))) {
+        set_gran(affinity_gran_fine, -1);
+        buf = next;
+      } else if (__kmp_match_str("thread", buf, CCAST(const char **, &next))) {
+        set_gran(affinity_gran_thread, -1);
+        buf = next;
+      } else if (__kmp_match_str("core", buf, CCAST(const char **, &next))) {
+        set_gran(affinity_gran_core, -1);
+        buf = next;
+#if KMP_USE_HWLOC
+      } else if (__kmp_match_str("tile", buf, CCAST(const char **, &next))) {
+        set_gran(affinity_gran_tile, -1);
+        buf = next;
+#endif
+      } else if (__kmp_match_str("package", buf, CCAST(const char **, &next))) {
+        set_gran(affinity_gran_package, -1);
+        buf = next;
+      } else if (__kmp_match_str("node", buf, CCAST(const char **, &next))) {
+        set_gran(affinity_gran_node, -1);
+        buf = next;
+#if KMP_GROUP_AFFINITY
+      } else if (__kmp_match_str("group", buf, CCAST(const char **, &next))) {
+        set_gran(affinity_gran_group, -1);
+        buf = next;
+#endif /* KMP_GROUP AFFINITY */
+      } else if ((*buf >= '0') && (*buf <= '9')) {
+        // Numeric granularity: stored as the level count with the default kind.
+        int n;
+        next = buf;
+        SKIP_DIGITS(next);
+        n = __kmp_str_to_int(buf, *next);
+        KMP_ASSERT(n >= 0);
+        buf = next;
+        set_gran(affinity_gran_default, n);
+      } else {
+        EMIT_WARN(TRUE, (AffInvalidParam, name, start));
+        continue;
+      }
+    } else if (__kmp_match_str("proclist", buf, CCAST(const char **, &next))) {
+      char *temp_proclist;
+
+      SKIP_WS(next);
+      if (*next != '=') {
+        EMIT_WARN(TRUE, (AffInvalidParam, name, start));
+        continue;
+      }
+      next++; // skip '='
+      SKIP_WS(next);
+      if (*next != '[') {
+        EMIT_WARN(TRUE, (AffInvalidParam, name, start));
+        continue;
+      }
+      next++; // skip '['
+      buf = next;
+      if (!__kmp_parse_affinity_proc_id_list(
+              name, buf, CCAST(const char **, &next), &temp_proclist)) {
+        // warning already emitted.
+        SKIP_TO(next, ']');
+        if (*next == ']')
+          next++;
+        SKIP_TO(next, ',');
+        if (*next == ',')
+          next++;
+        buf = next;
+        continue;
+      }
+      if (*next != ']') {
+        EMIT_WARN(TRUE, (AffInvalidParam, name, start));
+        continue;
+      }
+      next++; // skip ']'
+      set_proclist(temp_proclist);
+    } else if ((*buf >= '0') && (*buf <= '9')) {
+      // Parse integer numbers -- permute and offset.
+      int n;
+      next = buf;
+      SKIP_DIGITS(next);
+      n = __kmp_str_to_int(buf, *next);
+      KMP_ASSERT(n >= 0);
+      buf = next;
+      if (count < 2) {
+        number[count] = n;
+      } else {
+        KMP_WARNING(AffManyParams, name, start);
+      }
+      ++count;
+    } else {
+      EMIT_WARN(TRUE, (AffInvalidParam, name, start));
+      continue;
+    }
+
+    // A token must be followed by ',' or the end of the string.
+    SKIP_WS(next);
+    if (*next == ',') {
+      next++;
+      SKIP_WS(next);
+    } else if (*next != '\0') {
+      const char *temp = next;
+      EMIT_WARN(TRUE, (ParseExtraCharsWarn, name, temp));
+      continue;
+    }
+    buf = next;
+  } // while
+
+// Drop every helper macro defined above so none of them leaks into the rest of
+// the file.
+#undef EMIT_WARN
+#undef _set_param
+#undef set_type
+#undef set_verbose
+#undef set_warnings
+#undef set_respect
+#undef set_dups
+#undef set_proclist
+#undef set_gran
+
+  __kmp_str_free(&buffer);
+
+  if (proclist) {
+    if (!type) {
+      KMP_WARNING(AffProcListNoType, name);
+      *out_type = affinity_explicit;
+      __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
+    } else if (*out_type != affinity_explicit) {
+      KMP_WARNING(AffProcListNotExplicit, name);
+      KMP_ASSERT(*out_proclist != NULL);
+      // NOTE(review): the list was produced by
+      // __kmp_parse_affinity_proc_id_list via __kmp_allocate; confirm that
+      // KMP_INTERNAL_FREE is the matching deallocator.
+      KMP_INTERNAL_FREE(*out_proclist);
+      *out_proclist = NULL;
+    }
+  }
+  // Post-processing works on the out_* locations only, so the result does not
+  // depend on which variables the caller passed in.
+  switch (*out_type) {
+  case affinity_logical:
+  case affinity_physical: {
+    if (count > 0) {
+      *out_offset = number[0];
+    }
+    if (count > 1) {
+      KMP_WARNING(AffManyParamsForLogic, name, number[1]);
+    }
+  } break;
+  case affinity_balanced: {
+    if (count > 0) {
+      *out_compact = number[0];
+    }
+    if (count > 1) {
+      *out_offset = number[1];
+    }
+
+    // Balanced affinity needs a concrete granularity; pick one if none given.
+    if (*out_gran == affinity_gran_default) {
+#if KMP_MIC_SUPPORTED
+      if (__kmp_mic_type != non_mic) {
+        if (*out_verbose || *out_warn) {
+          KMP_WARNING(AffGranUsing, "KMP_AFFINITY", "fine");
+        }
+        *out_gran = affinity_gran_fine;
+      } else
+#endif
+      {
+        if (*out_verbose || *out_warn) {
+          KMP_WARNING(AffGranUsing, "KMP_AFFINITY", "core");
+        }
+        *out_gran = affinity_gran_core;
+      }
+    }
+  } break;
+  case affinity_scatter:
+  case affinity_compact: {
+    if (count > 0) {
+      *out_compact = number[0];
+    }
+    if (count > 1) {
+      *out_offset = number[1];
+    }
+  } break;
+  case affinity_explicit: {
+    if (*out_proclist == NULL) {
+      // "explicit" without a usable proclist falls back to no affinity.
+      KMP_WARNING(AffNoProcList, name);
+      *out_type = affinity_none;
+    }
+    if (count > 0) {
+      KMP_WARNING(AffNoParam, name, "explicit");
+    }
+  } break;
+  case affinity_none: {
+    if (count > 0) {
+      KMP_WARNING(AffNoParam, name, "none");
+    }
+  } break;
+  case affinity_disabled: {
+    if (count > 0) {
+      KMP_WARNING(AffNoParam, name, "disabled");
+    }
+  } break;
+  case affinity_default: {
+    if (count > 0) {
+      KMP_WARNING(AffNoParam, name, "default");
+    }
+  } break;
+  default: { KMP_ASSERT(0); }
+  }
+} // __kmp_parse_affinity_env
+
+// Settings parser for KMP_AFFINITY. `data` is the list of rival settings; if
+// the rival check reports a conflict nothing is parsed. Otherwise the value
+// is parsed straight into the global __kmp_affinity_* variables.
+static void __kmp_stg_parse_affinity(char const *name, char const *value,
+                                     void *data) {
+  kmp_setting_t **rivals = (kmp_setting_t **)data;
+
+  if (__kmp_stg_check_rivals(name, value, rivals)) {
+    return;
+  }
+
+  __kmp_parse_affinity_env(
+      name, value, &__kmp_affinity_type, &__kmp_affinity_proclist,
+      &__kmp_affinity_verbose, &__kmp_affinity_warnings,
+      &__kmp_affinity_respect_mask, &__kmp_affinity_gran,
+      &__kmp_affinity_gran_levels, &__kmp_affinity_dups,
+      &__kmp_affinity_compact, &__kmp_affinity_offset);
+} // __kmp_stg_parse_affinity
+
+// Settings printer for KMP_AFFINITY. Appends the variable name and a quoted,
+// comma-separated value built from the global affinity state: [no]verbose,
+// [no]warnings, then (only when KMP_AFFINITY_CAPABLE()) [no]respect and the
+// granularity, and finally the affinity type with its numeric parameters.
+// When affinity is not capable the type is printed as "disabled". `data` is
+// unused.
+static void __kmp_stg_print_affinity(kmp_str_buf_t *buffer, char const *name,
+                                     void *data) {
+  if (__kmp_env_format) {
+    KMP_STR_BUF_PRINT_NAME_EX(name);
+  } else {
+    __kmp_str_buf_print(buffer, "   %s='", name);
+  }
+  if (__kmp_affinity_verbose) {
+    __kmp_str_buf_print(buffer, "%s,", "verbose");
+  } else {
+    __kmp_str_buf_print(buffer, "%s,", "noverbose");
+  }
+  if (__kmp_affinity_warnings) {
+    __kmp_str_buf_print(buffer, "%s,", "warnings");
+  } else {
+    __kmp_str_buf_print(buffer, "%s,", "nowarnings");
+  }
+  if (KMP_AFFINITY_CAPABLE()) {
+    if (__kmp_affinity_respect_mask) {
+      __kmp_str_buf_print(buffer, "%s,", "respect");
+    } else {
+      __kmp_str_buf_print(buffer, "%s,", "norespect");
+    }
+    switch (__kmp_affinity_gran) {
+    case affinity_gran_default:
+      __kmp_str_buf_print(buffer, "%s", "granularity=default,");
+      break;
+    case affinity_gran_fine:
+      __kmp_str_buf_print(buffer, "%s", "granularity=fine,");
+      break;
+    case affinity_gran_thread:
+      __kmp_str_buf_print(buffer, "%s", "granularity=thread,");
+      break;
+    case affinity_gran_core:
+      __kmp_str_buf_print(buffer, "%s", "granularity=core,");
+      break;
+#if KMP_USE_HWLOC
+    // The parser accepts "tile" under KMP_USE_HWLOC, so it must be printable
+    // too; without this case the granularity was silently omitted.
+    case affinity_gran_tile:
+      __kmp_str_buf_print(buffer, "%s", "granularity=tile,");
+      break;
+#endif /* KMP_USE_HWLOC */
+    case affinity_gran_package:
+      __kmp_str_buf_print(buffer, "%s", "granularity=package,");
+      break;
+    case affinity_gran_node:
+      __kmp_str_buf_print(buffer, "%s", "granularity=node,");
+      break;
+#if KMP_GROUP_AFFINITY
+    case affinity_gran_group:
+      __kmp_str_buf_print(buffer, "%s", "granularity=group,");
+      break;
+#endif /* KMP_GROUP_AFFINITY */
+    }
+  }
+  if (!KMP_AFFINITY_CAPABLE()) {
+    __kmp_str_buf_print(buffer, "%s", "disabled");
+  } else
+    switch (__kmp_affinity_type) {
+    case affinity_none:
+      __kmp_str_buf_print(buffer, "%s", "none");
+      break;
+    case affinity_physical:
+      __kmp_str_buf_print(buffer, "%s,%d", "physical", __kmp_affinity_offset);
+      break;
+    case affinity_logical:
+      __kmp_str_buf_print(buffer, "%s,%d", "logical", __kmp_affinity_offset);
+      break;
+    case affinity_compact:
+      __kmp_str_buf_print(buffer, "%s,%d,%d", "compact", __kmp_affinity_compact,
+                          __kmp_affinity_offset);
+      break;
+    case affinity_scatter:
+      __kmp_str_buf_print(buffer, "%s,%d,%d", "scatter", __kmp_affinity_compact,
+                          __kmp_affinity_offset);
+      break;
+    case affinity_explicit:
+      __kmp_str_buf_print(buffer, "%s=[%s],%s", "proclist",
+                          __kmp_affinity_proclist, "explicit");
+      break;
+    case affinity_balanced:
+      __kmp_str_buf_print(buffer, "%s,%d,%d", "balanced",
+                          __kmp_affinity_compact, __kmp_affinity_offset);
+      break;
+    case affinity_disabled:
+      __kmp_str_buf_print(buffer, "%s", "disabled");
+      break;
+    case affinity_default:
+      __kmp_str_buf_print(buffer, "%s", "default");
+      break;
+    default:
+      __kmp_str_buf_print(buffer, "%s", "<unknown>");
+      break;
+    }
+  __kmp_str_buf_print(buffer, "'\n");
+} //__kmp_stg_print_affinity
+
+#ifdef KMP_GOMP_COMPAT
+
+// Parses a GOMP_CPU_AFFINITY-style proc id list and maps it onto the
+// equivalent of "granularity=fine,explicit,proclist=...".
+// `data` carries the rival settings list checked before anything else.
+static void __kmp_stg_parse_gomp_cpu_affinity(char const *name,
+                                              char const *value, void *data) {
+  kmp_setting_t **rivals = (kmp_setting_t **)data;
+  const char *rest = NULL;
+  char *parsed_list;
+
+  // A rival setting takes precedence: leave everything untouched.
+  if (__kmp_stg_check_rivals(name, value, rivals)) {
+    return;
+  }
+
+  // Once __kmp_init_middle is set the value is rejected with a warning.
+  if (TCR_4(__kmp_init_middle)) {
+    KMP_WARNING(EnvMiddleWarn, name);
+    __kmp_env_toPrint(name, 0);
+    return;
+  }
+
+  __kmp_env_toPrint(name, 1);
+
+  if (!__kmp_parse_affinity_proc_id_list(name, value, &rest, &parsed_list)) {
+    // Warning already emitted
+    __kmp_affinity_type = affinity_none;
+    __kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
+    return;
+  }
+
+  SKIP_WS(rest);
+  if (*rest != '\0') {
+    // Trailing characters after the list: report and discard the copy.
+    KMP_WARNING(AffSyntaxError, name);
+    if (parsed_list != NULL) {
+      KMP_INTERNAL_FREE((void *)parsed_list);
+    }
+    return;
+  }
+
+  // GOMP_CPU_AFFINITY => granularity=fine,explicit,proclist=...
+  __kmp_affinity_proclist = parsed_list;
+  __kmp_affinity_type = affinity_explicit;
+  __kmp_affinity_gran = affinity_gran_fine;
+  __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
+} // __kmp_stg_parse_gomp_cpu_affinity
+
+#endif /* KMP_GOMP_COMPAT */
+
+/*-----------------------------------------------------------------------------
+The OMP_PLACES proc id list parser. Here is the grammar:
+
+place_list := place
+place_list := place , place_list
+place := num
+place := place : num
+place := place : num : signed
+place := { subplace_list }
+place := ! place                  // (lowest priority)
+subplace_list := subplace
+subplace_list := subplace , subplace_list
+subplace := num
+subplace := num : num
+subplace := num : num : signed
+signed := num
+signed := + signed
+signed := - signed
+-----------------------------------------------------------------------------*/
+
+// Validates the inside of a '{ ... }' place: a comma-separated list of
+// subplaces of the form num, num:num or num:num:signed (see the grammar
+// above). `*scan` is advanced as input is consumed. Returns TRUE with `*scan`
+// left on the closing '}' (the caller skips it); returns FALSE after emitting
+// a SyntaxErrorUsing warning for `var` on any malformed input.
+// The parsed start/count/stride values are only asserted non-negative here;
+// they are not stored or returned.
+static int __kmp_parse_subplace_list(const char *var, const char **scan) {
+  const char *next;
+
+  for (;;) {
+    int start, count, stride;
+
+    //
+    // Read in the starting proc id
+    //
+    SKIP_WS(*scan);
+    if ((**scan < '0') || (**scan > '9')) {
+      KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+      return FALSE;
+    }
+    next = *scan;
+    SKIP_DIGITS(next);
+    start = __kmp_str_to_int(*scan, *next);
+    KMP_ASSERT(start >= 0);
+    *scan = next;
+
+    // valid follow sets are ',' ':' and '}'
+    SKIP_WS(*scan);
+    if (**scan == '}') {
+      break;
+    }
+    if (**scan == ',') {
+      (*scan)++; // skip ','
+      continue;
+    }
+    if (**scan != ':') {
+      KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+      return FALSE;
+    }
+    (*scan)++; // skip ':'
+
+    // Read count parameter
+    SKIP_WS(*scan);
+    if ((**scan < '0') || (**scan > '9')) {
+      KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+      return FALSE;
+    }
+    next = *scan;
+    SKIP_DIGITS(next);
+    count = __kmp_str_to_int(*scan, *next);
+    KMP_ASSERT(count >= 0);
+    *scan = next;
+
+    // valid follow sets are ',' ':' and '}'
+    SKIP_WS(*scan);
+    if (**scan == '}') {
+      break;
+    }
+    if (**scan == ',') {
+      (*scan)++; // skip ','
+      continue;
+    }
+    if (**scan != ':') {
+      KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+      return FALSE;
+    }
+    (*scan)++; // skip ':'
+
+    // Read stride parameter
+    // Any run of '+' / '-' signs is accepted; each '-' flips the sign.
+    int sign = +1;
+    for (;;) {
+      SKIP_WS(*scan);
+      if (**scan == '+') {
+        (*scan)++; // skip '+'
+        continue;
+      }
+      if (**scan == '-') {
+        sign *= -1;
+        (*scan)++; // skip '-'
+        continue;
+      }
+      break;
+    }
+    SKIP_WS(*scan);
+    if ((**scan < '0') || (**scan > '9')) {
+      KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+      return FALSE;
+    }
+    next = *scan;
+    SKIP_DIGITS(next);
+    stride = __kmp_str_to_int(*scan, *next);
+    KMP_ASSERT(stride >= 0);
+    *scan = next;
+    stride *= sign;
+
+    // valid follow sets are ',' and '}'
+    SKIP_WS(*scan);
+    if (**scan == '}') {
+      break;
+    }
+    if (**scan == ',') {
+      (*scan)++; // skip ','
+      continue;
+    }
+
+    KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+    return FALSE;
+  }
+  return TRUE;
+}
+
+// Validates a single place at `*scan`: a braced subplace list, a '!'-negated
+// place, or a bare proc number. Advances `*scan` past what was consumed and
+// returns TRUE; on malformed input emits a SyntaxErrorUsing warning for `var`
+// and returns FALSE.
+static int __kmp_parse_place(const char *var, const char **scan) {
+  // valid follow sets are '{' '!' and num
+  SKIP_WS(*scan);
+  const char lead = **scan;
+
+  if (lead == '{') {
+    (*scan)++; // skip '{'
+    if (!__kmp_parse_subplace_list(var, scan)) {
+      return FALSE;
+    }
+    if (**scan != '}') {
+      KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+      return FALSE;
+    }
+    (*scan)++; // skip '}'
+    return TRUE;
+  }
+
+  if (lead == '!') {
+    (*scan)++; // skip '!'
+    // '!' has lower precedence than ':', so simply parse the negated place.
+    return __kmp_parse_place(var, scan);
+  }
+
+  if ((lead >= '0') && (lead <= '9')) {
+    const char *digits_end = *scan;
+    SKIP_DIGITS(digits_end);
+    int proc = __kmp_str_to_int(*scan, *digits_end);
+    KMP_ASSERT(proc >= 0);
+    *scan = digits_end;
+    return TRUE;
+  }
+
+  KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+  return FALSE;
+}
+
+// Validates `env` against the place_list grammar above: places separated by
+// ',', each optionally followed by ":count" and ":count:stride". On success a
+// NUL-terminated copy of the scanned text is allocated with __kmp_allocate,
+// stored in `*place_list`, and TRUE is returned. On a syntax error a
+// SyntaxErrorUsing warning is emitted for `var`, `*place_list` is left
+// untouched and FALSE is returned.
+// The count/stride values are only asserted non-negative, not stored.
+static int __kmp_parse_place_list(const char *var, const char *env,
+                                  char **place_list) {
+  const char *scan = env;
+  const char *next = scan;
+
+  for (;;) {
+    int count, stride;
+
+    if (!__kmp_parse_place(var, &scan)) {
+      return FALSE;
+    }
+
+    // valid follow sets are ',' ':' and EOL
+    SKIP_WS(scan);
+    if (*scan == '\0') {
+      break;
+    }
+    if (*scan == ',') {
+      scan++; // skip ','
+      continue;
+    }
+    if (*scan != ':') {
+      KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+      return FALSE;
+    }
+    scan++; // skip ':'
+
+    // Read count parameter
+    SKIP_WS(scan);
+    if ((*scan < '0') || (*scan > '9')) {
+      KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+      return FALSE;
+    }
+    next = scan;
+    SKIP_DIGITS(next);
+    count = __kmp_str_to_int(scan, *next);
+    KMP_ASSERT(count >= 0);
+    scan = next;
+
+    // valid follow sets are ',' ':' and EOL
+    SKIP_WS(scan);
+    if (*scan == '\0') {
+      break;
+    }
+    if (*scan == ',') {
+      scan++; // skip ','
+      continue;
+    }
+    if (*scan != ':') {
+      KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+      return FALSE;
+    }
+    scan++; // skip ':'
+
+    // Read stride parameter
+    // Any run of '+' / '-' signs is accepted; each '-' flips the sign.
+    int sign = +1;
+    for (;;) {
+      SKIP_WS(scan);
+      if (*scan == '+') {
+        scan++; // skip '+'
+        continue;
+      }
+      if (*scan == '-') {
+        sign *= -1;
+        scan++; // skip '-'
+        continue;
+      }
+      break;
+    }
+    SKIP_WS(scan);
+    if ((*scan < '0') || (*scan > '9')) {
+      KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+      return FALSE;
+    }
+    next = scan;
+    SKIP_DIGITS(next);
+    stride = __kmp_str_to_int(scan, *next);
+    KMP_ASSERT(stride >= 0);
+    scan = next;
+    stride *= sign;
+
+    // valid follow sets are ',' and EOL
+    SKIP_WS(scan);
+    if (*scan == '\0') {
+      break;
+    }
+    if (*scan == ',') {
+      scan++; // skip ','
+      continue;
+    }
+
+    KMP_WARNING(SyntaxErrorUsing, var, "\"threads\"");
+    return FALSE;
+  }
+
+  // The loop only exits with scan on the terminating NUL, so this copies the
+  // whole of env (scan - env characters) plus a terminator.
+  {
+    int len = scan - env;
+    char *retlist = (char *)__kmp_allocate((len + 1) * sizeof(char));
+    KMP_MEMCPY_S(retlist, (len + 1) * sizeof(char), env, len * sizeof(char));
+    retlist[len] = '\0';
+    *place_list = retlist;
+  }
+  return TRUE;
+}
+
+// Parses the places setting `name` from `value`. Accepted forms are an
+// abstract name ("threads", "cores", "sockets", plus "tiles" when
+// KMP_USE_HWLOC) optionally followed by "(count)", or otherwise an explicit
+// place list handled by __kmp_parse_place_list. Updates the global affinity
+// type/granularity/dups settings, and for the "(count)" form
+// __kmp_affinity_num_places. `data` carries the rival settings list.
+static void __kmp_stg_parse_places(char const *name, char const *value,
+                                   void *data) {
+  int count;
+  const char *scan = value;
+  const char *next = scan;
+  const char *kind = "\"threads\"";
+  kmp_setting_t **rivals = (kmp_setting_t **)data;
+  int rc;
+
+  rc = __kmp_stg_check_rivals(name, value, rivals);
+  if (rc) {
+    return;
+  }
+
+  // If OMP_PROC_BIND is not specified but OMP_PLACES is,
+  // then let OMP_PROC_BIND default to true.
+  if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default) {
+    __kmp_nested_proc_bind.bind_types[0] = proc_bind_true;
+  }
+
+  //__kmp_affinity_num_places = 0;
+
+  // Abstract names all map to compact affinity; only the granularity (and the
+  // `kind` text used in later warnings) differs.
+  if (__kmp_match_str("threads", scan, &next)) {
+    scan = next;
+    __kmp_affinity_type = affinity_compact;
+    __kmp_affinity_gran = affinity_gran_thread;
+    __kmp_affinity_dups = FALSE;
+    kind = "\"threads\"";
+  } else if (__kmp_match_str("cores", scan, &next)) {
+    scan = next;
+    __kmp_affinity_type = affinity_compact;
+    __kmp_affinity_gran = affinity_gran_core;
+    __kmp_affinity_dups = FALSE;
+    kind = "\"cores\"";
+#if KMP_USE_HWLOC
+  } else if (__kmp_match_str("tiles", scan, &next)) {
+    scan = next;
+    __kmp_affinity_type = affinity_compact;
+    __kmp_affinity_gran = affinity_gran_tile;
+    __kmp_affinity_dups = FALSE;
+    kind = "\"tiles\"";
+#endif
+  } else if (__kmp_match_str("sockets", scan, &next)) {
+    scan = next;
+    __kmp_affinity_type = affinity_compact;
+    __kmp_affinity_gran = affinity_gran_package;
+    __kmp_affinity_dups = FALSE;
+    kind = "\"sockets\"";
+  } else {
+    // Not an abstract name: treat the whole value as an explicit place list.
+    // Any previous proclist is released first; if parsing fails the proclist
+    // stays NULL and the affinity type is left unchanged.
+    // NOTE(review): the list is released here with KMP_INTERNAL_FREE while
+    // __kmp_parse_place_list allocates with __kmp_allocate -- confirm these
+    // allocators are meant to pair.
+    if (__kmp_affinity_proclist != NULL) {
+      KMP_INTERNAL_FREE((void *)__kmp_affinity_proclist);
+      __kmp_affinity_proclist = NULL;
+    }
+    if (__kmp_parse_place_list(name, value, &__kmp_affinity_proclist)) {
+      __kmp_affinity_type = affinity_explicit;
+      __kmp_affinity_gran = affinity_gran_fine;
+      __kmp_affinity_dups = FALSE;
+      // NOTE(review): bind_types[0] was already promoted from
+      // proc_bind_default near the top of this function, so this check looks
+      // redundant -- confirm before removing.
+      if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default) {
+        __kmp_nested_proc_bind.bind_types[0] = proc_bind_true;
+      }
+    }
+    return;
+  }
+
+  // NOTE(review): same apparently redundant promotion as above.
+  if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default) {
+    __kmp_nested_proc_bind.bind_types[0] = proc_bind_true;
+  }
+
+  // A bare abstract name with nothing after it is complete.
+  SKIP_WS(scan);
+  if (*scan == '\0') {
+    return;
+  }
+
+  // Parse option count parameter in parentheses
+  if (*scan != '(') {
+    KMP_WARNING(SyntaxErrorUsing, name, kind);
+    return;
+  }
+  scan++; // skip '('
+
+  // NOTE(review): no digit check precedes this conversion, so an empty
+  // "()" reaches __kmp_str_to_int with no digits -- confirm the resulting
+  // count is acceptable.
+  SKIP_WS(scan);
+  next = scan;
+  SKIP_DIGITS(next);
+  count = __kmp_str_to_int(scan, *next);
+  KMP_ASSERT(count >= 0);
+  scan = next;
+
+  SKIP_WS(scan);
+  if (*scan != ')') {
+    KMP_WARNING(SyntaxErrorUsing, name, kind);
+    return;
+  }
+  scan++; // skip ')'
+
+  // Trailing text after ')' only produces a warning; the count is still used.
+  SKIP_WS(scan);
+  if (*scan != '\0') {
+    KMP_WARNING(ParseExtraCharsWarn, name, scan);
+  }
+  __kmp_affinity_num_places = count;
+}
+
+// Appends the places setting to `buffer`. Prints the explicit proc list, or
+// an abstract name (threads/cores/tiles/sockets) with an optional "(count)",
+// or the localized "not defined" text when no binding applies.
+// `data` is unused.
+static void __kmp_stg_print_places(kmp_str_buf_t *buffer, char const *name,
+                                   void *data) {
+  if (__kmp_env_format) {
+    KMP_STR_BUF_PRINT_NAME;
+  } else {
+    __kmp_str_buf_print(buffer, "   %s", name);
+  }
+
+  // No proc-bind vector, or binding explicitly false: nothing to report.
+  if ((__kmp_nested_proc_bind.used == 0) ||
+      (__kmp_nested_proc_bind.bind_types == NULL) ||
+      (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
+    __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined));
+    return;
+  }
+
+  // Explicit place list: echo the stored list if there is one.
+  if (__kmp_affinity_type == affinity_explicit) {
+    if (__kmp_affinity_proclist == NULL) {
+      __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined));
+    } else {
+      __kmp_str_buf_print(buffer, "='%s'\n", __kmp_affinity_proclist);
+    }
+    return;
+  }
+
+  // Only compact affinity corresponds to an abstract place name.
+  if (__kmp_affinity_type != affinity_compact) {
+    __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined));
+    return;
+  }
+
+  // Prefer the mask count, then the requested place count; 0 means
+  // "no count to show".
+  int place_count = 0;
+  if (__kmp_affinity_num_masks > 0) {
+    place_count = __kmp_affinity_num_masks;
+  } else if (__kmp_affinity_num_places > 0) {
+    place_count = __kmp_affinity_num_places;
+  }
+
+  switch (__kmp_affinity_gran) {
+  case affinity_gran_thread:
+    if (place_count > 0) {
+      __kmp_str_buf_print(buffer, "='threads(%d)'\n", place_count);
+    } else {
+      __kmp_str_buf_print(buffer, "='threads'\n");
+    }
+    break;
+  case affinity_gran_core:
+    if (place_count > 0) {
+      __kmp_str_buf_print(buffer, "='cores(%d)' \n", place_count);
+    } else {
+      __kmp_str_buf_print(buffer, "='cores'\n");
+    }
+    break;
+#if KMP_USE_HWLOC
+  case affinity_gran_tile:
+    if (place_count > 0) {
+      __kmp_str_buf_print(buffer, "='tiles(%d)' \n", place_count);
+    } else {
+      __kmp_str_buf_print(buffer, "='tiles'\n");
+    }
+    break;
+#endif
+  case affinity_gran_package:
+    if (place_count > 0) {
+      __kmp_str_buf_print(buffer, "='sockets(%d)'\n", place_count);
+    } else {
+      __kmp_str_buf_print(buffer, "='sockets'\n");
+    }
+    break;
+  default:
+    __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined));
+    break;
+  }
+}
+
+// Sets __kmp_affinity_top_method from `value`. Each candidate spelling is
+// tested with __kmp_str_match (the integer argument is passed through to that
+// helper); several spellings with space, '_' or '-' separators, or none, are
+// accepted for the x2APIC / APIC methods. Which methods are recognized
+// depends on KMP_USE_HWLOC, the x86 architecture macros and
+// KMP_GROUP_AFFINITY. An unrecognized value produces a StgInvalidValue
+// warning and leaves the method unchanged. `data` is unused.
+static void __kmp_stg_parse_topology_method(char const *name, char const *value,
+                                            void *data) {
+  if (__kmp_str_match("all", 1, value)) {
+    __kmp_affinity_top_method = affinity_top_method_all;
+  }
+#if KMP_USE_HWLOC
+  else if (__kmp_str_match("hwloc", 1, value)) {
+    __kmp_affinity_top_method = affinity_top_method_hwloc;
+  }
+#endif
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+  // x2APIC id / cpuid leaf 11 spellings.
+  else if (__kmp_str_match("x2apic id", 9, value) ||
+           __kmp_str_match("x2apic_id", 9, value) ||
+           __kmp_str_match("x2apic-id", 9, value) ||
+           __kmp_str_match("x2apicid", 8, value) ||
+           __kmp_str_match("cpuid leaf 11", 13, value) ||
+           __kmp_str_match("cpuid_leaf_11", 13, value) ||
+           __kmp_str_match("cpuid-leaf-11", 13, value) ||
+           __kmp_str_match("cpuid leaf11", 12, value) ||
+           __kmp_str_match("cpuid_leaf11", 12, value) ||
+           __kmp_str_match("cpuid-leaf11", 12, value) ||
+           __kmp_str_match("cpuidleaf 11", 12, value) ||
+           __kmp_str_match("cpuidleaf_11", 12, value) ||
+           __kmp_str_match("cpuidleaf-11", 12, value) ||
+           __kmp_str_match("cpuidleaf11", 11, value) ||
+           __kmp_str_match("cpuid 11", 8, value) ||
+           __kmp_str_match("cpuid_11", 8, value) ||
+           __kmp_str_match("cpuid-11", 8, value) ||
+           __kmp_str_match("cpuid11", 7, value) ||
+           __kmp_str_match("leaf 11", 7, value) ||
+           __kmp_str_match("leaf_11", 7, value) ||
+           __kmp_str_match("leaf-11", 7, value) ||
+           __kmp_str_match("leaf11", 6, value)) {
+    __kmp_affinity_top_method = affinity_top_method_x2apicid;
+  // APIC id / cpuid leaf 4 spellings.
+  } else if (__kmp_str_match("apic id", 7, value) ||
+             __kmp_str_match("apic_id", 7, value) ||
+             __kmp_str_match("apic-id", 7, value) ||
+             __kmp_str_match("apicid", 6, value) ||
+             __kmp_str_match("cpuid leaf 4", 12, value) ||
+             __kmp_str_match("cpuid_leaf_4", 12, value) ||
+             __kmp_str_match("cpuid-leaf-4", 12, value) ||
+             __kmp_str_match("cpuid leaf4", 11, value) ||
+             __kmp_str_match("cpuid_leaf4", 11, value) ||
+             __kmp_str_match("cpuid-leaf4", 11, value) ||
+             __kmp_str_match("cpuidleaf 4", 11, value) ||
+             __kmp_str_match("cpuidleaf_4", 11, value) ||
+             __kmp_str_match("cpuidleaf-4", 11, value) ||
+             __kmp_str_match("cpuidleaf4", 10, value) ||
+             __kmp_str_match("cpuid 4", 7, value) ||
+             __kmp_str_match("cpuid_4", 7, value) ||
+             __kmp_str_match("cpuid-4", 7, value) ||
+             __kmp_str_match("cpuid4", 6, value) ||
+             __kmp_str_match("leaf 4", 6, value) ||
+             __kmp_str_match("leaf_4", 6, value) ||
+             __kmp_str_match("leaf-4", 6, value) ||
+             __kmp_str_match("leaf4", 5, value)) {
+    __kmp_affinity_top_method = affinity_top_method_apicid;
+  }
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+  else if (__kmp_str_match("/proc/cpuinfo", 2, value) ||
+           __kmp_str_match("cpuinfo", 5, value)) {
+    __kmp_affinity_top_method = affinity_top_method_cpuinfo;
+  }
+#if KMP_GROUP_AFFINITY
+  else if (__kmp_str_match("group", 1, value)) {
+    __kmp_affinity_top_method = affinity_top_method_group;
+  }
+#endif /* KMP_GROUP_AFFINITY */
+  else if (__kmp_str_match("flat", 1, value)) {
+    __kmp_affinity_top_method = affinity_top_method_flat;
+  } else {
+    KMP_WARNING(StgInvalidValue, name, value);
+  }
+} // __kmp_stg_parse_topology_method
+
+// Appends the active topology detection method to `buffer` as a string
+// setting. Nothing is printed when __kmp_affinity_top_method holds a value
+// not handled in this build configuration. `data` is unused.
+static void __kmp_stg_print_topology_method(kmp_str_buf_t *buffer,
+                                            char const *name, void *data) {
+  // Display text for the current method; stays NULL if no case applies.
+  char const *label = NULL;
+
+  switch (__kmp_affinity_top_method) {
+  case affinity_top_method_default:
+    label = "default";
+    break;
+  case affinity_top_method_all:
+    label = "all";
+    break;
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+  case affinity_top_method_x2apicid:
+    label = "x2APIC id";
+    break;
+  case affinity_top_method_apicid:
+    label = "APIC id";
+    break;
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+#if KMP_USE_HWLOC
+  case affinity_top_method_hwloc:
+    label = "hwloc";
+    break;
+#endif
+  case affinity_top_method_cpuinfo:
+    label = "cpuinfo";
+    break;
+#if KMP_GROUP_AFFINITY
+  case affinity_top_method_group:
+    label = "group";
+    break;
+#endif /* KMP_GROUP_AFFINITY */
+  case affinity_top_method_flat:
+    label = "flat";
+    break;
+  }
+
+  if (label == NULL) {
+    return;
+  }
+  __kmp_stg_print_str(buffer, name, label);
+} // __kmp_stg_print_topology_method
+
+#endif /* KMP_AFFINITY_SUPPORTED */
+
+// OMP_PROC_BIND / bind-var is functional on all 4.0 builds, including OS X*
+// OMP_PLACES / place-partition-var is not.
+// Parses the proc-bind setting `name` from `value` into
+// __kmp_nested_proc_bind. Each element may be given as a keyword or as the
+// integer value of the corresponding kmp_proc_bind_t enumerator.
+//   - "disabled", "false" and "true" are single-valued: they set used = 1 and
+//     bind_types[0] (and, when affinity is supported, "disabled"/"false" also
+//     set __kmp_affinity_type).
+//   - Otherwise the value is a comma-separated list of master/close/spread;
+//     the bind_types array is grown as needed and filled element by element.
+//     Any unrecognized element emits StgInvalidValue and resets the setting
+//     to a single proc_bind_false.
+// Leftover characters after a successful parse produce ParseExtraCharsWarn.
+// `data` carries the rival settings list.
+static void __kmp_stg_parse_proc_bind(char const *name, char const *value,
+                                      void *data) {
+  kmp_setting_t **rivals = (kmp_setting_t **)data;
+  int rc;
+
+  rc = __kmp_stg_check_rivals(name, value, rivals);
+  if (rc) {
+    return;
+  }
+
+  // In OMP 4.0 OMP_PROC_BIND is a vector of proc_bind types.
+  KMP_DEBUG_ASSERT((__kmp_nested_proc_bind.bind_types != NULL) &&
+                   (__kmp_nested_proc_bind.used > 0));
+
+  const char *buf = value;
+  const char *next;
+  int num;
+  // Optional leading integer form of the first element; num stays -1 when the
+  // element is not numeric so it cannot equal any enumerator compared below.
+  SKIP_WS(buf);
+  if ((*buf >= '0') && (*buf <= '9')) {
+    next = buf;
+    SKIP_DIGITS(next);
+    num = __kmp_str_to_int(buf, *next);
+    KMP_ASSERT(num >= 0);
+    buf = next;
+    SKIP_WS(buf);
+  } else {
+    num = -1;
+  }
+
+  next = buf;
+  if (__kmp_match_str("disabled", buf, &next)) {
+    buf = next;
+    SKIP_WS(buf);
+#if KMP_AFFINITY_SUPPORTED
+    __kmp_affinity_type = affinity_disabled;
+#endif /* KMP_AFFINITY_SUPPORTED */
+    __kmp_nested_proc_bind.used = 1;
+    __kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
+  } else if ((num == (int)proc_bind_false) ||
+             __kmp_match_str("false", buf, &next)) {
+    buf = next;
+    SKIP_WS(buf);
+#if KMP_AFFINITY_SUPPORTED
+    __kmp_affinity_type = affinity_none;
+#endif /* KMP_AFFINITY_SUPPORTED */
+    __kmp_nested_proc_bind.used = 1;
+    __kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
+  } else if ((num == (int)proc_bind_true) ||
+             __kmp_match_str("true", buf, &next)) {
+    buf = next;
+    SKIP_WS(buf);
+    __kmp_nested_proc_bind.used = 1;
+    __kmp_nested_proc_bind.bind_types[0] = proc_bind_true;
+  } else {
+    // Count the number of values in the env var string
+    // (one more than the number of commas remaining from buf onward).
+    const char *scan;
+    int nelem = 1;
+    for (scan = buf; *scan != '\0'; scan++) {
+      if (*scan == ',') {
+        nelem++;
+      }
+    }
+
+    // Create / expand the nested proc_bind array as needed
+    if (__kmp_nested_proc_bind.size < nelem) {
+      __kmp_nested_proc_bind.bind_types =
+          (kmp_proc_bind_t *)KMP_INTERNAL_REALLOC(
+              __kmp_nested_proc_bind.bind_types,
+              sizeof(kmp_proc_bind_t) * nelem);
+      if (__kmp_nested_proc_bind.bind_types == NULL) {
+        KMP_FATAL(MemoryAllocFailed);
+      }
+      __kmp_nested_proc_bind.size = nelem;
+    }
+    __kmp_nested_proc_bind.used = nelem;
+
+    // A multi-level list raises the default max active levels to the limit,
+    // unless that default was already set explicitly.
+    if (nelem > 1 && !__kmp_dflt_max_active_levels_set)
+      __kmp_dflt_max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
+
+    // Save values in the nested proc_bind array
+    int i = 0;
+    for (;;) {
+      enum kmp_proc_bind_t bind;
+
+      if ((num == (int)proc_bind_master) ||
+          __kmp_match_str("master", buf, &next)) {
+        buf = next;
+        SKIP_WS(buf);
+        bind = proc_bind_master;
+      } else if ((num == (int)proc_bind_close) ||
+                 __kmp_match_str("close", buf, &next)) {
+        buf = next;
+        SKIP_WS(buf);
+        bind = proc_bind_close;
+      } else if ((num == (int)proc_bind_spread) ||
+                 __kmp_match_str("spread", buf, &next)) {
+        buf = next;
+        SKIP_WS(buf);
+        bind = proc_bind_spread;
+      } else {
+        // Unknown element: discard the whole list and fall back to false.
+        KMP_WARNING(StgInvalidValue, name, value);
+        __kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
+        __kmp_nested_proc_bind.used = 1;
+        return;
+      }
+
+      __kmp_nested_proc_bind.bind_types[i++] = bind;
+      if (i >= nelem) {
+        break;
+      }
+      // More elements are expected, so a separator should be here. This is
+      // only checked in debug builds; the character is skipped regardless.
+      KMP_DEBUG_ASSERT(*buf == ',');
+      buf++;
+      SKIP_WS(buf);
+
+      // Read next value if it was specified as an integer
+      if ((*buf >= '0') && (*buf <= '9')) {
+        next = buf;
+        SKIP_DIGITS(next);
+        num = __kmp_str_to_int(buf, *next);
+        KMP_ASSERT(num >= 0);
+        buf = next;
+        SKIP_WS(buf);
+      } else {
+        num = -1;
+      }
+    }
+    SKIP_WS(buf);
+  }
+  if (*buf != '\0') {
+    KMP_WARNING(ParseExtraCharsWarn, name, buf);
+  }
+}
+
+// Appends the proc-bind setting to `buffer`: the setting name followed by
+// either the localized "not defined" text (when no bind types are in use) or
+// a quoted, comma-separated list with one keyword per entry of
+// __kmp_nested_proc_bind.bind_types. `data` is unused.
+static void __kmp_stg_print_proc_bind(kmp_str_buf_t *buffer, char const *name,
+                                      void *data) {
+  int nelem = __kmp_nested_proc_bind.used;
+  if (__kmp_env_format) {
+    KMP_STR_BUF_PRINT_NAME;
+  } else {
+    __kmp_str_buf_print(buffer, "   %s", name);
+  }
+  if (nelem == 0) {
+    __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined));
+  } else {
+    int i;
+    // The format has no conversion specifier, so no extra argument is passed
+    // (the name has already been printed above).
+    __kmp_str_buf_print(buffer, "='");
+    for (i = 0; i < nelem; i++) {
+      switch (__kmp_nested_proc_bind.bind_types[i]) {
+      case proc_bind_false:
+        __kmp_str_buf_print(buffer, "false");
+        break;
+
+      case proc_bind_true:
+        __kmp_str_buf_print(buffer, "true");
+        break;
+
+      case proc_bind_master:
+        __kmp_str_buf_print(buffer, "master");
+        break;
+
+      case proc_bind_close:
+        __kmp_str_buf_print(buffer, "close");
+        break;
+
+      case proc_bind_spread:
+        __kmp_str_buf_print(buffer, "spread");
+        break;
+
+      case proc_bind_intel:
+        __kmp_str_buf_print(buffer, "intel");
+        break;
+
+      case proc_bind_default:
+        __kmp_str_buf_print(buffer, "default");
+        break;
+      }
+      // Separator between entries, but not after the last one.
+      if (i < nelem - 1) {
+        __kmp_str_buf_print(buffer, ",");
+      }
+    }
+    __kmp_str_buf_print(buffer, "'\n");
+  }
+}
+
+// Parses `value` as a boolean into __kmp_display_affinity by delegating to
+// __kmp_stg_parse_bool. `data` is unused.
+static void __kmp_stg_parse_display_affinity(char const *name,
+                                             char const *value, void *data) {
+  __kmp_stg_parse_bool(name, value, &__kmp_display_affinity);
+}
+// Prints __kmp_display_affinity as a boolean setting by delegating to
+// __kmp_stg_print_bool. `data` is unused.
+static void __kmp_stg_print_display_affinity(kmp_str_buf_t *buffer,
+                                             char const *name, void *data) {
+  __kmp_stg_print_bool(buffer, name, __kmp_display_affinity);
+}
+// Stores `value` in __kmp_affinity_format through __kmp_strncpy_truncate,
+// passing KMP_AFFINITY_FORMAT_SIZE as the destination size and the full
+// length of `value` as the source length. `name` and `data` are unused.
+static void __kmp_stg_parse_affinity_format(char const *name, char const *value,
+                                            void *data) {
+  __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE, value,
+                         KMP_STRLEN(value));
+}
+// Appends the affinity format string to `buffer`: the setting name first,
+// then the contents of __kmp_affinity_format followed by a closing quote and
+// newline. `data` is unused.
+static void __kmp_stg_print_affinity_format(kmp_str_buf_t *buffer,
+                                            char const *name, void *data) {
+  if (!__kmp_env_format) {
+    __kmp_str_buf_print(buffer, "   %s='", name);
+  } else {
+    KMP_STR_BUF_PRINT_NAME_EX(name);
+  }
+  __kmp_str_buf_print(buffer, "%s'\n", __kmp_affinity_format);
+}
+// OMP_ALLOCATOR sets default allocator
+// Sets __kmp_def_allocator from `value`, which may be a predefined allocator
+// given either by number (1..8) or by name. Only omp_default_mem_alloc and,
+// when __kmp_memkind_available is set, omp_high_bw_mem_alloc are honored;
+// every other predefined allocator emits an OmpNoAllocator warning and falls
+// back to omp_default_mem_alloc. A number outside 1..8 emits StgInvalidValue
+// and leaves the allocator unchanged. `data` is unused.
+static void __kmp_stg_parse_allocator(char const *name, char const *value,
+                                      void *data) {
+  /*
+    The value can be any predefined allocator:
+    omp_default_mem_alloc = 1;
+    omp_large_cap_mem_alloc = 2;
+    omp_const_mem_alloc = 3;
+    omp_high_bw_mem_alloc = 4;
+    omp_low_lat_mem_alloc = 5;
+    omp_cgroup_mem_alloc = 6;
+    omp_pteam_mem_alloc = 7;
+    omp_thread_mem_alloc = 8;
+    Acceptable value is either a digit or a string.
+  */
+  const char *buf = value;
+  const char *next;
+  int num;
+  SKIP_WS(buf);
+  // Numeric form. The upper bound is inclusive so that 8
+  // (omp_thread_mem_alloc) is accepted like the other documented values.
+  if ((*buf > '0') && (*buf <= '8')) {
+    next = buf;
+    SKIP_DIGITS(next);
+    num = __kmp_str_to_int(buf, *next);
+    KMP_ASSERT(num > 0);
+    switch (num) {
+    case 1:
+      __kmp_def_allocator = omp_default_mem_alloc;
+      break;
+    case 2:
+      __kmp_msg(kmp_ms_warning,
+                KMP_MSG(OmpNoAllocator, "omp_large_cap_mem_alloc"),
+                __kmp_msg_null);
+      __kmp_def_allocator = omp_default_mem_alloc;
+      break;
+    case 3:
+      __kmp_msg(kmp_ms_warning, KMP_MSG(OmpNoAllocator, "omp_const_mem_alloc"),
+                __kmp_msg_null);
+      __kmp_def_allocator = omp_default_mem_alloc;
+      break;
+    case 4:
+      if (__kmp_memkind_available) {
+        __kmp_def_allocator = omp_high_bw_mem_alloc;
+      } else {
+        __kmp_msg(kmp_ms_warning,
+                  KMP_MSG(OmpNoAllocator, "omp_high_bw_mem_alloc"),
+                  __kmp_msg_null);
+        __kmp_def_allocator = omp_default_mem_alloc;
+      }
+      break;
+    case 5:
+      __kmp_msg(kmp_ms_warning,
+                KMP_MSG(OmpNoAllocator, "omp_low_lat_mem_alloc"),
+                __kmp_msg_null);
+      __kmp_def_allocator = omp_default_mem_alloc;
+      break;
+    case 6:
+      __kmp_msg(kmp_ms_warning, KMP_MSG(OmpNoAllocator, "omp_cgroup_mem_alloc"),
+                __kmp_msg_null);
+      __kmp_def_allocator = omp_default_mem_alloc;
+      break;
+    case 7:
+      __kmp_msg(kmp_ms_warning, KMP_MSG(OmpNoAllocator, "omp_pteam_mem_alloc"),
+                __kmp_msg_null);
+      __kmp_def_allocator = omp_default_mem_alloc;
+      break;
+    case 8:
+      __kmp_msg(kmp_ms_warning, KMP_MSG(OmpNoAllocator, "omp_thread_mem_alloc"),
+                __kmp_msg_null);
+      __kmp_def_allocator = omp_default_mem_alloc;
+      break;
+    default:
+      // Multi-digit or otherwise out-of-range number: report it instead of
+      // ignoring it silently; the current allocator is left unchanged.
+      KMP_WARNING(StgInvalidValue, name, value);
+      break;
+    }
+    return;
+  }
+  // Name form. When nothing matches, next stays at buf so the whole remaining
+  // text is reported by the extra-characters warning below.
+  next = buf;
+  if (__kmp_match_str("omp_high_bw_mem_alloc", buf, &next)) {
+    if (__kmp_memkind_available) {
+      __kmp_def_allocator = omp_high_bw_mem_alloc;
+    } else {
+      __kmp_msg(kmp_ms_warning,
+                KMP_MSG(OmpNoAllocator, "omp_high_bw_mem_alloc"),
+                __kmp_msg_null);
+      __kmp_def_allocator = omp_default_mem_alloc;
+    }
+  } else if (__kmp_match_str("omp_default_mem_alloc", buf, &next)) {
+    __kmp_def_allocator = omp_default_mem_alloc;
+  } else if (__kmp_match_str("omp_large_cap_mem_alloc", buf, &next)) {
+    __kmp_msg(kmp_ms_warning,
+              KMP_MSG(OmpNoAllocator, "omp_large_cap_mem_alloc"),
+              __kmp_msg_null);
+    __kmp_def_allocator = omp_default_mem_alloc;
+  } else if (__kmp_match_str("omp_const_mem_alloc", buf, &next)) {
+    __kmp_msg(kmp_ms_warning, KMP_MSG(OmpNoAllocator, "omp_const_mem_alloc"),
+              __kmp_msg_null);
+    __kmp_def_allocator = omp_default_mem_alloc;
+  } else if (__kmp_match_str("omp_low_lat_mem_alloc", buf, &next)) {
+    __kmp_msg(kmp_ms_warning, KMP_MSG(OmpNoAllocator, "omp_low_lat_mem_alloc"),
+              __kmp_msg_null);
+    __kmp_def_allocator = omp_default_mem_alloc;
+  } else if (__kmp_match_str("omp_cgroup_mem_alloc", buf, &next)) {
+    __kmp_msg(kmp_ms_warning, KMP_MSG(OmpNoAllocator, "omp_cgroup_mem_alloc"),
+              __kmp_msg_null);
+    __kmp_def_allocator = omp_default_mem_alloc;
+  } else if (__kmp_match_str("omp_pteam_mem_alloc", buf, &next)) {
+    __kmp_msg(kmp_ms_warning, KMP_MSG(OmpNoAllocator, "omp_pteam_mem_alloc"),
+              __kmp_msg_null);
+    __kmp_def_allocator = omp_default_mem_alloc;
+  } else if (__kmp_match_str("omp_thread_mem_alloc", buf, &next)) {
+    __kmp_msg(kmp_ms_warning, KMP_MSG(OmpNoAllocator, "omp_thread_mem_alloc"),
+              __kmp_msg_null);
+    __kmp_def_allocator = omp_default_mem_alloc;
+  }
+  buf = next;
+  SKIP_WS(buf);
+  if (*buf != '\0') {
+    KMP_WARNING(ParseExtraCharsWarn, name, buf);
+  }
+}
+
+// Appends the name of the current default allocator to `buffer` as a string
+// setting. Nothing is printed if __kmp_def_allocator matches none of the
+// predefined allocators. `data` is unused.
+static void __kmp_stg_print_allocator(kmp_str_buf_t *buffer, char const *name,
+                                      void *data) {
+  // Resolve the allocator to its display name first, then print once.
+  char const *label = NULL;
+
+  if (__kmp_def_allocator == omp_default_mem_alloc) {
+    label = "omp_default_mem_alloc";
+  } else if (__kmp_def_allocator == omp_high_bw_mem_alloc) {
+    label = "omp_high_bw_mem_alloc";
+  } else if (__kmp_def_allocator == omp_large_cap_mem_alloc) {
+    label = "omp_large_cap_mem_alloc";
+  } else if (__kmp_def_allocator == omp_const_mem_alloc) {
+    label = "omp_const_mem_alloc";
+  } else if (__kmp_def_allocator == omp_low_lat_mem_alloc) {
+    label = "omp_low_lat_mem_alloc";
+  } else if (__kmp_def_allocator == omp_cgroup_mem_alloc) {
+    label = "omp_cgroup_mem_alloc";
+  } else if (__kmp_def_allocator == omp_pteam_mem_alloc) {
+    label = "omp_pteam_mem_alloc";
+  } else if (__kmp_def_allocator == omp_thread_mem_alloc) {
+    label = "omp_thread_mem_alloc";
+  }
+
+  if (label != NULL) {
+    __kmp_stg_print_str(buffer, name, label);
+  }
+}
+
+// -----------------------------------------------------------------------------
+// OMP_DYNAMIC
+
+// Parses `value` as a boolean into __kmp_global.g.g_dynamic by delegating to
+// __kmp_stg_parse_bool. `data` is unused.
+static void __kmp_stg_parse_omp_dynamic(char const *name, char const *value,
+                                        void *data) {
+  __kmp_stg_parse_bool(name, value, &(__kmp_global.g.g_dynamic));
+} // __kmp_stg_parse_omp_dynamic
+
+// Prints __kmp_global.g.g_dynamic as a boolean setting by delegating to
+// __kmp_stg_print_bool. `data` is unused.
+static void __kmp_stg_print_omp_dynamic(kmp_str_buf_t *buffer, char const *name,
+                                        void *data) {
+  __kmp_stg_print_bool(buffer, name, __kmp_global.g.g_dynamic);
+} // __kmp_stg_print_omp_dynamic
+
+// Sets __kmp_global.g.g_dynamic_mode from `value`. Recognizes the
+// thread-limit and random modes, plus load balance when USE_LOAD_BALANCE is
+// defined. Any other value produces a StgInvalidValue warning and leaves the
+// mode unchanged. `data` is unused.
+static void __kmp_stg_parse_kmp_dynamic_mode(char const *name,
+                                             char const *value, void *data) {
+  // Once __kmp_init_parallel is set the value is rejected with a warning.
+  if (TCR_4(__kmp_init_parallel)) {
+    KMP_WARNING(EnvParallelWarn, name);
+    __kmp_env_toPrint(name, 0);
+    return;
+  }
+
+#ifdef USE_LOAD_BALANCE
+  if (__kmp_str_match("load balance", 2, value) ||
+      __kmp_str_match("load_balance", 2, value) ||
+      __kmp_str_match("load-balance", 2, value) ||
+      __kmp_str_match("loadbalance", 2, value) ||
+      __kmp_str_match("balance", 1, value)) {
+    __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
+    return;
+  }
+#endif /* USE_LOAD_BALANCE */
+
+  if (__kmp_str_match("thread limit", 1, value) ||
+      __kmp_str_match("thread_limit", 1, value) ||
+      __kmp_str_match("thread-limit", 1, value) ||
+      __kmp_str_match("threadlimit", 1, value) ||
+      __kmp_str_match("limit", 2, value)) {
+    __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
+    return;
+  }
+
+  if (__kmp_str_match("random", 1, value)) {
+    __kmp_global.g.g_dynamic_mode = dynamic_random;
+    return;
+  }
+
+  KMP_WARNING(StgInvalidValue, name, value);
+} //__kmp_stg_parse_kmp_dynamic_mode
+
+// Print the dynamic-adjustment mode held in __kmp_global.g.g_dynamic_mode.
+// Output is produced only when KMP_DEBUG is enabled; an unexpected mode value
+// trips KMP_ASSERT.
+static void __kmp_stg_print_kmp_dynamic_mode(kmp_str_buf_t *buffer,
+                                             char const *name, void *data) {
+#if KMP_DEBUG
+  switch (__kmp_global.g.g_dynamic_mode) {
+  case dynamic_default:
+    // No explicit mode: report the setting as "not defined".
+    __kmp_str_buf_print(buffer, "   %s: %s \n", name, KMP_I18N_STR(NotDefined));
+    break;
+#ifdef USE_LOAD_BALANCE
+  case dynamic_load_balance:
+    __kmp_stg_print_str(buffer, name, "load balance");
+    break;
+#endif /* USE_LOAD_BALANCE */
+  case dynamic_thread_limit:
+    __kmp_stg_print_str(buffer, name, "thread limit");
+    break;
+  case dynamic_random:
+    __kmp_stg_print_str(buffer, name, "random");
+    break;
+  default:
+    KMP_ASSERT(0);
+    break;
+  }
+#endif /* KMP_DEBUG */
+} // __kmp_stg_print_kmp_dynamic_mode
+
+#ifdef USE_LOAD_BALANCE
+
+// -----------------------------------------------------------------------------
+// KMP_LOAD_BALANCE_INTERVAL
+
+// Parse KMP_LOAD_BALANCE_INTERVAL: convert `value` to a double and store it in
+// __kmp_load_balance_interval when it is >= 0; otherwise warn and keep the
+// current interval.
+static void __kmp_stg_parse_ld_balance_interval(char const *name,
+                                                char const *value, void *data) {
+  const double parsed = __kmp_convert_to_double(value);
+  // Written as !(x >= 0) rather than (x < 0) so that a NaN is rejected too,
+  // exactly as a failed ">= 0" test would reject it.
+  if (!(parsed >= 0)) {
+    KMP_WARNING(StgInvalidValue, name, value);
+    return;
+  }
+  __kmp_load_balance_interval = parsed;
+} // __kmp_stg_parse_ld_balance_interval
+
+// Print KMP_LOAD_BALANCE_INTERVAL: writes __kmp_load_balance_interval with six
+// decimal places as "<name>=<value>". Output is produced only when KMP_DEBUG is
+// enabled; otherwise the function does nothing.
+static void __kmp_stg_print_ld_balance_interval(kmp_str_buf_t *buffer,
+                                                char const *name, void *data) {
+#if KMP_DEBUG
+  __kmp_str_buf_print(buffer, "   %s=%8.6f\n", name,
+                      __kmp_load_balance_interval);
+#endif /* KMP_DEBUG */
+} // __kmp_stg_print_ld_balance_interval
+
+#endif /* USE_LOAD_BALANCE */
+
+// -----------------------------------------------------------------------------
+// KMP_INIT_AT_FORK
+
+// Parse KMP_INIT_AT_FORK: the boolean parsed from `value` is stored in
+// __kmp_need_register_atfork. `data` is unused.
+static void __kmp_stg_parse_init_at_fork(char const *name, char const *value,
+                                         void *data) {
+  __kmp_stg_parse_bool(name, value, &__kmp_need_register_atfork);
+  // The "specified" flag is only ever raised here (when the setting is true);
+  // this function never clears it.
+  if (__kmp_need_register_atfork) {
+    __kmp_need_register_atfork_specified = TRUE;
+  }
+} // __kmp_stg_parse_init_at_fork
+
+// Print KMP_INIT_AT_FORK as a boolean. Note that the value shown is
+// __kmp_need_register_atfork_specified, not __kmp_need_register_atfork.
+static void __kmp_stg_print_init_at_fork(kmp_str_buf_t *buffer,
+                                         char const *name, void *data) {
+  __kmp_stg_print_bool(buffer, name, __kmp_need_register_atfork_specified);
+} // __kmp_stg_print_init_at_fork
+
+// -----------------------------------------------------------------------------
+// KMP_SCHEDULE
+
+// Parse KMP_SCHEDULE: a ';'-separated list of "<kind>,<variant>" clauses.
+//
+// Recognized clauses and their effect:
+//   static,greedy      -> __kmp_static = kmp_sch_static_greedy
+//   static,balanced    -> __kmp_static = kmp_sch_static_balanced
+//   guided,iterative   -> __kmp_guided = kmp_sch_guided_iterative_chunked
+//   guided,analytical  -> __kmp_guided = kmp_sch_guided_analytical_chunked
+//
+// Any other non-empty clause produces an InvalidClause warning, an empty clause
+// produces an EmptyClause warning, and a value longer than INT_MAX is rejected
+// with a LongValue warning. A NULL `value` is ignored. `data` is unused.
+static void __kmp_stg_parse_schedule(char const *name, char const *value,
+                                     void *data) {
+
+  if (value != NULL) {
+    size_t length = KMP_STRLEN(value);
+    if (length > INT_MAX) {
+      KMP_WARNING(LongValue, name);
+    } else {
+      const char *semicolon;
+      // Inspect the last character only when there is one: for an empty
+      // string, value[length - 1] would read outside the buffer.
+      if (length > 0 &&
+          (value[length - 1] == '"' || value[length - 1] == '\''))
+        KMP_WARNING(UnbalancedQuotes, name);
+      // One iteration per ';'-separated clause. `continue` jumps to the loop
+      // condition, which advances `value` past the current ';'.
+      do {
+        char sentinel;
+
+        semicolon = strchr(value, ';');
+        if (*value && semicolon != value) {
+          const char *comma = strchr(value, ',');
+
+          // With a comma, the kind ends at ',' and `comma` is moved to the
+          // variant that follows; without one, the kind ends at ';'.
+          if (comma) {
+            ++comma;
+            sentinel = ',';
+          } else
+            sentinel = ';';
+          if (!__kmp_strcasecmp_with_sentinel("static", value, sentinel)) {
+            if (!__kmp_strcasecmp_with_sentinel("greedy", comma, ';')) {
+              __kmp_static = kmp_sch_static_greedy;
+              continue;
+            } else if (!__kmp_strcasecmp_with_sentinel("balanced", comma,
+                                                       ';')) {
+              __kmp_static = kmp_sch_static_balanced;
+              continue;
+            }
+          } else if (!__kmp_strcasecmp_with_sentinel("guided", value,
+                                                     sentinel)) {
+            if (!__kmp_strcasecmp_with_sentinel("iterative", comma, ';')) {
+              __kmp_guided = kmp_sch_guided_iterative_chunked;
+              continue;
+            } else if (!__kmp_strcasecmp_with_sentinel("analytical", comma,
+                                                       ';')) {
+              /* analytical not allowed for too many threads */
+              __kmp_guided = kmp_sch_guided_analytical_chunked;
+              continue;
+            }
+          }
+          // Reached only when no known kind/variant pair matched.
+          KMP_WARNING(InvalidClause, name, value);
+        } else
+          KMP_WARNING(EmptyClause, name);
+      } while ((value = semicolon ? semicolon + 1 : NULL));
+    }
+  }
+
+} // __kmp_stg_parse_schedule
+
+// Print KMP_SCHEDULE as "<static clause>;<guided clause>" built from
+// __kmp_static and __kmp_guided. A clause is emitted only when the
+// corresponding variable holds one of the two values handled here; the closing
+// quote and newline are part of the guided clause.
+static void __kmp_stg_print_schedule(kmp_str_buf_t *buffer, char const *name,
+                                     void *data) {
+  if (!__kmp_env_format) {
+    __kmp_str_buf_print(buffer, "   %s='", name);
+  } else {
+    KMP_STR_BUF_PRINT_NAME_EX(name);
+  }
+
+  const char *static_clause = NULL;
+  if (__kmp_static == kmp_sch_static_greedy)
+    static_clause = "static,greedy";
+  else if (__kmp_static == kmp_sch_static_balanced)
+    static_clause = "static,balanced";
+  if (static_clause != NULL)
+    __kmp_str_buf_print(buffer, "%s", static_clause);
+
+  const char *guided_clause = NULL;
+  if (__kmp_guided == kmp_sch_guided_iterative_chunked)
+    guided_clause = "guided,iterative";
+  else if (__kmp_guided == kmp_sch_guided_analytical_chunked)
+    guided_clause = "guided,analytical";
+  if (guided_clause != NULL)
+    __kmp_str_buf_print(buffer, ";%s'\n", guided_clause);
+} // __kmp_stg_print_schedule
+
+// -----------------------------------------------------------------------------
+// OMP_SCHEDULE
+
+// Reset the OMP_SCHEDULE state: __kmp_chunk becomes 0 and __kmp_sched becomes
+// kmp_sch_default. When KMP_USE_HIER_SCHED is enabled,
+// __kmp_hier_scheds.deallocate() is called first.
+static inline void __kmp_omp_schedule_restore() {
+#if KMP_USE_HIER_SCHED
+  __kmp_hier_scheds.deallocate();
+#endif
+  __kmp_chunk = 0;
+  __kmp_sched = kmp_sch_default;
+}
+
+// Parse a single schedule description and record it.
+//
+// if parse_hier = true:
+//    Parse [HW,][modifier:]kind[,chunk]
+// else:
+//    Parse [modifier:]kind[,chunk]
+//
+// name       - name of the setting; used only in diagnostics.
+// value      - text to parse.
+// parse_hier - allow a leading hardware layer (L1, L2, L3, NUMA); only has an
+//              effect when KMP_USE_HIER_SCHED is enabled.
+//
+// On success the schedule is stored in __kmp_sched / __kmp_chunk (or appended
+// to __kmp_hier_scheds when a hardware layer was given) and a pointer to the
+// first character after the parsed text is returned. NULL is returned when
+// `value` is empty, and also when it is invalid; in the invalid case a warning
+// is issued and __kmp_omp_schedule_restore() is called first.
+static const char *__kmp_parse_single_omp_schedule(const char *name,
+                                                   const char *value,
+                                                   bool parse_hier = false) {
+  /* get the specified scheduling style */
+  const char *ptr = value;
+  const char *delim;
+  int chunk = 0;
+  enum sched_type sched = kmp_sch_default;
+  if (*ptr == '\0')
+    return NULL;
+  // `ptr` marks the start of the current token, `delim` the delimiter
+  // (',', ':' or the terminating NUL) that ends it.
+  delim = ptr;
+  while (*delim != ',' && *delim != ':' && *delim != '\0')
+    delim++;
+#if KMP_USE_HIER_SCHED
+  kmp_hier_layer_e layer = kmp_hier_layer_e::LAYER_THREAD;
+  if (parse_hier) {
+    if (*delim == ',') {
+      if (!__kmp_strcasecmp_with_sentinel("L1", ptr, ',')) {
+        layer = kmp_hier_layer_e::LAYER_L1;
+      } else if (!__kmp_strcasecmp_with_sentinel("L2", ptr, ',')) {
+        layer = kmp_hier_layer_e::LAYER_L2;
+      } else if (!__kmp_strcasecmp_with_sentinel("L3", ptr, ',')) {
+        layer = kmp_hier_layer_e::LAYER_L3;
+      } else if (!__kmp_strcasecmp_with_sentinel("NUMA", ptr, ',')) {
+        layer = kmp_hier_layer_e::LAYER_NUMA;
+      }
+    }
+    if (layer != kmp_hier_layer_e::LAYER_THREAD && *delim != ',') {
+      // If there is no comma after the layer, then this schedule is invalid
+      KMP_WARNING(StgInvalidValue, name, value);
+      __kmp_omp_schedule_restore();
+      return NULL;
+    } else if (layer != kmp_hier_layer_e::LAYER_THREAD) {
+      // A layer was consumed: move on to the token that follows it.
+      ptr = ++delim;
+      while (*delim != ',' && *delim != ':' && *delim != '\0')
+        delim++;
+    }
+  }
+#endif // KMP_USE_HIER_SCHED
+  // Read in schedule modifier if specified
+  enum sched_type sched_modifier = (enum sched_type)0;
+  if (*delim == ':') {
+    if (!__kmp_strcasecmp_with_sentinel("monotonic", ptr, *delim)) {
+      sched_modifier = sched_type::kmp_sch_modifier_monotonic;
+      ptr = ++delim;
+      while (*delim != ',' && *delim != ':' && *delim != '\0')
+        delim++;
+    } else if (!__kmp_strcasecmp_with_sentinel("nonmonotonic", ptr, *delim)) {
+      sched_modifier = sched_type::kmp_sch_modifier_nonmonotonic;
+      ptr = ++delim;
+      while (*delim != ',' && *delim != ':' && *delim != '\0')
+        delim++;
+    } else if (!parse_hier) {
+      // If there is no proper schedule modifier, then this schedule is invalid
+      KMP_WARNING(StgInvalidValue, name, value);
+      __kmp_omp_schedule_restore();
+      return NULL;
+    }
+  }
+  // Read in schedule kind (required)
+  if (!__kmp_strcasecmp_with_sentinel("dynamic", ptr, *delim))
+    sched = kmp_sch_dynamic_chunked;
+  else if (!__kmp_strcasecmp_with_sentinel("guided", ptr, *delim))
+    sched = kmp_sch_guided_chunked;
+  // AC: TODO: probably remove TRAPEZOIDAL (OMP 3.0 does not allow it)
+  else if (!__kmp_strcasecmp_with_sentinel("auto", ptr, *delim))
+    sched = kmp_sch_auto;
+  else if (!__kmp_strcasecmp_with_sentinel("trapezoidal", ptr, *delim))
+    sched = kmp_sch_trapezoidal;
+  else if (!__kmp_strcasecmp_with_sentinel("static", ptr, *delim))
+    sched = kmp_sch_static;
+#if KMP_STATIC_STEAL_ENABLED
+  else if (!__kmp_strcasecmp_with_sentinel("static_steal", ptr, *delim))
+    sched = kmp_sch_static_steal;
+#endif
+  else {
+    // If there is no proper schedule kind, then this schedule is invalid
+    KMP_WARNING(StgInvalidValue, name, value);
+    __kmp_omp_schedule_restore();
+    return NULL;
+  }
+
+  // Read in schedule chunk size if specified
+  if (*delim == ',') {
+    ptr = delim + 1;
+    SKIP_WS(ptr);
+    // isdigit() requires a value representable as unsigned char; a plain char
+    // taken from the environment may be negative.
+    if (!isdigit((unsigned char)*ptr)) {
+      // If there is no chunk after comma, then this schedule is invalid
+      KMP_WARNING(StgInvalidValue, name, value);
+      __kmp_omp_schedule_restore();
+      return NULL;
+    }
+    SKIP_DIGITS(ptr);
+    // auto schedule should not specify chunk size
+    if (sched == kmp_sch_auto) {
+      __kmp_msg(kmp_ms_warning, KMP_MSG(IgnoreChunk, name, delim),
+                __kmp_msg_null);
+    } else {
+      if (sched == kmp_sch_static)
+        sched = kmp_sch_static_chunked;
+      chunk = __kmp_str_to_int(delim + 1, *ptr);
+      if (chunk < 1) {
+        chunk = KMP_DEFAULT_CHUNK;
+        __kmp_msg(kmp_ms_warning, KMP_MSG(InvalidChunk, name, delim),
+                  __kmp_msg_null);
+        // Report the value that will actually be used (the local `chunk`),
+        // not the not-yet-updated global __kmp_chunk.
+        KMP_INFORM(Using_int_Value, name, chunk);
+        // AC: next block commented out until KMP_DEFAULT_CHUNK != KMP_MIN_CHUNK
+        // (to improve code coverage :)
+        // The default chunk size is 1 according to standard, thus making
+        // KMP_MIN_CHUNK not 1 we would introduce mess:
+        // wrong chunk becomes 1, but it will be impossible to explicitly set
+        // to 1 because it becomes KMP_MIN_CHUNK...
+        // } else if ( chunk < KMP_MIN_CHUNK ) {
+        //   chunk = KMP_MIN_CHUNK;
+      } else if (chunk > KMP_MAX_CHUNK) {
+        chunk = KMP_MAX_CHUNK;
+        __kmp_msg(kmp_ms_warning, KMP_MSG(LargeChunk, name, delim),
+                  __kmp_msg_null);
+        KMP_INFORM(Using_int_Value, name, chunk);
+      }
+    }
+  } else {
+    // No chunk: the parsed text ends at the delimiter after the kind.
+    ptr = delim;
+  }
+
+  SCHEDULE_SET_MODIFIERS(sched, sched_modifier);
+
+#if KMP_USE_HIER_SCHED
+  if (layer != kmp_hier_layer_e::LAYER_THREAD) {
+    __kmp_hier_scheds.append(sched, chunk, layer);
+  } else
+#endif
+  {
+    __kmp_chunk = chunk;
+    __kmp_sched = sched;
+  }
+  return ptr;
+}
+
+// Parse OMP_SCHEDULE.
+//
+// A NULL `value` is ignored and an empty one produces an EmptyString warning.
+// Otherwise the text (after leading blanks) is handed to
+// __kmp_parse_single_omp_schedule(). When KMP_USE_HIER_SCHED is enabled and the
+// text starts with "EXPERIMENTAL", the remainder is parsed as a sequence of
+// hierarchical schedules separated by blanks or ':', and __kmp_hier_scheds is
+// sorted afterwards. `data` is unused.
+static void __kmp_stg_parse_omp_schedule(char const *name, char const *value,
+                                         void *data) {
+  size_t length;
+  if (value) {
+    // Skip leading blanks only after the NULL check: SKIP_WS dereferences its
+    // argument.
+    const char *ptr = value;
+    SKIP_WS(ptr);
+    length = KMP_STRLEN(value);
+    if (length) {
+      if (value[length - 1] == '"' || value[length - 1] == '\'')
+        KMP_WARNING(UnbalancedQuotes, name);
+      /* get the specified scheduling style */
+#if KMP_USE_HIER_SCHED
+      if (!__kmp_strcasecmp_with_sentinel("EXPERIMENTAL", ptr, ' ')) {
+        SKIP_TOKEN(ptr);
+        SKIP_WS(ptr);
+        // Keep parsing schedules until one fails or the text is exhausted.
+        while ((ptr = __kmp_parse_single_omp_schedule(name, ptr, true))) {
+          while (*ptr == ' ' || *ptr == '\t' || *ptr == ':')
+            ptr++;
+          if (*ptr == '\0')
+            break;
+        }
+      } else
+#endif
+        __kmp_parse_single_omp_schedule(name, ptr);
+    } else
+      KMP_WARNING(EmptyString, name);
+  }
+#if KMP_USE_HIER_SCHED
+  __kmp_hier_scheds.sort();
+#endif
+  K_DIAG(1, ("__kmp_static == %d\n", __kmp_static))
+  K_DIAG(1, ("__kmp_guided == %d\n", __kmp_guided))
+  K_DIAG(1, ("__kmp_sched == %d\n", __kmp_sched))
+  K_DIAG(1, ("__kmp_chunk == %d\n", __kmp_chunk))
+} // __kmp_stg_parse_omp_schedule
+
+// Print OMP_SCHEDULE as "[modifier:]kind[,chunk]" from __kmp_sched and
+// __kmp_chunk. The chunk is appended only when __kmp_chunk is non-zero. When
+// the schedule kind is none of the kinds handled below, only the name prefix
+// (and modifier, if any) is written.
+static void __kmp_stg_print_omp_schedule(kmp_str_buf_t *buffer,
+                                         char const *name, void *data) {
+  if (!__kmp_env_format) {
+    __kmp_str_buf_print(buffer, "   %s='", name);
+  } else {
+    KMP_STR_BUF_PRINT_NAME_EX(name);
+  }
+
+  enum sched_type sched = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
+  if (SCHEDULE_HAS_MONOTONIC(__kmp_sched)) {
+    __kmp_str_buf_print(buffer, "monotonic:");
+  } else if (SCHEDULE_HAS_NONMONOTONIC(__kmp_sched)) {
+    __kmp_str_buf_print(buffer, "nonmonotonic:");
+  }
+
+  // Map the schedule kind to its textual spelling once; the chunked and
+  // unchunked outputs differ only in the format string.
+  const char *kind_name = NULL;
+  switch (sched) {
+  case kmp_sch_dynamic_chunked:
+    kind_name = "dynamic";
+    break;
+  case kmp_sch_guided_iterative_chunked:
+  case kmp_sch_guided_analytical_chunked:
+    kind_name = "guided";
+    break;
+  case kmp_sch_trapezoidal:
+    kind_name = "trapezoidal";
+    break;
+  case kmp_sch_static:
+  case kmp_sch_static_chunked:
+  case kmp_sch_static_balanced:
+  case kmp_sch_static_greedy:
+    kind_name = "static";
+    break;
+  case kmp_sch_static_steal:
+    kind_name = "static_steal";
+    break;
+  case kmp_sch_auto:
+    kind_name = "auto";
+    break;
+  default:
+    // Any other kind: nothing more is printed.
+    break;
+  }
+
+  if (kind_name == NULL)
+    return;
+  if (__kmp_chunk) {
+    __kmp_str_buf_print(buffer, "%s,%d'\n", kind_name, __kmp_chunk);
+  } else {
+    __kmp_str_buf_print(buffer, "%s'\n", kind_name);
+  }
+} // __kmp_stg_print_omp_schedule
+
+#if KMP_USE_HIER_SCHED
+// -----------------------------------------------------------------------------
+// KMP_DISP_HAND_THREAD
+// Parse KMP_DISP_HAND_THREAD: the boolean parsed from `value` is stored in
+// __kmp_dispatch_hand_threading. `data` is unused.
+static void __kmp_stg_parse_kmp_hand_thread(char const *name, char const *value,
+                                            void *data) {
+  __kmp_stg_parse_bool(name, value, &(__kmp_dispatch_hand_threading));
+} // __kmp_stg_parse_kmp_hand_thread
+
+// Print KMP_DISP_HAND_THREAD: emits __kmp_dispatch_hand_threading as a boolean
+// setting named `name` into `buffer`. `data` is unused.
+static void __kmp_stg_print_kmp_hand_thread(kmp_str_buf_t *buffer,
+                                            char const *name, void *data) {
+  __kmp_stg_print_bool(buffer, name, __kmp_dispatch_hand_threading);
+} // __kmp_stg_print_kmp_hand_thread
+#endif
+
+// -----------------------------------------------------------------------------
+// KMP_ATOMIC_MODE
+
+// Parse KMP_ATOMIC_MODE.
+// Modes: 0 -- do not change default; 1 -- Intel perf mode, 2 -- GOMP
+// compatibility mode (accepted only when KMP_GOMP_COMPAT is defined).
+// __kmp_atomic_mode is updated only for a parsed mode greater than 0.
+static void __kmp_stg_parse_atomic_mode(char const *name, char const *value,
+                                        void *data) {
+#ifdef KMP_GOMP_COMPAT
+  const int highest_mode = 2;
+#else
+  const int highest_mode = 1;
+#endif /* KMP_GOMP_COMPAT */
+  int parsed_mode = 0;
+
+  __kmp_stg_parse_int(name, value, 0, highest_mode, &parsed_mode);
+  // TODO: parse_int is not very suitable for this case. In case of overflow it
+  // is better to use 0 rather than the max value.
+  if (parsed_mode <= 0)
+    return;
+  __kmp_atomic_mode = parsed_mode;
+} // __kmp_stg_parse_atomic_mode
+
+// Print KMP_ATOMIC_MODE: emits __kmp_atomic_mode as an integer setting named
+// `name` into `buffer`. `data` is unused.
+static void __kmp_stg_print_atomic_mode(kmp_str_buf_t *buffer, char const *name,
+                                        void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_atomic_mode);
+} // __kmp_stg_print_atomic_mode
+
+// -----------------------------------------------------------------------------
+// KMP_CONSISTENCY_CHECK
+
+// Parse KMP_CONSISTENCY_CHECK: "all" turns __kmp_env_consistency_check on,
+// "none" turns it off, and any other value produces a StgInvalidValue warning
+// without changing the setting.
+static void __kmp_stg_parse_consistency_check(char const *name,
+                                              char const *value, void *data) {
+  if (__kmp_strcasecmp_with_sentinel("all", value, 0) == 0) {
+    // Note, this will not work from kmp_set_defaults because the th_cons stack
+    // was not allocated for existing thread(s), thus the first
+    // __kmp_push_<construct> will break with an assertion.
+    // TODO: allocate th_cons if called from kmp_set_defaults.
+    __kmp_env_consistency_check = TRUE;
+    return;
+  }
+  if (__kmp_strcasecmp_with_sentinel("none", value, 0) == 0) {
+    __kmp_env_consistency_check = FALSE;
+    return;
+  }
+  KMP_WARNING(StgInvalidValue, name, value);
+} // __kmp_stg_parse_consistency_check
+
+// Print KMP_CONSISTENCY_CHECK as "all" or "none" depending on
+// __kmp_env_consistency_check. Output is produced only when KMP_DEBUG is
+// enabled.
+static void __kmp_stg_print_consistency_check(kmp_str_buf_t *buffer,
+                                              char const *name, void *data) {
+#if KMP_DEBUG
+  const char *shown = __kmp_env_consistency_check ? "all" : "none";
+  __kmp_stg_print_str(buffer, name, shown);
+#endif /* KMP_DEBUG */
+} // __kmp_stg_print_consistency_check
+
+#if USE_ITT_BUILD
+// -----------------------------------------------------------------------------
+// KMP_ITT_PREPARE_DELAY
+
+#if USE_ITT_NOTIFY
+
+// Parse KMP_ITT_PREPARE_DELAY: an integer in [0, INT_MAX] is parsed from
+// `value` and stored in __kmp_itt_prepare_delay. `delay` starts at 0 and is
+// assigned to the global unconditionally after the parse call. `data` is
+// unused.
+static void __kmp_stg_parse_itt_prepare_delay(char const *name,
+                                              char const *value, void *data) {
+  // Experimental code: KMP_ITT_PREPARE_DELAY specifies the number of loop
+  // iterations.
+  int delay = 0;
+  __kmp_stg_parse_int(name, value, 0, INT_MAX, &delay);
+  __kmp_itt_prepare_delay = delay;
+} // __kmp_stg_parse_itt_prepare_delay
+
+// Print KMP_ITT_PREPARE_DELAY: emits __kmp_itt_prepare_delay through the
+// 64-bit unsigned printer. `data` is unused.
+static void __kmp_stg_print_itt_prepare_delay(kmp_str_buf_t *buffer,
+                                              char const *name, void *data) {
+  __kmp_stg_print_uint64(buffer, name, __kmp_itt_prepare_delay);
+
+} // __kmp_stg_print_itt_prepare_delay
+
+#endif // USE_ITT_NOTIFY
+#endif /* USE_ITT_BUILD */
+
+// -----------------------------------------------------------------------------
+// KMP_MALLOC_POOL_INCR
+
+// Parse KMP_MALLOC_POOL_INCR: a size parsed from `value` is stored in
+// __kmp_malloc_pool_incr, with KMP_MIN_MALLOC_POOL_INCR and
+// KMP_MAX_MALLOC_POOL_INCR passed as the bounds. NULL is passed for the
+// parser's optional fifth argument and 1 for its last argument.
+// NOTE(review): the last argument is presumably the unit multiplier applied
+// when no suffix is given -- confirm against __kmp_stg_parse_size.
+static void __kmp_stg_parse_malloc_pool_incr(char const *name,
+                                             char const *value, void *data) {
+  __kmp_stg_parse_size(name, value, KMP_MIN_MALLOC_POOL_INCR,
+                       KMP_MAX_MALLOC_POOL_INCR, NULL, &__kmp_malloc_pool_incr,
+                       1);
+} // __kmp_stg_parse_malloc_pool_incr
+
+// Print KMP_MALLOC_POOL_INCR: emits __kmp_malloc_pool_incr as a size setting
+// named `name` into `buffer`. `data` is unused.
+static void __kmp_stg_print_malloc_pool_incr(kmp_str_buf_t *buffer,
+                                             char const *name, void *data) {
+  __kmp_stg_print_size(buffer, name, __kmp_malloc_pool_incr);
+
+} // __kmp_stg_print_malloc_pool_incr
+
+#ifdef KMP_DEBUG
+
+// -----------------------------------------------------------------------------
+// KMP_PAR_RANGE
+
+// Parse KMP_PAR_RANGE (debug builds only): forwards `value` to
+// __kmp_stg_parse_par_range, which receives the global destinations
+// __kmp_par_range, __kmp_par_range_routine, __kmp_par_range_filename,
+// __kmp_par_range_lb and __kmp_par_range_ub. `data` is unused.
+static void __kmp_stg_parse_par_range_env(char const *name, char const *value,
+                                          void *data) {
+  __kmp_stg_parse_par_range(name, value, &__kmp_par_range,
+                            __kmp_par_range_routine, __kmp_par_range_filename,
+                            &__kmp_par_range_lb, &__kmp_par_range_ub);
+} // __kmp_stg_parse_par_range_env
+
+// Print KMP_PAR_RANGE (debug builds only): writes the par_range_to_print
+// string, but only when __kmp_par_range is non-zero.
+static void __kmp_stg_print_par_range_env(kmp_str_buf_t *buffer,
+                                          char const *name, void *data) {
+  if (__kmp_par_range == 0)
+    return; // nothing to report
+  __kmp_stg_print_str(buffer, name, par_range_to_print);
+} // __kmp_stg_print_par_range_env
+
+#endif
+
+// -----------------------------------------------------------------------------
+// KMP_GTID_MODE
+
+// Parse KMP_GTID_MODE.
+// Modes:
+//   0 -- do not change default
+//   1 -- sp search
+//   2 -- use "keyed" TLS var, i.e.
+//        pthread_getspecific(Linux* OS/OS X*) or TlsGetValue(Windows* OS)
+//   3 -- __declspec(thread) TLS var in tdata section (accepted only when
+//        KMP_TDATA_GTID is defined)
+// Mode 0 enables __kmp_adjust_gtid_mode; any other mode is stored in
+// __kmp_gtid_mode and disables the adjustment.
+static void __kmp_stg_parse_gtid_mode(char const *name, char const *value,
+                                      void *data) {
+#ifdef KMP_TDATA_GTID
+  const int highest_mode = 3;
+#else
+  const int highest_mode = 2;
+#endif /* KMP_TDATA_GTID */
+  int parsed_mode = 0;
+
+  __kmp_stg_parse_int(name, value, 0, highest_mode, &parsed_mode);
+  // TODO: parse_int is not very suitable for this case. In case of overflow it
+  // is better to use 0 rather than the max value.
+  if (parsed_mode != 0) {
+    __kmp_gtid_mode = parsed_mode;
+    __kmp_adjust_gtid_mode = FALSE;
+    return;
+  }
+  __kmp_adjust_gtid_mode = TRUE;
+} // __kmp_stg_parse_gtid_mode
+
+// Print KMP_GTID_MODE: 0 while __kmp_adjust_gtid_mode is set, otherwise the
+// value of __kmp_gtid_mode.
+static void __kmp_stg_print_gtid_mode(kmp_str_buf_t *buffer, char const *name,
+                                      void *data) {
+  if (!__kmp_adjust_gtid_mode) {
+    __kmp_stg_print_int(buffer, name, __kmp_gtid_mode);
+    return;
+  }
+  __kmp_stg_print_int(buffer, name, 0);
+} // __kmp_stg_print_gtid_mode
+
+// -----------------------------------------------------------------------------
+// KMP_NUM_LOCKS_IN_BLOCK
+
+// Parse KMP_NUM_LOCKS_IN_BLOCK: an integer in [0, KMP_INT_MAX] is parsed from
+// `value` directly into __kmp_num_locks_in_block. `data` is unused.
+static void __kmp_stg_parse_lock_block(char const *name, char const *value,
+                                       void *data) {
+  __kmp_stg_parse_int(name, value, 0, KMP_INT_MAX, &__kmp_num_locks_in_block);
+} // __kmp_stg_parse_lock_block
+
+// Print KMP_NUM_LOCKS_IN_BLOCK: emits __kmp_num_locks_in_block as an integer
+// setting named `name` into `buffer`. `data` is unused.
+static void __kmp_stg_print_lock_block(kmp_str_buf_t *buffer, char const *name,
+                                       void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_num_locks_in_block);
+} // __kmp_stg_print_lock_block
+
+// -----------------------------------------------------------------------------
+// KMP_LOCK_KIND
+
+// KMP_STORE_LOCK_SEQ(a): with dynamic locks enabled, records lockseq_<a> in
+// __kmp_user_lock_seq; otherwise it expands to nothing.
+#if KMP_USE_DYNAMIC_LOCK
+#define KMP_STORE_LOCK_SEQ(a) (__kmp_user_lock_seq = lockseq_##a)
+#else
+#define KMP_STORE_LOCK_SEQ(a)
+#endif
+
+// Parse KMP_LOCK_KIND: select the user lock implementation.
+//
+// The chosen kind is stored in __kmp_user_lock_kind and, through
+// KMP_STORE_LOCK_SEQ, in the dynamic-lock sequence when that is compiled in.
+// Which kinds are recognized depends on KMP_USE_FUTEX, KMP_USE_ADAPTIVE_LOCKS,
+// KMP_USE_DYNAMIC_LOCK and KMP_USE_TSX. An unrecognized value produces a
+// StgInvalidValue warning and changes nothing. `data` is unused.
+static void __kmp_stg_parse_lock_kind(char const *name, char const *value,
+                                      void *data) {
+  // When __kmp_init_user_locks is already set the value is not applied: only a
+  // warning is issued.
+  if (__kmp_init_user_locks) {
+    KMP_WARNING(EnvLockWarn, name);
+    return;
+  }
+
+  // Test-and-set lock, accepted under several spellings.
+  if (__kmp_str_match("tas", 2, value) ||
+      __kmp_str_match("test and set", 2, value) ||
+      __kmp_str_match("test_and_set", 2, value) ||
+      __kmp_str_match("test-and-set", 2, value) ||
+      __kmp_str_match("test andset", 2, value) ||
+      __kmp_str_match("test_andset", 2, value) ||
+      __kmp_str_match("test-andset", 2, value) ||
+      __kmp_str_match("testand set", 2, value) ||
+      __kmp_str_match("testand_set", 2, value) ||
+      __kmp_str_match("testand-set", 2, value) ||
+      __kmp_str_match("testandset", 2, value)) {
+    __kmp_user_lock_kind = lk_tas;
+    KMP_STORE_LOCK_SEQ(tas);
+  }
+#if KMP_USE_FUTEX
+  else if (__kmp_str_match("futex", 1, value)) {
+    // Futex locks are selected only if the capability check succeeds;
+    // otherwise the current lock kind is kept and a warning is issued.
+    if (__kmp_futex_determine_capable()) {
+      __kmp_user_lock_kind = lk_futex;
+      KMP_STORE_LOCK_SEQ(futex);
+    } else {
+      KMP_WARNING(FutexNotSupported, name, value);
+    }
+  }
+#endif
+  else if (__kmp_str_match("ticket", 2, value)) {
+    __kmp_user_lock_kind = lk_ticket;
+    KMP_STORE_LOCK_SEQ(ticket);
+  } else if (__kmp_str_match("queuing", 1, value) ||
+             __kmp_str_match("queue", 1, value)) {
+    __kmp_user_lock_kind = lk_queuing;
+    KMP_STORE_LOCK_SEQ(queuing);
+  } else if (__kmp_str_match("drdpa ticket", 1, value) ||
+             __kmp_str_match("drdpa_ticket", 1, value) ||
+             __kmp_str_match("drdpa-ticket", 1, value) ||
+             __kmp_str_match("drdpaticket", 1, value) ||
+             __kmp_str_match("drdpa", 1, value)) {
+    __kmp_user_lock_kind = lk_drdpa;
+    KMP_STORE_LOCK_SEQ(drdpa);
+  }
+#if KMP_USE_ADAPTIVE_LOCKS
+  else if (__kmp_str_match("adaptive", 1, value)) {
+    // Adaptive locks need __kmp_cpuinfo.rtm; without it, warn and fall back to
+    // queuing locks.
+    if (__kmp_cpuinfo.rtm) { // ??? Is cpuinfo available here?
+      __kmp_user_lock_kind = lk_adaptive;
+      KMP_STORE_LOCK_SEQ(adaptive);
+    } else {
+      KMP_WARNING(AdaptiveNotSupported, name, value);
+      __kmp_user_lock_kind = lk_queuing;
+      KMP_STORE_LOCK_SEQ(queuing);
+    }
+  }
+#endif // KMP_USE_ADAPTIVE_LOCKS
+#if KMP_USE_DYNAMIC_LOCK && KMP_USE_TSX
+  else if (__kmp_str_match("rtm", 1, value)) {
+    // Same fallback as for adaptive locks.
+    // NOTE(review): the warning reuses the AdaptiveNotSupported message for
+    // "rtm" -- confirm this is intended.
+    if (__kmp_cpuinfo.rtm) {
+      __kmp_user_lock_kind = lk_rtm;
+      KMP_STORE_LOCK_SEQ(rtm);
+    } else {
+      KMP_WARNING(AdaptiveNotSupported, name, value);
+      __kmp_user_lock_kind = lk_queuing;
+      KMP_STORE_LOCK_SEQ(queuing);
+    }
+  } else if (__kmp_str_match("hle", 1, value)) {
+    // No capability check is performed for "hle" here.
+    __kmp_user_lock_kind = lk_hle;
+    KMP_STORE_LOCK_SEQ(hle);
+  }
+#endif
+  else {
+    KMP_WARNING(StgInvalidValue, name, value);
+  }
+}
+
+// Print KMP_LOCK_KIND: writes the textual name of __kmp_user_lock_kind.
+// Nothing is printed for a kind that has no case below.
+static void __kmp_stg_print_lock_kind(kmp_str_buf_t *buffer, char const *name,
+                                      void *data) {
+  const char *kind_name = NULL;
+
+  switch (__kmp_user_lock_kind) {
+  case lk_default:
+    kind_name = "default";
+    break;
+  case lk_tas:
+    kind_name = "tas";
+    break;
+  case lk_ticket:
+    kind_name = "ticket";
+    break;
+  case lk_queuing:
+    kind_name = "queuing";
+    break;
+  case lk_drdpa:
+    kind_name = "drdpa";
+    break;
+#if KMP_USE_FUTEX
+  case lk_futex:
+    kind_name = "futex";
+    break;
+#endif
+#if KMP_USE_ADAPTIVE_LOCKS
+  case lk_adaptive:
+    kind_name = "adaptive";
+    break;
+#endif
+#if KMP_USE_DYNAMIC_LOCK && KMP_USE_TSX
+  case lk_rtm:
+    kind_name = "rtm";
+    break;
+  case lk_hle:
+    kind_name = "hle";
+    break;
+#endif
+  }
+
+  if (kind_name == NULL)
+    return;
+  __kmp_stg_print_str(buffer, name, kind_name);
+} // __kmp_stg_print_lock_kind
+
+// -----------------------------------------------------------------------------
+// KMP_SPIN_BACKOFF_PARAMS
+
+// KMP_SPIN_BACKOFF_PARAMS=max_backoff[,min_tick] (max backoff size, min tick
+// for machine pause)
+//
+// Parses up to two comma-separated decimal numbers from `value`. The first is
+// stored in __kmp_spin_backoff_params.max_backoff and the second in
+// __kmp_spin_backoff_params.min_tick. A position left empty (leading comma or
+// two consecutive commas) keeps the current setting, because the working copies
+// start from the current parameters. On a syntax error a warning is issued and
+// the parameters are left untouched. `data` is unused.
+static void __kmp_stg_parse_spin_backoff_params(const char *name,
+                                                const char *value, void *data) {
+  const char *next = value;
+
+  int total = 0; // Count elements that were set. It'll be used as an array size
+  int prev_comma = FALSE; // For correct processing sequential commas
+  int i;
+
+  // Working copies; written back only after the whole value parsed cleanly.
+  kmp_uint32 max_backoff = __kmp_spin_backoff_params.max_backoff;
+  kmp_uint32 min_tick = __kmp_spin_backoff_params.min_tick;
+
+  // Run only 3 iterations because it is enough to read two values or find a
+  // syntax error
+  for (i = 0; i < 3; i++) {
+    SKIP_WS(next);
+
+    if (*next == '\0') {
+      break;
+    }
+    // Next character is not an integer or not a comma OR number of values > 2
+    // => end of list
+    if (((*next < '0' || *next > '9') && *next != ',') || total > 2) {
+      KMP_WARNING(EnvSyntaxError, name, value);
+      return;
+    }
+    // The next character is ','
+    if (*next == ',') {
+      // ',' is the first character or directly follows another ',': the value
+      // at this position was omitted, so only count the position
+      if (total == 0 || prev_comma) {
+        total++;
+      }
+      prev_comma = TRUE;
+      next++; // skip ','
+      SKIP_WS(next);
+    }
+    // Next character is a digit
+    if (*next >= '0' && *next <= '9') {
+      int num;
+      const char *buf = next;
+      char const *msg = NULL;
+      prev_comma = FALSE;
+      SKIP_DIGITS(next);
+      total++;
+
+      // Reject two numbers separated only by blanks (e.g. "10 20").
+      const char *tmp = next;
+      SKIP_WS(tmp);
+      if ((*next == ' ' || *next == '\t') && (*tmp >= '0' && *tmp <= '9')) {
+        KMP_WARNING(EnvSpacesNotAllowed, name, value);
+        return;
+      }
+
+      num = __kmp_str_to_int(buf, *next);
+      if (num <= 0) { // The number of retries should be > 0
+        msg = KMP_I18N_STR(ValueTooSmall);
+        num = 1;
+      } else if (num > KMP_INT_MAX) {
+        // NOTE(review): `num` is an int, so this branch looks unreachable if
+        // KMP_INT_MAX is the largest int -- confirm.
+        msg = KMP_I18N_STR(ValueTooLarge);
+        num = KMP_INT_MAX;
+      }
+      if (msg != NULL) {
+        // Message is not empty. Print warning.
+        KMP_WARNING(ParseSizeIntWarn, name, value, msg);
+        KMP_INFORM(Using_int_Value, name, num);
+      }
+      // `total` is the 1-based position of the number just read.
+      if (total == 1) {
+        max_backoff = num;
+      } else if (total == 2) {
+        min_tick = num;
+      }
+    }
+  }
+  // NOTE(review): an empty or blank-only value reaches this point with
+  // total == 0, so the assertion fires wherever KMP_DEBUG_ASSERT is active
+  // before the warning below can be issued -- confirm this is intended.
+  KMP_DEBUG_ASSERT(total > 0);
+  if (total <= 0) {
+    KMP_WARNING(EnvSyntaxError, name, value);
+    return;
+  }
+  __kmp_spin_backoff_params.max_backoff = max_backoff;
+  __kmp_spin_backoff_params.min_tick = min_tick;
+}
+
+// Print KMP_SPIN_BACKOFF_PARAMS as "<max_backoff>,<min_tick>" taken from
+// __kmp_spin_backoff_params, enclosed in single quotes.
+static void __kmp_stg_print_spin_backoff_params(kmp_str_buf_t *buffer,
+                                                char const *name, void *data) {
+  // Name prefix: the form depends on __kmp_env_format.
+  if (!__kmp_env_format) {
+    __kmp_str_buf_print(buffer, "   %s='", name);
+  } else {
+    KMP_STR_BUF_PRINT_NAME_EX(name);
+  }
+  // Value pair followed by the closing quote.
+  __kmp_str_buf_print(buffer, "%d,%d'\n", __kmp_spin_backoff_params.max_backoff,
+                      __kmp_spin_backoff_params.min_tick);
+} // __kmp_stg_print_spin_backoff_params
+
+#if KMP_USE_ADAPTIVE_LOCKS
+
+// -----------------------------------------------------------------------------
+// KMP_ADAPTIVE_LOCK_PROPS, KMP_SPECULATIVE_STATSFILE
+
+// Parse out values for the tunable parameters from a string of the form
+// KMP_ADAPTIVE_LOCK_PROPS=max_soft_retries[,max_badness]
+//
+// The first number is stored in __kmp_adaptive_backoff_params.max_soft_retries
+// and the second in __kmp_adaptive_backoff_params.max_badness. Unlike the spin
+// backoff parser, the working copies start at 0, so a position left empty
+// (leading comma, consecutive commas, or a missing second value) is written
+// back as 0. On a syntax error a warning is issued and the parameters are left
+// untouched. `data` is unused.
+static void __kmp_stg_parse_adaptive_lock_props(const char *name,
+                                                const char *value, void *data) {
+  int max_retries = 0;
+  int max_badness = 0;
+
+  const char *next = value;
+
+  int total = 0; // Count elements that were set. It'll be used as an array size
+  int prev_comma = FALSE; // For correct processing sequential commas
+  int i;
+
+  // Save values in the structure __kmp_speculative_backoff_params
+  // Run only 3 iterations because it is enough to read two values or find a
+  // syntax error
+  for (i = 0; i < 3; i++) {
+    SKIP_WS(next);
+
+    if (*next == '\0') {
+      break;
+    }
+    // Next character is not an integer or not a comma OR number of values > 2
+    // => end of list
+    if (((*next < '0' || *next > '9') && *next != ',') || total > 2) {
+      KMP_WARNING(EnvSyntaxError, name, value);
+      return;
+    }
+    // The next character is ','
+    if (*next == ',') {
+      // ',' is the first character or directly follows another ',': the value
+      // at this position was omitted, so only count the position
+      if (total == 0 || prev_comma) {
+        total++;
+      }
+      prev_comma = TRUE;
+      next++; // skip ','
+      SKIP_WS(next);
+    }
+    // Next character is a digit
+    if (*next >= '0' && *next <= '9') {
+      int num;
+      const char *buf = next;
+      char const *msg = NULL;
+      prev_comma = FALSE;
+      SKIP_DIGITS(next);
+      total++;
+
+      // Reject two numbers separated only by blanks (e.g. "10 20").
+      const char *tmp = next;
+      SKIP_WS(tmp);
+      if ((*next == ' ' || *next == '\t') && (*tmp >= '0' && *tmp <= '9')) {
+        KMP_WARNING(EnvSpacesNotAllowed, name, value);
+        return;
+      }
+
+      num = __kmp_str_to_int(buf, *next);
+      // NOTE(review): a negative result is replaced by 1 although 0 is
+      // accepted as a valid value -- confirm the intended replacement.
+      if (num < 0) { // The number of retries should be >= 0
+        msg = KMP_I18N_STR(ValueTooSmall);
+        num = 1;
+      } else if (num > KMP_INT_MAX) {
+        // NOTE(review): `num` is an int, so this branch looks unreachable if
+        // KMP_INT_MAX is the largest int -- confirm.
+        msg = KMP_I18N_STR(ValueTooLarge);
+        num = KMP_INT_MAX;
+      }
+      if (msg != NULL) {
+        // Message is not empty. Print warning.
+        KMP_WARNING(ParseSizeIntWarn, name, value, msg);
+        KMP_INFORM(Using_int_Value, name, num);
+      }
+      // `total` is the 1-based position of the number just read.
+      if (total == 1) {
+        max_retries = num;
+      } else if (total == 2) {
+        max_badness = num;
+      }
+    }
+  }
+  // NOTE(review): an empty or blank-only value reaches this point with
+  // total == 0, so the assertion fires wherever KMP_DEBUG_ASSERT is active
+  // before the warning below can be issued -- confirm this is intended.
+  KMP_DEBUG_ASSERT(total > 0);
+  if (total <= 0) {
+    KMP_WARNING(EnvSyntaxError, name, value);
+    return;
+  }
+  __kmp_adaptive_backoff_params.max_soft_retries = max_retries;
+  __kmp_adaptive_backoff_params.max_badness = max_badness;
+}
+
+// Print KMP_ADAPTIVE_LOCK_PROPS as "<max_soft_retries>,<max_badness>",
+// preceded by the setting name in the active output format.
+static void __kmp_stg_print_adaptive_lock_props(kmp_str_buf_t *buffer,
+                                                char const *name, void *data) {
+  // Name prefix first; the plain (non-env) format is handled inline.
+  if (!__kmp_env_format) {
+    __kmp_str_buf_print(buffer, "   %s='", name);
+  } else {
+    KMP_STR_BUF_PRINT_NAME_EX(name);
+  }
+  // Then both values and the closing quote.
+  __kmp_str_buf_print(buffer, "%d,%d'\n",
+                      __kmp_adaptive_backoff_params.max_soft_retries,
+                      __kmp_adaptive_backoff_params.max_badness);
+}
+
+#if KMP_DEBUG_ADAPTIVE_LOCKS
+
+// Parse KMP_SPECULATIVE_STATSFILE by handing the value to
+// __kmp_stg_parse_file with an empty suffix; the result is stored in
+// __kmp_speculative_statsfile.
+static void __kmp_stg_parse_speculative_statsfile(char const *name,
+                                                  char const *value,
+                                                  void *data) {
+  char **destination = CCAST(char **, &__kmp_speculative_statsfile);
+  __kmp_stg_parse_file(name, value, "", destination);
+}
+
+// Print KMP_SPECULATIVE_STATSFILE; a file name matching "-" is shown as
+// "stdout".
+static void __kmp_stg_print_speculative_statsfile(kmp_str_buf_t *buffer,
+                                                  char const *name,
+                                                  void *data) {
+  char const *shown = __kmp_speculative_statsfile;
+  if (__kmp_str_match("-", 0, __kmp_speculative_statsfile)) {
+    shown = "stdout";
+  }
+  __kmp_stg_print_str(buffer, name, shown);
+}
+
+#endif // KMP_DEBUG_ADAPTIVE_LOCKS
+
+#endif // KMP_USE_ADAPTIVE_LOCKS
+
+// -----------------------------------------------------------------------------
+// KMP_HW_SUBSET (was KMP_PLACE_THREADS)
+
+// The longest observable sequence of items is
+// Socket-Node-Tile-Core-Thread
+// So, let's limit to 5 levels for now
+// The input string is usually short enough, let's use 512 limit for now
+#define MAX_T_LEVEL 5
+#define MAX_STR_LEN 512
+// Parse KMP_HW_SUBSET (or its deprecated alias KMP_PLACE_THREADS).
+//
+// The value is a list of at most MAX_T_LEVEL components, each of the form
+// <num><type>[@<offset>]. Blanks are ignored, letters are case-insensitive,
+// 'x' may be used instead of ',' and 'o' followed by a digit instead of '@'.
+// A leading ':' sets __kmp_hws_abs_flag. Recognized types:
+//   S        socket          N        NUMA node
+//   L1/L2/L3 core/tile/socket (cache levels)
+//   C        core            CA..<d>  cache level d (1, 2 or 3)
+//   T        thread
+// Each level may be given only once. On any error a warning is issued and
+// __kmp_hws_requested is cleared; values already stored from earlier
+// components of the same string are not rolled back.
+static void __kmp_stg_parse_hw_subset(char const *name, char const *value,
+                                      void *data) {
+  // Value example: 1s,5c@3,2T
+  // Which means "use 1 socket, 5 cores with offset 3, 2 threads per core"
+  kmp_setting_t **rivals = (kmp_setting_t **)data;
+  if (strcmp(name, "KMP_PLACE_THREADS") == 0) {
+    KMP_INFORM(EnvVarDeprecated, name, "KMP_HW_SUBSET");
+  }
+  if (__kmp_stg_check_rivals(name, value, rivals)) {
+    return;
+  }
+
+  char *components[MAX_T_LEVEL];
+  char const *digits = "0123456789";
+  char input[MAX_STR_LEN];
+  size_t len = 0, mlen = MAX_STR_LEN;
+  int level = 0;
+  // Canonize the string (remove spaces, unify delimiters, etc.)
+  char *pos = CCAST(char *, value);
+  while (*pos && mlen) {
+    if (*pos != ' ') { // skip spaces
+      if (len == 0 && *pos == ':') {
+        __kmp_hws_abs_flag = 1; // if the first symbol is ":", skip it
+      } else {
+        // toupper() is only defined for values representable as unsigned char
+        // (or EOF); a plain char with the high bit set may be negative.
+        input[len] = toupper((unsigned char)*pos);
+        if (input[len] == 'X')
+          input[len] = ','; // unify delimiters of levels
+        // strchr() also matches the terminating NUL of its first argument, so
+        // rule out the end of the value before testing for a digit.
+        if (input[len] == 'O' && *(pos + 1) != '\0' &&
+            strchr(digits, *(pos + 1)))
+          input[len] = '@'; // unify delimiters of offset
+        len++;
+      }
+    }
+    mlen--;
+    pos++;
+  }
+  if (len == 0 || mlen == 0)
+    goto err; // contents is either empty or too long
+  input[len] = '\0';
+  __kmp_hws_requested = 1; // mark that subset requested
+  // Split by delimiter
+  pos = input;
+  components[level++] = pos;
+  while ((pos = strchr(pos, ','))) {
+    if (level >= MAX_T_LEVEL)
+      goto err; // too many components provided
+    *pos = '\0'; // modify input and avoid more copying
+    components[level++] = ++pos; // expect something after ","
+  }
+  // Check each component
+  for (int i = 0; i < level; ++i) {
+    int offset = 0;
+    int num = atoi(components[i]); // each component should start with a number
+    if ((pos = strchr(components[i], '@'))) {
+      offset = atoi(pos + 1); // save offset
+      *pos = '\0'; // cut the offset from the component
+    }
+    // pos now points at the first character after the leading digits
+    pos = components[i] + strspn(components[i], digits);
+    if (pos == components[i])
+      goto err; // no leading number
+    // detect the component type
+    switch (*pos) {
+    case 'S': // Socket
+      if (__kmp_hws_socket.num > 0)
+        goto err; // duplicate is not allowed
+      __kmp_hws_socket.num = num;
+      __kmp_hws_socket.offset = offset;
+      break;
+    case 'N': // NUMA Node
+      if (__kmp_hws_node.num > 0)
+        goto err; // duplicate is not allowed
+      __kmp_hws_node.num = num;
+      __kmp_hws_node.offset = offset;
+      break;
+    case 'L': // Cache
+      if (*(pos + 1) == '2') { // L2 - Tile
+        if (__kmp_hws_tile.num > 0)
+          goto err; // duplicate is not allowed
+        __kmp_hws_tile.num = num;
+        __kmp_hws_tile.offset = offset;
+      } else if (*(pos + 1) == '3') { // L3 - Socket
+        if (__kmp_hws_socket.num > 0)
+          goto err; // duplicate is not allowed
+        __kmp_hws_socket.num = num;
+        __kmp_hws_socket.offset = offset;
+      } else if (*(pos + 1) == '1') { // L1 - Core
+        if (__kmp_hws_core.num > 0)
+          goto err; // duplicate is not allowed
+        __kmp_hws_core.num = num;
+        __kmp_hws_core.offset = offset;
+      } else {
+        // Unknown cache level: reject it, as the "CA<digit>" spelling below
+        // does, instead of silently dropping the component.
+        goto err;
+      }
+      break;
+    case 'C': // Core (or Cache?)
+      if (*(pos + 1) != 'A') {
+        if (__kmp_hws_core.num > 0)
+          goto err; // duplicate is not allowed
+        __kmp_hws_core.num = num;
+        __kmp_hws_core.offset = offset;
+      } else { // Cache
+        char *d = pos + strcspn(pos, digits); // find digit
+        if (*d == '2') { // L2 - Tile
+          if (__kmp_hws_tile.num > 0)
+            goto err; // duplicate is not allowed
+          __kmp_hws_tile.num = num;
+          __kmp_hws_tile.offset = offset;
+        } else if (*d == '3') { // L3 - Socket
+          if (__kmp_hws_socket.num > 0)
+            goto err; // duplicate is not allowed
+          __kmp_hws_socket.num = num;
+          __kmp_hws_socket.offset = offset;
+        } else if (*d == '1') { // L1 - Core
+          if (__kmp_hws_core.num > 0)
+            goto err; // duplicate is not allowed
+          __kmp_hws_core.num = num;
+          __kmp_hws_core.offset = offset;
+        } else {
+          goto err;
+        }
+      }
+      break;
+    case 'T': // Thread
+      if (__kmp_hws_proc.num > 0)
+        goto err; // duplicate is not allowed
+      __kmp_hws_proc.num = num;
+      __kmp_hws_proc.offset = offset;
+      break;
+    default:
+      goto err;
+    }
+  }
+  return;
+err:
+  KMP_WARNING(AffHWSubsetInvalid, name, value);
+  __kmp_hws_requested = 0; // mark that subset not requested
+  return;
+}
+
+// Print the hardware subset in the same syntax the parser accepts, e.g.
+// '1s,5c@3,2t'. Nothing is printed unless a subset was requested. Levels with
+// a zero count are omitted, and an "@<offset>" suffix is added only for
+// non-zero offsets.
+static void __kmp_stg_print_hw_subset(kmp_str_buf_t *buffer, char const *name,
+                                      void *data) {
+  if (__kmp_hws_requested) {
+    int comma = 0; // set once a component was emitted, so the next needs ","
+    kmp_str_buf_t buf; // scratch buffer collecting the component list
+    __kmp_str_buf_init(&buf);
+    if (__kmp_env_format) {
+      KMP_STR_BUF_PRINT_NAME_EX(name);
+    } else {
+      __kmp_str_buf_print(buffer, "   %s='", name);
+    }
+    if (__kmp_hws_socket.num) {
+      __kmp_str_buf_print(&buf, "%ds", __kmp_hws_socket.num);
+      if (__kmp_hws_socket.offset)
+        __kmp_str_buf_print(&buf, "@%d", __kmp_hws_socket.offset);
+      comma = 1;
+    }
+    if (__kmp_hws_node.num) {
+      __kmp_str_buf_print(&buf, "%s%dn", comma ? "," : "", __kmp_hws_node.num);
+      if (__kmp_hws_node.offset)
+        __kmp_str_buf_print(&buf, "@%d", __kmp_hws_node.offset);
+      comma = 1;
+    }
+    if (__kmp_hws_tile.num) {
+      __kmp_str_buf_print(&buf, "%s%dL2", comma ? "," : "", __kmp_hws_tile.num);
+      if (__kmp_hws_tile.offset)
+        __kmp_str_buf_print(&buf, "@%d", __kmp_hws_tile.offset);
+      comma = 1;
+    }
+    if (__kmp_hws_core.num) {
+      __kmp_str_buf_print(&buf, "%s%dc", comma ? "," : "", __kmp_hws_core.num);
+      if (__kmp_hws_core.offset)
+        __kmp_str_buf_print(&buf, "@%d", __kmp_hws_core.offset);
+      comma = 1;
+    }
+    if (__kmp_hws_proc.num) {
+      __kmp_str_buf_print(&buf, "%s%dt", comma ? "," : "", __kmp_hws_proc.num);
+      // The parser stores a thread offset as well; print it like the offsets
+      // of the other levels so the displayed value matches what was parsed.
+      if (__kmp_hws_proc.offset)
+        __kmp_str_buf_print(&buf, "@%d", __kmp_hws_proc.offset);
+    }
+    __kmp_str_buf_print(buffer, "%s'\n", buf.str);
+    __kmp_str_buf_free(&buf);
+  }
+}
+
+#if USE_ITT_BUILD
+// -----------------------------------------------------------------------------
+// KMP_FORKJOIN_FRAMES
+
+// Parse KMP_FORKJOIN_FRAMES as a boolean into __kmp_forkjoin_frames.
+static void __kmp_stg_parse_forkjoin_frames(char const *name, char const *value,
+                                            void *data) {
+  __kmp_stg_parse_bool(name, value, &__kmp_forkjoin_frames);
+}
+
+// Print the current value of __kmp_forkjoin_frames as a boolean.
+static void __kmp_stg_print_forkjoin_frames(kmp_str_buf_t *buffer,
+                                            char const *name, void *data) {
+  __kmp_stg_print_bool(buffer, name, __kmp_forkjoin_frames);
+}
+
+// -----------------------------------------------------------------------------
+// KMP_FORKJOIN_FRAMES_MODE
+
+// Parse KMP_FORKJOIN_FRAMES_MODE as an integer into
+// __kmp_forkjoin_frames_mode. 0 and 3 are passed to __kmp_stg_parse_int,
+// presumably as the inclusive bounds of the accepted range.
+static void __kmp_stg_parse_forkjoin_frames_mode(char const *name,
+                                                 char const *value,
+                                                 void *data) {
+  __kmp_stg_parse_int(name, value, 0, 3, &__kmp_forkjoin_frames_mode);
+} // __kmp_stg_parse_forkjoin_frames_mode
+
+// Print the current value of __kmp_forkjoin_frames_mode as an integer.
+static void __kmp_stg_print_forkjoin_frames_mode(kmp_str_buf_t *buffer,
+                                                 char const *name, void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_forkjoin_frames_mode);
+} // __kmp_stg_print_forkjoin_frames_mode
+#endif /* USE_ITT_BUILD */
+
+// -----------------------------------------------------------------------------
+// KMP_ENABLE_TASK_THROTTLING
+
+// Parse KMP_ENABLE_TASK_THROTTLING as a boolean into
+// __kmp_enable_task_throttling.
+static void __kmp_stg_parse_task_throttling(char const *name,
+                                            char const *value, void *data) {
+  __kmp_stg_parse_bool(name, value, &__kmp_enable_task_throttling);
+}
+
+
+// Print the current value of __kmp_enable_task_throttling as a boolean.
+static void __kmp_stg_print_task_throttling(kmp_str_buf_t *buffer,
+                                            char const *name, void *data) {
+  __kmp_stg_print_bool(buffer, name, __kmp_enable_task_throttling);
+}
+
+// -----------------------------------------------------------------------------
+// OMP_DISPLAY_ENV
+
+// Parse OMP_DISPLAY_ENV. A value matching "VERBOSE" turns on
+// __kmp_display_env_verbose; anything else is parsed as a boolean into
+// __kmp_display_env.
+static void __kmp_stg_parse_omp_display_env(char const *name, char const *value,
+                                            void *data) {
+  if (!__kmp_str_match("VERBOSE", 1, value)) {
+    __kmp_stg_parse_bool(name, value, &__kmp_display_env);
+    return;
+  }
+  __kmp_display_env_verbose = TRUE;
+}
+
+// Print OMP_DISPLAY_ENV: "VERBOSE" when verbose display is on, otherwise the
+// boolean value of __kmp_display_env.
+static void __kmp_stg_print_omp_display_env(kmp_str_buf_t *buffer,
+                                            char const *name, void *data) {
+  if (!__kmp_display_env_verbose) {
+    __kmp_stg_print_bool(buffer, name, __kmp_display_env);
+    return;
+  }
+  __kmp_stg_print_str(buffer, name, "VERBOSE");
+}
+
+// Parse OMP_CANCELLATION as a boolean into __kmp_omp_cancellation. The value
+// is only read before the first parallel region; once __kmp_init_parallel is
+// set, a warning is issued and the setting is left alone.
+static void __kmp_stg_parse_omp_cancellation(char const *name,
+                                             char const *value, void *data) {
+  if (!TCR_4(__kmp_init_parallel)) {
+    __kmp_stg_parse_bool(name, value, &__kmp_omp_cancellation);
+    return;
+  }
+  KMP_WARNING(EnvParallelWarn, name);
+}
+
+// Print the current value of __kmp_omp_cancellation as a boolean.
+static void __kmp_stg_print_omp_cancellation(kmp_str_buf_t *buffer,
+                                             char const *name, void *data) {
+  __kmp_stg_print_bool(buffer, name, __kmp_omp_cancellation);
+}
+
+#if OMPT_SUPPORT
+// Value of OMP_TOOL; starts out as 1 and is printed as "enabled"/"disabled".
+static int __kmp_tool = 1;
+
+// Parse OMP_TOOL as a boolean into __kmp_tool.
+static void __kmp_stg_parse_omp_tool(char const *name, char const *value,
+                                     void *data) {
+  __kmp_stg_parse_bool(name, value, &__kmp_tool);
+}
+
+// Print OMP_TOOL as "enabled" or "disabled", depending on __kmp_tool.
+static void __kmp_stg_print_omp_tool(kmp_str_buf_t *buffer, char const *name,
+                                     void *data) {
+  if (!__kmp_env_format) {
+    // Plain format: name=value on a single line.
+    __kmp_str_buf_print(buffer, "   %s=%s\n", name,
+                        __kmp_tool ? "enabled" : "disabled");
+  } else {
+    KMP_STR_BUF_PRINT_BOOL_EX(name, __kmp_tool, "enabled", "disabled");
+  }
+}
+
+// Value of OMP_TOOL_LIBRARIES; NULL until the variable has been parsed.
+static char *__kmp_tool_libraries = NULL;
+
+// Parse OMP_TOOL_LIBRARIES as a string into __kmp_tool_libraries.
+static void __kmp_stg_parse_omp_tool_libraries(char const *name,
+                                               char const *value, void *data) {
+  __kmp_stg_parse_str(name, value, &__kmp_tool_libraries);
+}
+
+// Print OMP_TOOL_LIBRARIES. When no value is stored, print the setting name
+// followed by the localized "not defined" text instead.
+static void __kmp_stg_print_omp_tool_libraries(kmp_str_buf_t *buffer,
+                                               char const *name, void *data) {
+  if (__kmp_tool_libraries) {
+    __kmp_stg_print_str(buffer, name, __kmp_tool_libraries);
+    return;
+  }
+
+  // No value: emit the name in the active format, then the placeholder text.
+  if (!__kmp_env_format) {
+    __kmp_str_buf_print(buffer, "   %s", name);
+  } else {
+    KMP_STR_BUF_PRINT_NAME;
+  }
+  __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined));
+}
+
+#endif
+
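+// Table of all recognized settings. Each entry gives the environment variable
+// name, the routine that parses it, the routine that prints it (NULL when the
+// setting is never printed), and three further fields initialized here to
+// NULL, 0, 0. The final entry with the empty name closes the table.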
+
+static kmp_setting_t __kmp_stg_table[] = {
+
+    {"KMP_ALL_THREADS", __kmp_stg_parse_device_thread_limit, NULL, NULL, 0, 0},
+    {"KMP_BLOCKTIME", __kmp_stg_parse_blocktime, __kmp_stg_print_blocktime,
+     NULL, 0, 0},
+    {"KMP_USE_YIELD", __kmp_stg_parse_use_yield, __kmp_stg_print_use_yield,
+     NULL, 0, 0},
+    {"KMP_DUPLICATE_LIB_OK", __kmp_stg_parse_duplicate_lib_ok,
+     __kmp_stg_print_duplicate_lib_ok, NULL, 0, 0},
+    {"KMP_LIBRARY", __kmp_stg_parse_wait_policy, __kmp_stg_print_wait_policy,
+     NULL, 0, 0},
+    {"KMP_DEVICE_THREAD_LIMIT", __kmp_stg_parse_device_thread_limit,
+     __kmp_stg_print_device_thread_limit, NULL, 0, 0},
+#if KMP_USE_MONITOR
+    {"KMP_MONITOR_STACKSIZE", __kmp_stg_parse_monitor_stacksize,
+     __kmp_stg_print_monitor_stacksize, NULL, 0, 0},
+#endif
+    {"KMP_SETTINGS", __kmp_stg_parse_settings, __kmp_stg_print_settings, NULL,
+     0, 0},
+    {"KMP_STACKOFFSET", __kmp_stg_parse_stackoffset,
+     __kmp_stg_print_stackoffset, NULL, 0, 0},
+    {"KMP_STACKSIZE", __kmp_stg_parse_stacksize, __kmp_stg_print_stacksize,
+     NULL, 0, 0},
+    {"KMP_STACKPAD", __kmp_stg_parse_stackpad, __kmp_stg_print_stackpad, NULL,
+     0, 0},
+    {"KMP_VERSION", __kmp_stg_parse_version, __kmp_stg_print_version, NULL, 0,
+     0},
+    {"KMP_WARNINGS", __kmp_stg_parse_warnings, __kmp_stg_print_warnings, NULL,
+     0, 0},
+
+    {"OMP_NESTED", __kmp_stg_parse_nested, __kmp_stg_print_nested, NULL, 0, 0},
+    {"OMP_NUM_THREADS", __kmp_stg_parse_num_threads,
+     __kmp_stg_print_num_threads, NULL, 0, 0},
+    {"OMP_STACKSIZE", __kmp_stg_parse_stacksize, __kmp_stg_print_stacksize,
+     NULL, 0, 0},
+
+    {"KMP_TASKING", __kmp_stg_parse_tasking, __kmp_stg_print_tasking, NULL, 0,
+     0},
+    {"KMP_TASK_STEALING_CONSTRAINT", __kmp_stg_parse_task_stealing,
+     __kmp_stg_print_task_stealing, NULL, 0, 0},
+    {"OMP_MAX_ACTIVE_LEVELS", __kmp_stg_parse_max_active_levels,
+     __kmp_stg_print_max_active_levels, NULL, 0, 0},
+    {"OMP_DEFAULT_DEVICE", __kmp_stg_parse_default_device,
+     __kmp_stg_print_default_device, NULL, 0, 0},
+    {"OMP_TARGET_OFFLOAD", __kmp_stg_parse_target_offload,
+     __kmp_stg_print_target_offload, NULL, 0, 0},
+    {"OMP_MAX_TASK_PRIORITY", __kmp_stg_parse_max_task_priority,
+     __kmp_stg_print_max_task_priority, NULL, 0, 0},
+    {"KMP_TASKLOOP_MIN_TASKS", __kmp_stg_parse_taskloop_min_tasks,
+     __kmp_stg_print_taskloop_min_tasks, NULL, 0, 0},
+    {"OMP_THREAD_LIMIT", __kmp_stg_parse_thread_limit,
+     __kmp_stg_print_thread_limit, NULL, 0, 0},
+    {"KMP_TEAMS_THREAD_LIMIT", __kmp_stg_parse_teams_thread_limit,
+     __kmp_stg_print_teams_thread_limit, NULL, 0, 0},
+    {"OMP_WAIT_POLICY", __kmp_stg_parse_wait_policy,
+     __kmp_stg_print_wait_policy, NULL, 0, 0},
+    {"KMP_DISP_NUM_BUFFERS", __kmp_stg_parse_disp_buffers,
+     __kmp_stg_print_disp_buffers, NULL, 0, 0},
+#if KMP_NESTED_HOT_TEAMS
+    {"KMP_HOT_TEAMS_MAX_LEVEL", __kmp_stg_parse_hot_teams_level,
+     __kmp_stg_print_hot_teams_level, NULL, 0, 0},
+    {"KMP_HOT_TEAMS_MODE", __kmp_stg_parse_hot_teams_mode,
+     __kmp_stg_print_hot_teams_mode, NULL, 0, 0},
+#endif // KMP_NESTED_HOT_TEAMS
+
+#if KMP_HANDLE_SIGNALS
+    {"KMP_HANDLE_SIGNALS", __kmp_stg_parse_handle_signals,
+     __kmp_stg_print_handle_signals, NULL, 0, 0},
+#endif
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+    {"KMP_INHERIT_FP_CONTROL", __kmp_stg_parse_inherit_fp_control,
+     __kmp_stg_print_inherit_fp_control, NULL, 0, 0},
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+#ifdef KMP_GOMP_COMPAT
+    {"GOMP_STACKSIZE", __kmp_stg_parse_stacksize, NULL, NULL, 0, 0},
+#endif
+
+#ifdef KMP_DEBUG
+    {"KMP_A_DEBUG", __kmp_stg_parse_a_debug, __kmp_stg_print_a_debug, NULL, 0,
+     0},
+    {"KMP_B_DEBUG", __kmp_stg_parse_b_debug, __kmp_stg_print_b_debug, NULL, 0,
+     0},
+    {"KMP_C_DEBUG", __kmp_stg_parse_c_debug, __kmp_stg_print_c_debug, NULL, 0,
+     0},
+    {"KMP_D_DEBUG", __kmp_stg_parse_d_debug, __kmp_stg_print_d_debug, NULL, 0,
+     0},
+    {"KMP_E_DEBUG", __kmp_stg_parse_e_debug, __kmp_stg_print_e_debug, NULL, 0,
+     0},
+    {"KMP_F_DEBUG", __kmp_stg_parse_f_debug, __kmp_stg_print_f_debug, NULL, 0,
+     0},
+    {"KMP_DEBUG", __kmp_stg_parse_debug, NULL, /* no print */ NULL, 0, 0},
+    {"KMP_DEBUG_BUF", __kmp_stg_parse_debug_buf, __kmp_stg_print_debug_buf,
+     NULL, 0, 0},
+    {"KMP_DEBUG_BUF_ATOMIC", __kmp_stg_parse_debug_buf_atomic,
+     __kmp_stg_print_debug_buf_atomic, NULL, 0, 0},
+    {"KMP_DEBUG_BUF_CHARS", __kmp_stg_parse_debug_buf_chars,
+     __kmp_stg_print_debug_buf_chars, NULL, 0, 0},
+    {"KMP_DEBUG_BUF_LINES", __kmp_stg_parse_debug_buf_lines,
+     __kmp_stg_print_debug_buf_lines, NULL, 0, 0},
+    {"KMP_DIAG", __kmp_stg_parse_diag, __kmp_stg_print_diag, NULL, 0, 0},
+
+    {"KMP_PAR_RANGE", __kmp_stg_parse_par_range_env,
+     __kmp_stg_print_par_range_env, NULL, 0, 0},
+#endif // KMP_DEBUG
+
+    {"KMP_ALIGN_ALLOC", __kmp_stg_parse_align_alloc,
+     __kmp_stg_print_align_alloc, NULL, 0, 0},
+
+    {"KMP_PLAIN_BARRIER", __kmp_stg_parse_barrier_branch_bit,
+     __kmp_stg_print_barrier_branch_bit, NULL, 0, 0},
+    {"KMP_PLAIN_BARRIER_PATTERN", __kmp_stg_parse_barrier_pattern,
+     __kmp_stg_print_barrier_pattern, NULL, 0, 0},
+    {"KMP_FORKJOIN_BARRIER", __kmp_stg_parse_barrier_branch_bit,
+     __kmp_stg_print_barrier_branch_bit, NULL, 0, 0},
+    {"KMP_FORKJOIN_BARRIER_PATTERN", __kmp_stg_parse_barrier_pattern,
+     __kmp_stg_print_barrier_pattern, NULL, 0, 0},
+#if KMP_FAST_REDUCTION_BARRIER
+    {"KMP_REDUCTION_BARRIER", __kmp_stg_parse_barrier_branch_bit,
+     __kmp_stg_print_barrier_branch_bit, NULL, 0, 0},
+    {"KMP_REDUCTION_BARRIER_PATTERN", __kmp_stg_parse_barrier_pattern,
+     __kmp_stg_print_barrier_pattern, NULL, 0, 0},
+#endif
+
+    {"KMP_ABORT_DELAY", __kmp_stg_parse_abort_delay,
+     __kmp_stg_print_abort_delay, NULL, 0, 0},
+    {"KMP_CPUINFO_FILE", __kmp_stg_parse_cpuinfo_file,
+     __kmp_stg_print_cpuinfo_file, NULL, 0, 0},
+    {"KMP_FORCE_REDUCTION", __kmp_stg_parse_force_reduction,
+     __kmp_stg_print_force_reduction, NULL, 0, 0},
+    {"KMP_DETERMINISTIC_REDUCTION", __kmp_stg_parse_force_reduction,
+     __kmp_stg_print_force_reduction, NULL, 0, 0},
+    {"KMP_STORAGE_MAP", __kmp_stg_parse_storage_map,
+     __kmp_stg_print_storage_map, NULL, 0, 0},
+    {"KMP_ALL_THREADPRIVATE", __kmp_stg_parse_all_threadprivate,
+     __kmp_stg_print_all_threadprivate, NULL, 0, 0},
+    {"KMP_FOREIGN_THREADS_THREADPRIVATE",
+     __kmp_stg_parse_foreign_threads_threadprivate,
+     __kmp_stg_print_foreign_threads_threadprivate, NULL, 0, 0},
+
+#if KMP_AFFINITY_SUPPORTED
+    {"KMP_AFFINITY", __kmp_stg_parse_affinity, __kmp_stg_print_affinity, NULL,
+     0, 0},
+#ifdef KMP_GOMP_COMPAT
+    {"GOMP_CPU_AFFINITY", __kmp_stg_parse_gomp_cpu_affinity, NULL,
+     /* no print */ NULL, 0, 0},
+#endif /* KMP_GOMP_COMPAT */
+    {"OMP_PROC_BIND", __kmp_stg_parse_proc_bind, __kmp_stg_print_proc_bind,
+     NULL, 0, 0},
+    {"OMP_PLACES", __kmp_stg_parse_places, __kmp_stg_print_places, NULL, 0, 0},
+    {"KMP_TOPOLOGY_METHOD", __kmp_stg_parse_topology_method,
+     __kmp_stg_print_topology_method, NULL, 0, 0},
+
+#else
+
+    // KMP_AFFINITY is not supported on OS X*, nor is OMP_PLACES.
+    // OMP_PROC_BIND and proc-bind-var are supported, however.
+    {"OMP_PROC_BIND", __kmp_stg_parse_proc_bind, __kmp_stg_print_proc_bind,
+     NULL, 0, 0},
+
+#endif // KMP_AFFINITY_SUPPORTED
+    {"OMP_DISPLAY_AFFINITY", __kmp_stg_parse_display_affinity,
+     __kmp_stg_print_display_affinity, NULL, 0, 0},
+    {"OMP_AFFINITY_FORMAT", __kmp_stg_parse_affinity_format,
+     __kmp_stg_print_affinity_format, NULL, 0, 0},
+    {"KMP_INIT_AT_FORK", __kmp_stg_parse_init_at_fork,
+     __kmp_stg_print_init_at_fork, NULL, 0, 0},
+    {"KMP_SCHEDULE", __kmp_stg_parse_schedule, __kmp_stg_print_schedule, NULL,
+     0, 0},
+    {"OMP_SCHEDULE", __kmp_stg_parse_omp_schedule, __kmp_stg_print_omp_schedule,
+     NULL, 0, 0},
+#if KMP_USE_HIER_SCHED
+    {"KMP_DISP_HAND_THREAD", __kmp_stg_parse_kmp_hand_thread,
+     __kmp_stg_print_kmp_hand_thread, NULL, 0, 0},
+#endif
+    {"KMP_ATOMIC_MODE", __kmp_stg_parse_atomic_mode,
+     __kmp_stg_print_atomic_mode, NULL, 0, 0},
+    {"KMP_CONSISTENCY_CHECK", __kmp_stg_parse_consistency_check,
+     __kmp_stg_print_consistency_check, NULL, 0, 0},
+
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+    {"KMP_ITT_PREPARE_DELAY", __kmp_stg_parse_itt_prepare_delay,
+     __kmp_stg_print_itt_prepare_delay, NULL, 0, 0},
+#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
+    {"KMP_MALLOC_POOL_INCR", __kmp_stg_parse_malloc_pool_incr,
+     __kmp_stg_print_malloc_pool_incr, NULL, 0, 0},
+    {"KMP_GTID_MODE", __kmp_stg_parse_gtid_mode, __kmp_stg_print_gtid_mode,
+     NULL, 0, 0},
+    {"OMP_DYNAMIC", __kmp_stg_parse_omp_dynamic, __kmp_stg_print_omp_dynamic,
+     NULL, 0, 0},
+    {"KMP_DYNAMIC_MODE", __kmp_stg_parse_kmp_dynamic_mode,
+     __kmp_stg_print_kmp_dynamic_mode, NULL, 0, 0},
+
+#ifdef USE_LOAD_BALANCE
+    {"KMP_LOAD_BALANCE_INTERVAL", __kmp_stg_parse_ld_balance_interval,
+     __kmp_stg_print_ld_balance_interval, NULL, 0, 0},
+#endif
+
+    {"KMP_NUM_LOCKS_IN_BLOCK", __kmp_stg_parse_lock_block,
+     __kmp_stg_print_lock_block, NULL, 0, 0},
+    {"KMP_LOCK_KIND", __kmp_stg_parse_lock_kind, __kmp_stg_print_lock_kind,
+     NULL, 0, 0},
+    {"KMP_SPIN_BACKOFF_PARAMS", __kmp_stg_parse_spin_backoff_params,
+     __kmp_stg_print_spin_backoff_params, NULL, 0, 0},
+#if KMP_USE_ADAPTIVE_LOCKS
+    {"KMP_ADAPTIVE_LOCK_PROPS", __kmp_stg_parse_adaptive_lock_props,
+     __kmp_stg_print_adaptive_lock_props, NULL, 0, 0},
+#if KMP_DEBUG_ADAPTIVE_LOCKS
+    {"KMP_SPECULATIVE_STATSFILE", __kmp_stg_parse_speculative_statsfile,
+     __kmp_stg_print_speculative_statsfile, NULL, 0, 0},
+#endif
+#endif // KMP_USE_ADAPTIVE_LOCKS
+    {"KMP_PLACE_THREADS", __kmp_stg_parse_hw_subset, __kmp_stg_print_hw_subset,
+     NULL, 0, 0},
+    {"KMP_HW_SUBSET", __kmp_stg_parse_hw_subset, __kmp_stg_print_hw_subset,
+     NULL, 0, 0},
+#if USE_ITT_BUILD
+    {"KMP_FORKJOIN_FRAMES", __kmp_stg_parse_forkjoin_frames,
+     __kmp_stg_print_forkjoin_frames, NULL, 0, 0},
+    {"KMP_FORKJOIN_FRAMES_MODE", __kmp_stg_parse_forkjoin_frames_mode,
+     __kmp_stg_print_forkjoin_frames_mode, NULL, 0, 0},
+#endif
+    {"KMP_ENABLE_TASK_THROTTLING", __kmp_stg_parse_task_throttling,
+     __kmp_stg_print_task_throttling, NULL, 0, 0},
+
+    {"OMP_DISPLAY_ENV", __kmp_stg_parse_omp_display_env,
+     __kmp_stg_print_omp_display_env, NULL, 0, 0},
+    {"OMP_CANCELLATION", __kmp_stg_parse_omp_cancellation,
+     __kmp_stg_print_omp_cancellation, NULL, 0, 0},
+    {"OMP_ALLOCATOR", __kmp_stg_parse_allocator, __kmp_stg_print_allocator,
+     NULL, 0, 0},
+
+#if OMPT_SUPPORT
+    {"OMP_TOOL", __kmp_stg_parse_omp_tool, __kmp_stg_print_omp_tool, NULL, 0,
+     0},
+    {"OMP_TOOL_LIBRARIES", __kmp_stg_parse_omp_tool_libraries,
+     __kmp_stg_print_omp_tool_libraries, NULL, 0, 0},
+#endif
+
+    {"", NULL, NULL, NULL, 0, 0}}; // settings
+
+// Number of entries in __kmp_stg_table, including the final entry with the
+// empty name.
+static int const __kmp_stg_count =
+    sizeof(__kmp_stg_table) / sizeof(__kmp_stg_table[0]);
+
+// Look up a setting by exact name. Returns a pointer to the matching entry
+// of __kmp_stg_table, or NULL when name is NULL or no entry matches.
+static inline kmp_setting_t *__kmp_stg_find(char const *name) {
+  if (name == NULL) {
+    return NULL;
+  }
+  // Linear scan over every table entry.
+  for (int idx = 0; idx < __kmp_stg_count; ++idx) {
+    kmp_setting_t *entry = &__kmp_stg_table[idx];
+    if (strcmp(entry->name, name) == 0) {
+      return entry;
+    }
+  }
+  return NULL;
+}
+
+// qsort() comparator for kmp_setting_t entries: orders by name, except that
+// KMP_AFFINITY always sorts after every other setting so that it is
+// processed last (it needs to come after OMP_PLACES and GOMP_CPU_AFFINITY).
+static int __kmp_stg_cmp(void const *_a, void const *_b) {
+  const kmp_setting_t *lhs = RCAST(const kmp_setting_t *, _a);
+  const kmp_setting_t *rhs = RCAST(const kmp_setting_t *, _b);
+
+  const int lhs_is_affinity = (strcmp(lhs->name, "KMP_AFFINITY") == 0);
+  const int rhs_is_affinity = (strcmp(rhs->name, "KMP_AFFINITY") == 0);
+
+  if (lhs_is_affinity || rhs_is_affinity) {
+    // Yields 0 when both are KMP_AFFINITY, 1 when only the left one is,
+    // and -1 when only the right one is.
+    return lhs_is_affinity - rhs_is_affinity;
+  }
+  return strcmp(lhs->name, rhs->name);
+}
+
+// Prepare the settings table for a parsing pass.
+// On the first call only: sort the table (leaving the final entry in place)
+// and attach to groups of mutually overriding settings their shared,
+// NULL-terminated "rivals" list, ordered from highest to lowest priority, via
+// each entry's data pointer. On every call: clear the `set` flag of all
+// entries.
+static void __kmp_stg_init(void) {
+
+  static int initialized = 0;
+
+  if (!initialized) {
+
+    // Sort table.
+    // The last entry (the one with the empty name) is excluded from the sort.
+    qsort(__kmp_stg_table, __kmp_stg_count - 1, sizeof(kmp_setting_t),
+          __kmp_stg_cmp);
+
+    { // Initialize *_STACKSIZE data.
+      kmp_setting_t *kmp_stacksize =
+          __kmp_stg_find("KMP_STACKSIZE"); // 1st priority.
+#ifdef KMP_GOMP_COMPAT
+      kmp_setting_t *gomp_stacksize =
+          __kmp_stg_find("GOMP_STACKSIZE"); // 2nd priority.
+#endif
+      kmp_setting_t *omp_stacksize =
+          __kmp_stg_find("OMP_STACKSIZE"); // 3rd priority.
+
+      // !!! volatile keyword is Intel(R) C Compiler bug CQ49908 workaround.
+      // !!! Compiler does not understand rivals is used and optimizes out
+      // assignments
+      // !!!     rivals[ i ++ ] = ...;
+      static kmp_setting_t *volatile rivals[4];
+      // Per-variable data: a number (1 for KMP_STACKSIZE, 1024 for the GOMP_
+      // and OMP_ variants; presumably the size unit factor -- confirm against
+      // kmp_stg_ss_data_t) plus the shared rivals list.
+      static kmp_stg_ss_data_t kmp_data = {1, CCAST(kmp_setting_t **, rivals)};
+#ifdef KMP_GOMP_COMPAT
+      static kmp_stg_ss_data_t gomp_data = {1024,
+                                            CCAST(kmp_setting_t **, rivals)};
+#endif
+      static kmp_stg_ss_data_t omp_data = {1024,
+                                           CCAST(kmp_setting_t **, rivals)};
+      int i = 0;
+
+      rivals[i++] = kmp_stacksize;
+#ifdef KMP_GOMP_COMPAT
+      if (gomp_stacksize != NULL) {
+        rivals[i++] = gomp_stacksize;
+      }
+#endif
+      rivals[i++] = omp_stacksize;
+      rivals[i++] = NULL;
+
+      kmp_stacksize->data = &kmp_data;
+#ifdef KMP_GOMP_COMPAT
+      if (gomp_stacksize != NULL) {
+        gomp_stacksize->data = &gomp_data;
+      }
+#endif
+      omp_stacksize->data = &omp_data;
+    }
+
+    { // Initialize KMP_LIBRARY and OMP_WAIT_POLICY data.
+      kmp_setting_t *kmp_library =
+          __kmp_stg_find("KMP_LIBRARY"); // 1st priority.
+      kmp_setting_t *omp_wait_policy =
+          __kmp_stg_find("OMP_WAIT_POLICY"); // 2nd priority.
+
+      // !!! volatile keyword is Intel(R) C Compiler bug CQ49908 workaround.
+      static kmp_setting_t *volatile rivals[3];
+      static kmp_stg_wp_data_t kmp_data = {0, CCAST(kmp_setting_t **, rivals)};
+      static kmp_stg_wp_data_t omp_data = {1, CCAST(kmp_setting_t **, rivals)};
+      int i = 0;
+
+      rivals[i++] = kmp_library;
+      if (omp_wait_policy != NULL) {
+        rivals[i++] = omp_wait_policy;
+      }
+      rivals[i++] = NULL;
+
+      kmp_library->data = &kmp_data;
+      if (omp_wait_policy != NULL) {
+        omp_wait_policy->data = &omp_data;
+      }
+    }
+
+    { // Initialize KMP_DEVICE_THREAD_LIMIT and KMP_ALL_THREADS
+      kmp_setting_t *kmp_device_thread_limit =
+          __kmp_stg_find("KMP_DEVICE_THREAD_LIMIT"); // 1st priority.
+      kmp_setting_t *kmp_all_threads =
+          __kmp_stg_find("KMP_ALL_THREADS"); // 2nd priority.
+
+      // !!! volatile keyword is Intel(R) C Compiler bug CQ49908 workaround.
+      static kmp_setting_t *volatile rivals[3];
+      int i = 0;
+
+      rivals[i++] = kmp_device_thread_limit;
+      rivals[i++] = kmp_all_threads;
+      rivals[i++] = NULL;
+
+      // Here the data pointer is the rivals list itself.
+      kmp_device_thread_limit->data = CCAST(kmp_setting_t **, rivals);
+      kmp_all_threads->data = CCAST(kmp_setting_t **, rivals);
+    }
+
+    { // Initialize KMP_HW_SUBSET and KMP_PLACE_THREADS
+      // 1st priority
+      kmp_setting_t *kmp_hw_subset = __kmp_stg_find("KMP_HW_SUBSET");
+      // 2nd priority
+      kmp_setting_t *kmp_place_threads = __kmp_stg_find("KMP_PLACE_THREADS");
+
+      // !!! volatile keyword is Intel(R) C Compiler bug CQ49908 workaround.
+      static kmp_setting_t *volatile rivals[3];
+      int i = 0;
+
+      rivals[i++] = kmp_hw_subset;
+      rivals[i++] = kmp_place_threads;
+      rivals[i++] = NULL;
+
+      kmp_hw_subset->data = CCAST(kmp_setting_t **, rivals);
+      kmp_place_threads->data = CCAST(kmp_setting_t **, rivals);
+    }
+
+#if KMP_AFFINITY_SUPPORTED
+    { // Initialize KMP_AFFINITY, GOMP_CPU_AFFINITY, and OMP_PROC_BIND data.
+      kmp_setting_t *kmp_affinity =
+          __kmp_stg_find("KMP_AFFINITY"); // 1st priority.
+      KMP_DEBUG_ASSERT(kmp_affinity != NULL);
+
+#ifdef KMP_GOMP_COMPAT
+      kmp_setting_t *gomp_cpu_affinity =
+          __kmp_stg_find("GOMP_CPU_AFFINITY"); // 2nd priority.
+      KMP_DEBUG_ASSERT(gomp_cpu_affinity != NULL);
+#endif
+
+      kmp_setting_t *omp_proc_bind =
+          __kmp_stg_find("OMP_PROC_BIND"); // 3rd priority.
+      KMP_DEBUG_ASSERT(omp_proc_bind != NULL);
+
+      // !!! volatile keyword is Intel(R) C Compiler bug CQ49908 workaround.
+      static kmp_setting_t *volatile rivals[4];
+      int i = 0;
+
+      // Note: KMP_AFFINITY heads both lists below, but its own data pointer
+      // is not assigned in this function.
+      rivals[i++] = kmp_affinity;
+
+#ifdef KMP_GOMP_COMPAT
+      rivals[i++] = gomp_cpu_affinity;
+      gomp_cpu_affinity->data = CCAST(kmp_setting_t **, rivals);
+#endif
+
+      rivals[i++] = omp_proc_bind;
+      omp_proc_bind->data = CCAST(kmp_setting_t **, rivals);
+      rivals[i++] = NULL;
+
+      // OMP_PLACES gets a separate list with the same higher-priority rivals.
+      static kmp_setting_t *volatile places_rivals[4];
+      i = 0;
+
+      kmp_setting_t *omp_places = __kmp_stg_find("OMP_PLACES"); // 3rd priority.
+      KMP_DEBUG_ASSERT(omp_places != NULL);
+
+      places_rivals[i++] = kmp_affinity;
+#ifdef KMP_GOMP_COMPAT
+      places_rivals[i++] = gomp_cpu_affinity;
+#endif
+      places_rivals[i++] = omp_places;
+      omp_places->data = CCAST(kmp_setting_t **, places_rivals);
+      places_rivals[i++] = NULL;
+    }
+#else
+// KMP_AFFINITY not supported, so OMP_PROC_BIND has no rivals.
+// OMP_PLACES not supported yet.
+#endif // KMP_AFFINITY_SUPPORTED
+
+    { // Initialize KMP_DETERMINISTIC_REDUCTION and KMP_FORCE_REDUCTION data.
+      kmp_setting_t *kmp_force_red =
+          __kmp_stg_find("KMP_FORCE_REDUCTION"); // 1st priority.
+      kmp_setting_t *kmp_determ_red =
+          __kmp_stg_find("KMP_DETERMINISTIC_REDUCTION"); // 2nd priority.
+
+      // !!! volatile keyword is Intel(R) C Compiler bug CQ49908 workaround.
+      static kmp_setting_t *volatile rivals[3];
+      static kmp_stg_fr_data_t force_data = {1,
+                                             CCAST(kmp_setting_t **, rivals)};
+      static kmp_stg_fr_data_t determ_data = {0,
+                                              CCAST(kmp_setting_t **, rivals)};
+      int i = 0;
+
+      rivals[i++] = kmp_force_red;
+      if (kmp_determ_red != NULL) {
+        rivals[i++] = kmp_determ_red;
+      }
+      rivals[i++] = NULL;
+
+      kmp_force_red->data = &force_data;
+      if (kmp_determ_red != NULL) {
+        kmp_determ_red->data = &determ_data;
+      }
+    }
+
+    initialized = 1;
+  }
+
+  // Reset flags.
+  int i;
+  for (i = 0; i < __kmp_stg_count; ++i) {
+    __kmp_stg_table[i].set = 0;
+  }
+
+} // __kmp_stg_init
+
+// Dispatch one environment variable to its setting's parse routine and mark
+// the setting as defined. Variables with an empty name, a NULL value, or a
+// name that is not in the settings table are ignored.
+static void __kmp_stg_parse(char const *name, char const *value) {
+  // On Windows* OS the environment block contains nameless variables such as
+  // "=C:=C:\" (stored as "=C:=C\\\x00=D:=D:\\\x00..."); skip them.
+  if (name[0] == 0) {
+    return;
+  }
+  if (value == NULL) {
+    return;
+  }
+
+  kmp_setting_t *entry = __kmp_stg_find(name);
+  if (entry == NULL) {
+    return;
+  }
+  entry->parse(name, value, entry->data);
+  entry->defined = 1;
+}
+
+// Check whether a higher-priority rival of setting `name` is set. `rivals`
+// is a NULL-terminated list, highest priority first, expected to contain the
+// current setting (NULL list: no rivals). Returns 1, after warning StgIgnored,
+// if a rival listed before `name` is set; 0 otherwise.
+static int __kmp_stg_check_rivals( // 0 -- Ok, 1 -- errors found.
+    char const *name, // Name of variable.
+    char const *value, // Value of the variable.
+    kmp_setting_t **rivals // List of rival settings (must include current one).
+    ) {
+  if (rivals == NULL) {
+    return 0;
+  }
+  // Loop thru higher priority settings (listed before current). Test for the
+  // NULL terminator before dereferencing, so that a list which lacks the
+  // current setting ends the scan instead of reading through a NULL pointer.
+  for (int i = 0; rivals[i] != NULL; i++) {
+    if (strcmp(rivals[i]->name, name) == 0) {
+      break; // Reached the current setting; the rest has lower priority.
+    }
+#if KMP_AFFINITY_SUPPORTED
+    if (rivals[i] == __kmp_affinity_notype) {
+      // If KMP_AFFINITY is specified without a type name,
+      // it does not rival OMP_PROC_BIND or GOMP_CPU_AFFINITY.
+      continue;
+    }
+#endif
+    if (rivals[i]->set) {
+      KMP_WARNING(StgIgnored, name, rivals[i]->name);
+      return 1;
+    }
+  }
+  return 0;
+} // __kmp_stg_check_rivals
+
+// Return the `defined` flag of setting `name` (0 if there is no such setting)
+// and, when `flag` is non-negative, overwrite it with `flag`.
+static int __kmp_env_toPrint(char const *name, int flag) {
+  kmp_setting_t *stg = __kmp_stg_find(name);
+  if (stg == NULL)
+    return 0;
+  int const previous = stg->defined;
+  if (flag >= 0)
+    stg->defined = flag;
+  return previous;
+}
+
+// Post-initialization step used after kmp_set_defaults(): for each of the
+// variables below that is present in `block`, call the corresponding setter
+// routine with the global value that is currently in effect (the string
+// stored in the block is only tested for presence).
+static void __kmp_aux_env_initialize(kmp_env_blk_t *block) {
+  /* OMP_NUM_THREADS */
+  // Pass on __kmp_dflt_team_nth, not the raw string from the block.
+  if (__kmp_env_blk_var(block, "OMP_NUM_THREADS") != NULL) {
+    ompc_set_num_threads(__kmp_dflt_team_nth);
+  }
+
+  /* KMP_BLOCKTIME */
+  if (__kmp_env_blk_var(block, "KMP_BLOCKTIME") != NULL) {
+    kmpc_set_blocktime(__kmp_dflt_blocktime);
+  }
+
+  /* OMP_NESTED */
+  // Nesting counts as enabled when more than one active level is allowed.
+  if (__kmp_env_blk_var(block, "OMP_NESTED") != NULL) {
+    ompc_set_nested(__kmp_dflt_max_active_levels > 1);
+  }
+
+  /* OMP_DYNAMIC */
+  // The dynamic flag is taken from __kmp_global.g.g_dynamic.
+  if (__kmp_env_blk_var(block, "OMP_DYNAMIC") != NULL) {
+    ompc_set_dynamic(__kmp_global.g.g_dynamic);
+  }
+}
+
+// Initialize the runtime settings from the process environment (string ==
+// NULL) or from a settings string (kmp_set_defaults()), then finalize the
+// user lock kind and, where supported, the affinity configuration.
+void __kmp_env_initialize(char const *string) {
+  kmp_env_blk_t block;
+  int i;
+
+  __kmp_stg_init();
+
+  // Hack!!!
+  if (string == NULL) {
+    __kmp_threads_capacity =
+        __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
+  }
+  __kmp_env_blk_init(&block, string);
+
+  // update the set flag on all entries that have an env var
+  for (i = 0; i < block.count; ++i) {
+    if ((block.vars[i].name == NULL) || (*block.vars[i].name == '\0')) {
+      continue;
+    }
+    if (block.vars[i].value == NULL) {
+      continue;
+    }
+    kmp_setting_t *setting = __kmp_stg_find(block.vars[i].name);
+    if (setting != NULL) {
+      setting->set = 1;
+    }
+  }
+
+  // We need to know if blocktime was set when processing OMP_WAIT_POLICY
+  blocktime_str = __kmp_env_blk_var(&block, "KMP_BLOCKTIME");
+
+  // Special case. If we parse environment, not a string, process KMP_WARNINGS
+  // first.
+  if (string == NULL) {
+    char const *name = "KMP_WARNINGS";
+    char const *value = __kmp_env_blk_var(&block, name);
+    __kmp_stg_parse(name, value);
+  }
+
+#if KMP_AFFINITY_SUPPORTED
+  // Special case. KMP_AFFINITY is not a rival to other affinity env vars
+  // if no affinity type is specified.  We want to allow
+  // KMP_AFFINITY=[no],verbose/[no]warnings/etc.  to be enabled when
+  // specifying the affinity type via GOMP_CPU_AFFINITY or the OMP 4.0
+  // affinity mechanism.
+  __kmp_affinity_notype = NULL;
+  char const *aff_str = __kmp_env_blk_var(&block, "KMP_AFFINITY");
+  if (aff_str != NULL) {
+// Check if the KMP_AFFINITY type is specified in the string.
+// We just search the string for "compact", "scatter", etc.
+// without really parsing the string.  The syntax of the
+// KMP_AFFINITY env var is such that none of the affinity
+// type names can appear anywhere other than the type
+// specifier, even as substrings.
+//
+// I can't find a case-insensitive version of strstr on Windows* OS.
+// Use the case-sensitive version for now.
+
+#if KMP_OS_WINDOWS
+#define FIND strstr
+#else
+#define FIND strcasestr
+#endif
+
+    if ((FIND(aff_str, "none") == NULL) &&
+        (FIND(aff_str, "physical") == NULL) &&
+        (FIND(aff_str, "logical") == NULL) &&
+        (FIND(aff_str, "compact") == NULL) &&
+        (FIND(aff_str, "scatter") == NULL) &&
+        (FIND(aff_str, "explicit") == NULL) &&
+        (FIND(aff_str, "balanced") == NULL) &&
+        (FIND(aff_str, "disabled") == NULL)) {
+      __kmp_affinity_notype = __kmp_stg_find("KMP_AFFINITY");
+    } else {
+      // A new affinity type is specified.
+      // Reset the affinity flags to their default values,
+      // in case this is called from kmp_set_defaults().
+      __kmp_affinity_type = affinity_default;
+      __kmp_affinity_gran = affinity_gran_default;
+      __kmp_affinity_top_method = affinity_top_method_default;
+      __kmp_affinity_respect_mask = affinity_respect_mask_default;
+    }
+#undef FIND
+
+    // Also reset the affinity flags if OMP_PROC_BIND is specified.
+    aff_str = __kmp_env_blk_var(&block, "OMP_PROC_BIND");
+    if (aff_str != NULL) {
+      __kmp_affinity_type = affinity_default;
+      __kmp_affinity_gran = affinity_gran_default;
+      __kmp_affinity_top_method = affinity_top_method_default;
+      __kmp_affinity_respect_mask = affinity_respect_mask_default;
+    }
+  }
+
+#endif /* KMP_AFFINITY_SUPPORTED */
+
+  // Set up the nested proc bind type vector.
+  if (__kmp_nested_proc_bind.bind_types == NULL) {
+    __kmp_nested_proc_bind.bind_types =
+        (kmp_proc_bind_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_proc_bind_t));
+    if (__kmp_nested_proc_bind.bind_types == NULL) {
+      KMP_FATAL(MemoryAllocFailed);
+    }
+    __kmp_nested_proc_bind.size = 1;
+    __kmp_nested_proc_bind.used = 1;
+#if KMP_AFFINITY_SUPPORTED
+    __kmp_nested_proc_bind.bind_types[0] = proc_bind_default;
+#else
+    // default proc bind is false if affinity not supported
+    __kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
+#endif
+  }
+
+  // Set up the affinity format ICV
+  // Grab the default affinity format string from the message catalog
+  kmp_msg_t m =
+      __kmp_msg_format(kmp_i18n_msg_AffFormatDefault, "%P", "%i", "%n", "%A");
+  KMP_DEBUG_ASSERT(KMP_STRLEN(m.str) < KMP_AFFINITY_FORMAT_SIZE);
+  if (__kmp_affinity_format == NULL) {
+    __kmp_affinity_format =
+        (char *)KMP_INTERNAL_MALLOC(sizeof(char) * KMP_AFFINITY_FORMAT_SIZE);
+    if (__kmp_affinity_format == NULL) {
+      KMP_FATAL(MemoryAllocFailed); // Same policy as for bind_types above.
+    }
+  }
+  KMP_STRCPY_S(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE, m.str);
+  __kmp_str_free(&m.str);
+
+  // Now process all of the settings.
+  for (i = 0; i < block.count; ++i) {
+    __kmp_stg_parse(block.vars[i].name, block.vars[i].value);
+  }
+
+  // If user locks have been allocated already, don't reset the lock vptr table.
+  if (!__kmp_init_user_locks) {
+    if (__kmp_user_lock_kind == lk_default) {
+      __kmp_user_lock_kind = lk_queuing;
+    }
+#if KMP_USE_DYNAMIC_LOCK
+    __kmp_init_dynamic_user_locks();
+#else
+    __kmp_set_user_lock_vptrs(__kmp_user_lock_kind);
+#endif
+  } else {
+    KMP_DEBUG_ASSERT(string != NULL); // kmp_set_defaults() was called
+    KMP_DEBUG_ASSERT(__kmp_user_lock_kind != lk_default);
+// Binds lock functions again to follow the transition between different
+// KMP_CONSISTENCY_CHECK values. Calling this again is harmless as long
+// as we do not allow lock kind changes after making a call to any
+// user lock functions (true).
+#if KMP_USE_DYNAMIC_LOCK
+    __kmp_init_dynamic_user_locks();
+#else
+    __kmp_set_user_lock_vptrs(__kmp_user_lock_kind);
+#endif
+  }
+
+#if KMP_AFFINITY_SUPPORTED
+  if (!TCR_4(__kmp_init_middle)) {
+#if KMP_USE_HWLOC
+    // Force using hwloc when either tiles or numa nodes requested within
+    // KMP_HW_SUBSET and no other topology method is requested
+    if ((__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0 ||
+         __kmp_affinity_gran == affinity_gran_tile) &&
+        (__kmp_affinity_top_method == affinity_top_method_default)) {
+      __kmp_affinity_top_method = affinity_top_method_hwloc;
+    }
+#endif
+    // Determine if the machine/OS is actually capable of supporting
+    // affinity.
+    const char *var = "KMP_AFFINITY";
+    KMPAffinity::pick_api();
+#if KMP_USE_HWLOC
+    // If Hwloc topology discovery was requested but affinity was also disabled,
+    // then tell user that Hwloc request is being ignored and use default
+    // topology discovery method.
+    if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
+        __kmp_affinity_dispatch->get_api_type() != KMPAffinity::HWLOC) {
+      KMP_WARNING(AffIgnoringHwloc, var);
+      __kmp_affinity_top_method = affinity_top_method_all;
+    }
+#endif
+    if (__kmp_affinity_type == affinity_disabled) {
+      KMP_AFFINITY_DISABLE();
+    } else if (!KMP_AFFINITY_CAPABLE()) {
+      __kmp_affinity_dispatch->determine_capable(var);
+      if (!KMP_AFFINITY_CAPABLE()) {
+        if (__kmp_affinity_verbose ||
+            (__kmp_affinity_warnings &&
+             (__kmp_affinity_type != affinity_default) &&
+             (__kmp_affinity_type != affinity_none) &&
+             (__kmp_affinity_type != affinity_disabled))) {
+          KMP_WARNING(AffNotSupported, var);
+        }
+        __kmp_affinity_type = affinity_disabled;
+        __kmp_affinity_respect_mask = 0;
+        __kmp_affinity_gran = affinity_gran_fine;
+      }
+    }
+
+    if (__kmp_affinity_type == affinity_disabled) {
+      __kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
+    } else if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_true) {
+      // OMP_PROC_BIND=true maps to OMP_PROC_BIND=spread.
+      __kmp_nested_proc_bind.bind_types[0] = proc_bind_spread;
+    }
+
+    if (KMP_AFFINITY_CAPABLE()) {
+#if KMP_GROUP_AFFINITY
+      // This checks to see if the initial affinity mask is equal
+      // to a single windows processor group.  If it is, then we do
+      // not respect the initial affinity mask and instead, use the
+      // entire machine.
+      bool exactly_one_group = false;
+      if (__kmp_num_proc_groups > 1) {
+        int group;
+        bool within_one_group;
+        // Get the initial affinity mask and determine if it is
+        // contained within a single group.
+        kmp_affin_mask_t *init_mask;
+        KMP_CPU_ALLOC(init_mask);
+        __kmp_get_system_affinity(init_mask, TRUE);
+        group = __kmp_get_proc_group(init_mask);
+        within_one_group = (group >= 0);
+        // If the initial affinity is within a single group,
+        // then determine if it is equal to that single group.
+        if (within_one_group) {
+          DWORD num_bits_in_group = __kmp_GetActiveProcessorCount(group);
+          DWORD num_bits_in_mask = 0;
+          for (int bit = init_mask->begin(); bit != init_mask->end();
+               bit = init_mask->next(bit))
+            num_bits_in_mask++;
+          exactly_one_group = (num_bits_in_group == num_bits_in_mask);
+        }
+        KMP_CPU_FREE(init_mask);
+      }
+
+      // Handle the Win 64 group affinity stuff if there are multiple
+      // processor groups, or if the user requested it, and OMP 4.0
+      // affinity is not in effect.
+      if (((__kmp_num_proc_groups > 1) &&
+           (__kmp_affinity_type == affinity_default) &&
+           (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default)) ||
+          (__kmp_affinity_top_method == affinity_top_method_group)) {
+        if (__kmp_affinity_respect_mask == affinity_respect_mask_default &&
+            exactly_one_group) {
+          __kmp_affinity_respect_mask = FALSE;
+        }
+        if (__kmp_affinity_type == affinity_default) {
+          __kmp_affinity_type = affinity_compact;
+          __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
+        }
+        if (__kmp_affinity_top_method == affinity_top_method_default) {
+          if (__kmp_affinity_gran == affinity_gran_default) {
+            __kmp_affinity_top_method = affinity_top_method_group;
+            __kmp_affinity_gran = affinity_gran_group;
+          } else if (__kmp_affinity_gran == affinity_gran_group) {
+            __kmp_affinity_top_method = affinity_top_method_group;
+          } else {
+            __kmp_affinity_top_method = affinity_top_method_all;
+          }
+        } else if (__kmp_affinity_top_method == affinity_top_method_group) {
+          if (__kmp_affinity_gran == affinity_gran_default) {
+            __kmp_affinity_gran = affinity_gran_group;
+          } else if ((__kmp_affinity_gran != affinity_gran_group) &&
+                     (__kmp_affinity_gran != affinity_gran_fine) &&
+                     (__kmp_affinity_gran != affinity_gran_thread)) {
+            const char *str = NULL;
+            switch (__kmp_affinity_gran) {
+            case affinity_gran_core:
+              str = "core";
+              break;
+            case affinity_gran_package:
+              str = "package";
+              break;
+            case affinity_gran_node:
+              str = "node";
+              break;
+            case affinity_gran_tile:
+              str = "tile";
+              break;
+            default:
+              KMP_DEBUG_ASSERT(0);
+            }
+            KMP_WARNING(AffGranTopGroup, var, str);
+            __kmp_affinity_gran = affinity_gran_fine;
+          }
+        } else {
+          if (__kmp_affinity_gran == affinity_gran_default) {
+            __kmp_affinity_gran = affinity_gran_core;
+          } else if (__kmp_affinity_gran == affinity_gran_group) {
+            const char *str = NULL;
+            switch (__kmp_affinity_type) {
+            case affinity_physical:
+              str = "physical";
+              break;
+            case affinity_logical:
+              str = "logical";
+              break;
+            case affinity_compact:
+              str = "compact";
+              break;
+            case affinity_scatter:
+              str = "scatter";
+              break;
+            case affinity_explicit:
+              str = "explicit";
+              break;
+            // No MIC on windows, so no affinity_balanced case
+            default:
+              KMP_DEBUG_ASSERT(0);
+            }
+            KMP_WARNING(AffGranGroupType, var, str);
+            __kmp_affinity_gran = affinity_gran_core;
+          }
+        }
+      } else
+
+#endif /* KMP_GROUP_AFFINITY */
+
+      {
+        if (__kmp_affinity_respect_mask == affinity_respect_mask_default) {
+#if KMP_GROUP_AFFINITY
+          if (__kmp_num_proc_groups > 1 && exactly_one_group) {
+            __kmp_affinity_respect_mask = FALSE;
+          } else
+#endif /* KMP_GROUP_AFFINITY */
+          {
+            __kmp_affinity_respect_mask = TRUE;
+          }
+        }
+        if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
+            (__kmp_nested_proc_bind.bind_types[0] != proc_bind_default)) {
+          if (__kmp_affinity_type == affinity_default) {
+            __kmp_affinity_type = affinity_compact;
+            __kmp_affinity_dups = FALSE;
+          }
+        } else if (__kmp_affinity_type == affinity_default) {
+#if KMP_MIC_SUPPORTED
+          if (__kmp_mic_type != non_mic) {
+            __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
+          } else
+#endif
+          {
+            __kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
+          }
+#if KMP_MIC_SUPPORTED
+          if (__kmp_mic_type != non_mic) {
+            __kmp_affinity_type = affinity_scatter;
+          } else
+#endif
+          {
+            __kmp_affinity_type = affinity_none;
+          }
+        }
+        if ((__kmp_affinity_gran == affinity_gran_default) &&
+            (__kmp_affinity_gran_levels < 0)) {
+#if KMP_MIC_SUPPORTED
+          if (__kmp_mic_type != non_mic) {
+            __kmp_affinity_gran = affinity_gran_fine;
+          } else
+#endif
+          {
+            __kmp_affinity_gran = affinity_gran_core;
+          }
+        }
+        if (__kmp_affinity_top_method == affinity_top_method_default) {
+          __kmp_affinity_top_method = affinity_top_method_all;
+        }
+      }
+    }
+
+    K_DIAG(1, ("__kmp_affinity_type         == %d\n", __kmp_affinity_type));
+    K_DIAG(1, ("__kmp_affinity_compact      == %d\n", __kmp_affinity_compact));
+    K_DIAG(1, ("__kmp_affinity_offset       == %d\n", __kmp_affinity_offset));
+    K_DIAG(1, ("__kmp_affinity_verbose      == %d\n", __kmp_affinity_verbose));
+    K_DIAG(1, ("__kmp_affinity_warnings     == %d\n", __kmp_affinity_warnings));
+    K_DIAG(1, ("__kmp_affinity_respect_mask == %d\n",
+               __kmp_affinity_respect_mask));
+    K_DIAG(1, ("__kmp_affinity_gran         == %d\n", __kmp_affinity_gran));
+
+    KMP_DEBUG_ASSERT(__kmp_affinity_type != affinity_default);
+    KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.bind_types[0] != proc_bind_default);
+    K_DIAG(1, ("__kmp_nested_proc_bind.bind_types[0] == %d\n",
+               __kmp_nested_proc_bind.bind_types[0]));
+  }
+
+#endif /* KMP_AFFINITY_SUPPORTED */
+
+  if (__kmp_version) {
+    __kmp_print_version_1();
+  }
+
+  // Post-initialization step: some env. vars need their value's further
+  // processing
+  if (string != NULL) { // kmp_set_defaults() was called
+    __kmp_aux_env_initialize(&block);
+  }
+
+  __kmp_env_blk_free(&block);
+
+  KMP_MB();
+} // __kmp_env_initialize
+
+// Print the KMP_*/OMP_* (and GOMP_* under KMP_GOMP_COMPAT) variables found in
+// the environment, followed by the effective value of every setting that
+// has a print routine.
+void __kmp_env_print() {
+  kmp_env_blk_t block;
+  kmp_str_buf_t buffer;
+
+  __kmp_stg_init();
+  __kmp_str_buf_init(&buffer);
+
+  __kmp_env_blk_init(&block, NULL);
+  __kmp_env_blk_sort(&block);
+
+  // Section 1: the values as they appear in the real environment.
+  __kmp_str_buf_print(&buffer, "\n%s\n\n", KMP_I18N_STR(UserSettings));
+  for (int v = 0; v < block.count; ++v) {
+    char const *name = block.vars[v].name;
+    // A "KMP_" variable must have something after the prefix to be listed.
+    bool ours = (KMP_STRLEN(name) > 4 && strncmp(name, "KMP_", 4) == 0);
+    ours = ours || strncmp(name, "OMP_", 4) == 0;
+#ifdef KMP_GOMP_COMPAT
+    ours = ours || strncmp(name, "GOMP_", 5) == 0;
+#endif // KMP_GOMP_COMPAT
+    if (!ours) {
+      continue;
+    }
+    __kmp_str_buf_print(&buffer, "   %s=%s\n", name, block.vars[v].value);
+  }
+  __kmp_str_buf_print(&buffer, "\n");
+
+  // Section 2: the internal (effective) settings.
+  __kmp_str_buf_print(&buffer, "%s\n\n", KMP_I18N_STR(EffectiveSettings));
+  for (int s = 0; s < __kmp_stg_count; ++s) {
+    kmp_setting_t *stg = &__kmp_stg_table[s];
+    if (stg->print != NULL) {
+      stg->print(&buffer, stg->name, stg->data);
+    }
+  }
+
+  // Emit everything at once, release resources, then add a final newline.
+  __kmp_printf("%s", buffer.str);
+  __kmp_env_blk_free(&block);
+  __kmp_str_buf_free(&buffer);
+  __kmp_printf("\n");
+} // __kmp_env_print
+
+// Print the settings between the DisplayEnvBegin and DisplayEnvEnd messages.
+// With __kmp_display_env only the "OMP_" settings are listed; with
+// __kmp_display_env_verbose every setting that has a print routine is.
+void __kmp_env_print_2() {
+  kmp_env_blk_t block;
+  kmp_str_buf_t buffer;
+
+  __kmp_env_format = 1;
+
+  __kmp_stg_init();
+  __kmp_str_buf_init(&buffer);
+
+  __kmp_env_blk_init(&block, NULL);
+  __kmp_env_blk_sort(&block);
+
+  __kmp_str_buf_print(&buffer, "\n%s\n", KMP_I18N_STR(DisplayEnvBegin));
+  __kmp_str_buf_print(&buffer, "   _OPENMP='%d'\n", __kmp_openmp_version);
+
+  for (int s = 0; s < __kmp_stg_count; ++s) {
+    kmp_setting_t *stg = &__kmp_stg_table[s];
+    if (stg->print == NULL) {
+      continue; // Nothing to show for this setting.
+    }
+    // Verbose mode lists everything; plain mode only the OMP_* settings.
+    if (__kmp_display_env_verbose ||
+        (__kmp_display_env && strncmp(stg->name, "OMP_", 4) == 0)) {
+      stg->print(&buffer, stg->name, stg->data);
+    }
+  }
+
+  __kmp_str_buf_print(&buffer, "%s\n", KMP_I18N_STR(DisplayEnvEnd));
+  __kmp_str_buf_print(&buffer, "\n");
+  __kmp_printf("%s", buffer.str);
+  __kmp_env_blk_free(&block);
+  __kmp_str_buf_free(&buffer);
+  __kmp_printf("\n");
+} // __kmp_env_print_2
+
+// end of file
diff --git a/final/runtime/src/kmp_settings.h b/final/runtime/src/kmp_settings.h
new file mode 100644
index 0000000..3247ffc
--- /dev/null
+++ b/final/runtime/src/kmp_settings.h
@@ -0,0 +1,66 @@
+/*
+ * kmp_settings.h -- Initialize environment variables
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_SETTINGS_H
+#define KMP_SETTINGS_H
+
+void __kmp_reset_global_vars(void);
+void __kmp_env_initialize(char const *); // NULL: parse the real environment
+void __kmp_env_print(); // user (environment) and effective settings
+void __kmp_env_print_2(); // settings between DisplayEnvBegin/End messages
+
+int __kmp_initial_threads_capacity(int req_nproc);
+void __kmp_init_dflt_team_nth();
+int __kmp_convert_to_milliseconds(char const *);
+int __kmp_default_tp_capacity(int, int, int);
+
+#if KMP_MIC // Printing helpers; they use buffer/name/value of the caller.
+#define KMP_STR_BUF_PRINT_NAME                                                 \
+  __kmp_str_buf_print(buffer, "  %s %s", KMP_I18N_STR(Device), name)
+#define KMP_STR_BUF_PRINT_NAME_EX(x)                                           \
+  __kmp_str_buf_print(buffer, "  %s %s='", KMP_I18N_STR(Device), x)
+#define KMP_STR_BUF_PRINT_BOOL_EX(n, v, t, f)                                  \
+  __kmp_str_buf_print(buffer, "  %s %s='%s'\n", KMP_I18N_STR(Device), n,       \
+                      (v) ? t : f)
+#define KMP_STR_BUF_PRINT_BOOL                                                 \
+  KMP_STR_BUF_PRINT_BOOL_EX(name, value, "TRUE", "FALSE")
+#define KMP_STR_BUF_PRINT_INT                                                  \
+  __kmp_str_buf_print(buffer, "  %s %s='%d'\n", KMP_I18N_STR(Device), name,    \
+                      value)
+#define KMP_STR_BUF_PRINT_UINT64                                               \
+  __kmp_str_buf_print(buffer, "  %s %s='%" KMP_UINT64_SPEC "'\n",              \
+                      KMP_I18N_STR(Device), name, value);
+#define KMP_STR_BUF_PRINT_STR                                                  \
+  __kmp_str_buf_print(buffer, "  %s %s='%s'\n", KMP_I18N_STR(Device), name,    \
+                      value)
+#else // Host variants. NOTE(review): *_UINT64 (both variants) ends in ';'.
+#define KMP_STR_BUF_PRINT_NAME                                                 \
+  __kmp_str_buf_print(buffer, "  %s %s", KMP_I18N_STR(Host), name)
+#define KMP_STR_BUF_PRINT_NAME_EX(x)                                           \
+  __kmp_str_buf_print(buffer, "  %s %s='", KMP_I18N_STR(Host), x)
+#define KMP_STR_BUF_PRINT_BOOL_EX(n, v, t, f)                                  \
+  __kmp_str_buf_print(buffer, "  %s %s='%s'\n", KMP_I18N_STR(Host), n,         \
+                      (v) ? t : f)
+#define KMP_STR_BUF_PRINT_BOOL                                                 \
+  KMP_STR_BUF_PRINT_BOOL_EX(name, value, "TRUE", "FALSE")
+#define KMP_STR_BUF_PRINT_INT                                                  \
+  __kmp_str_buf_print(buffer, "  %s %s='%d'\n", KMP_I18N_STR(Host), name, value)
+#define KMP_STR_BUF_PRINT_UINT64                                               \
+  __kmp_str_buf_print(buffer, "  %s %s='%" KMP_UINT64_SPEC "'\n",              \
+                      KMP_I18N_STR(Host), name, value);
+#define KMP_STR_BUF_PRINT_STR                                                  \
+  __kmp_str_buf_print(buffer, "  %s %s='%s'\n", KMP_I18N_STR(Host), name, value)
+#endif // KMP_MIC
+
+#endif // KMP_SETTINGS_H
+
+// end of file //
diff --git a/final/runtime/src/kmp_stats.cpp b/final/runtime/src/kmp_stats.cpp
new file mode 100644
index 0000000..71f2dd9
--- /dev/null
+++ b/final/runtime/src/kmp_stats.cpp
@@ -0,0 +1,922 @@
+/** @file kmp_stats.cpp
+ * Statistics gathering and processing.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+#include "kmp_lock.h"
+#include "kmp_stats.h"
+#include "kmp_str.h"
+
+#include <algorithm>
+#include <ctime>
+#include <iomanip>
+#include <sstream>
+#include <stdlib.h> // for atexit
+#include <cmath>
+
+#define STRINGIZE2(x) #x
+#define STRINGIZE(x) STRINGIZE2(x) // two levels, so a macro x expands first
+
+#define expandName(name, flags, ignore) {STRINGIZE(name), flags},
+statInfo timeStat::timerInfo[] = { // ends with a "TIMER_LAST" entry
+    KMP_FOREACH_TIMER(expandName, 0){"TIMER_LAST", 0}};
+const statInfo counter::counterInfo[] = { // ends with a "COUNTER_LAST" entry
+    KMP_FOREACH_COUNTER(expandName, 0){"COUNTER_LAST", 0}};
+#undef expandName
+
+#define expandName(ignore1, ignore2, ignore3) {0.0, 0.0, 0.0},
+kmp_stats_output_module::rgb_color kmp_stats_output_module::timerColorInfo[] = {
+    KMP_FOREACH_TIMER(expandName, 0){0.0, 0.0, 0.0}};
+#undef expandName // timerColorInfo starts out as all {0.0, 0.0, 0.0}
+
+const kmp_stats_output_module::rgb_color
+    kmp_stats_output_module::globalColorArray[] = {
+        {1.0, 0.0, 0.0}, // red
+        {1.0, 0.6, 0.0}, // orange
+        {1.0, 1.0, 0.0}, // yellow
+        {0.0, 1.0, 0.0}, // green
+        {0.0, 0.0, 1.0}, // blue
+        {0.6, 0.2, 0.8}, // purple
+        {1.0, 0.0, 1.0}, // magenta
+        {0.0, 0.4, 0.2}, // dark green
+        {1.0, 1.0, 0.6}, // light yellow
+        {0.6, 0.4, 0.6}, // dirty purple
+        {0.0, 1.0, 1.0}, // cyan
+        {1.0, 0.4, 0.8}, // pink
+        {0.5, 0.5, 0.5}, // grey
+        {0.8, 0.7, 0.5}, // brown
+        {0.6, 0.6, 1.0}, // light blue
+        {1.0, 0.7, 0.5}, // peach
+        {0.8, 0.5, 1.0}, // lavender
+        {0.6, 0.0, 0.0}, // dark red
+        {0.7, 0.6, 0.0}, // gold
+        {0.0, 0.0, 0.0} // black
+};
+
+// Ensure that the atexit handler only runs once.
+static uint32_t statsPrinted = 0;
+
+// output interface
+static kmp_stats_output_module *__kmp_stats_global_output = NULL;
+
+double logHistogram::binMax[] = { // upper bounds scanned by findBin()
+    1.e1l,  1.e2l,  1.e3l,  1.e4l,  1.e5l,  1.e6l,  1.e7l,  1.e8l,
+    1.e9l,  1.e10l, 1.e11l, 1.e12l, 1.e13l, 1.e14l, 1.e15l, 1.e16l,
+    1.e17l, 1.e18l, 1.e19l, 1.e20l, 1.e21l, 1.e22l, 1.e23l, 1.e24l,
+    1.e25l, 1.e26l, 1.e27l, 1.e28l, 1.e29l, 1.e30l};
+
+/* ************* statistic member functions ************* */
+
+// Fold one sample (relative to `offset`) into the running count, mean, m2
+// accumulator, minimum and maximum, and into the histogram when enabled.
+void statistic::addSample(double sample) {
+  sample -= offset;
+  double const x = sample;
+  KMP_DEBUG_ASSERT(std::isfinite(x));
+
+  double const deltaOld = x - meanVal; // Distance from the mean before update.
+  sampleCount += 1;
+  meanVal += deltaOld / sampleCount;
+  m2 += deltaOld * (x - meanVal); // Uses the mean after the update.
+  minVal = std::min(minVal, x);
+  maxVal = std::max(maxVal, x);
+  if (collectingHist)
+    hist.addSample(x);
+}
+
+// Merge the samples summarized by `other` into this statistic, combining the
+// counts, means, m2 accumulators, extrema and (when enabled) histograms.
+statistic &statistic::operator+=(const statistic &other) {
+  // Nothing to merge in.
+  if (other.sampleCount == 0)
+    return *this;
+
+  // Nothing accumulated here yet: simply take a copy of `other`.
+  if (sampleCount == 0) {
+    *this = other;
+    return *this;
+  }
+
+  uint64_t const mergedCount = sampleCount + other.sampleCount;
+  double const ownWeight = double(sampleCount) / double(mergedCount);
+  double const otherCount = double(other.sampleCount);
+  double const meanGap = other.meanVal - meanVal;
+
+  // The expressions below are ordered to avoid overflows. They rely on the
+  // compiler not re-associating floating point arithmetic across the
+  // parentheses, which would not be value-preserving because floating point
+  // addition is not associative.
+  meanVal = meanVal * ownWeight + other.meanVal * (1 - ownWeight);
+  m2 = m2 + other.m2 + ownWeight * otherCount * meanGap * meanGap;
+  minVal = std::min(minVal, other.minVal);
+  maxVal = std::max(maxVal, other.maxVal);
+  sampleCount = mergedCount;
+  if (collectingHist)
+    hist += other.hist;
+
+  return *this;
+}
+
+// Multiply min, max and mean by `factor`; m2 is multiplied by it twice.
+void statistic::scale(double factor) {
+  minVal *= factor;
+  maxVal *= factor;
+  meanVal *= factor;
+  m2 = (m2 * factor) * factor; // Same evaluation order as m2 * factor * factor.
+}
+
+// Render "count, min, mean, max[, total], SD" using formatSI() for each
+// field; the total column (mean * count) is present only if `total` is set.
+// With no samples, every field after the count is printed as zero.
+std::string statistic::format(char unit, bool total) const {
+  bool const noSamples = (sampleCount == 0);
+  std::string const sep(", ");
+  std::string text = formatSI(sampleCount, 9, ' ');
+  text += sep;
+  text += formatSI(noSamples ? 0.0 : minVal, 9, unit);
+  text += sep;
+  text += formatSI(noSamples ? 0.0 : meanVal, 9, unit);
+  text += sep;
+  text += formatSI(noSamples ? 0.0 : maxVal, 9, unit);
+  if (total) {
+    text += sep;
+    text += formatSI(noSamples ? 0.0 : meanVal * sampleCount, 9, unit);
+  }
+  text += sep;
+  text += formatSI(noSamples ? 0.0 : getSD(), 9, unit);
+  return text;
+}
+
+/* ************* histogram member functions ************* */
+
+// Index (with logOffset subtracted) of the lowest non-empty bin; -logOffset
+// when every bin is empty.
+int logHistogram::minBin() const {
+  int slot = 0;
+  while (slot < numBins && bins[slot].count == 0)
+    ++slot;
+  return (slot < numBins) ? slot - logOffset : -logOffset;
+}
+
+// Index (with logOffset subtracted) of the highest non-empty bin; -logOffset
+// when every bin is empty.
+int logHistogram::maxBin() const {
+  int slot = numBins - 1;
+  while (slot >= 0 && bins[slot].count == 0)
+    --slot;
+  return (slot >= 0) ? slot - logOffset : -logOffset;
+}
+
+// Return the index of the first bin whose upper bound binMax[] exceeds
+// |sample|. If there is none, report the problem on stderr and assert.
+uint32_t logHistogram::findBin(double sample) {
+  double const magnitude = std::fabs(sample);
+  // Linear scan: reportedly faster than a binary search (one mis-predict).
+  int bin = 0;
+  while (bin < numBins && !(binMax[bin] > magnitude))
+    ++bin;
+  if (bin < numBins)
+    return bin;
+  fprintf(stderr,
+          "Trying to add a sample that is too large into a histogram\n");
+  KMP_ASSERT(0);
+  return -1;
+}
+
+// Record one sample: zeros are counted in zeroCount, every other value is
+// added to the count and total of the bin chosen by findBin().
+void logHistogram::addSample(double sample) {
+  if (sample == 0.0) {
+    // Zero is tracked separately from the power-of-ten bins.
+    zeroCount += 1;
+  } else {
+    KMP_DEBUG_ASSERT(std::isfinite(sample));
+    uint32_t const slot = findBin(sample);
+    KMP_DEBUG_ASSERT(0 <= slot && slot < numBins);
+
+    bins[slot].count += 1;
+    bins[slot].total += sample;
+  }
+#ifdef KMP_DEBUG
+  // Debug builds keep a running sample count and validate the histogram.
+  _total++;
+  check();
+#endif
+}
+
+// Render a "Bin, Count, Total" table: an optional row for the zero samples,
+// then one row for each bin from minBin() through maxBin().
+std::string logHistogram::format(char unit) const {
+  std::stringstream result;
+  result << "Bin,                Count,     Total\n";
+  if (zeroCount) {
+    // The total must be streamed with "<<"; a comma operator discards it.
+    result << "0,              " << formatSI(zeroCount, 9, ' ') << ", "
+           << formatSI(0.0, 9, unit);
+    if (count(minBin()) == 0)
+      return result.str();
+    result << "\n";
+  }
+  for (int i = minBin(); i <= maxBin(); i++) {
+    result << "10**" << i << "<=v<10**" << (i + 1) << ", "
+           << formatSI(count(i), 9, ' ') << ", " << formatSI(total(i), 9, unit);
+    if (i != maxBin())
+      result << "\n";
+  }
+  return result.str();
+}
+
+/* ************* explicitTimer member functions ************* */
+
+// Begin timing at `tick` with no accumulated pause time; if this timer's
+// events are logged, bump the nest value behind __kmp_stats_thread_ptr.
+void explicitTimer::start(tsc_tick_count tick) {
+  startTime = tick;
+  totalPauseTime = 0;
+  if (timeStat::logEvent(timerEnumValue))
+    __kmp_stats_thread_ptr->incrementNestValue();
+}
+
+// Stop the timer at `tick`: record the elapsed time minus the paused time as
+// a sample and, when this timer's events are logged, push an event onto
+// `stats_ptr` (or __kmp_stats_thread_ptr if it is null). Does nothing if the
+// timer is not running, i.e. startTime is 0.
+void explicitTimer::stop(tsc_tick_count tick,
+                         kmp_stats_list *stats_ptr /* = nullptr */) {
+  // A start time of 0 means "not started"; we accept the risk of dropping a
+  // sample that really did start at t==0.
+  if (startTime.getValue() == 0)
+    return;
+  stat->addSample(((tick - startTime) - totalPauseTime).ticks());
+
+  if (timeStat::logEvent(timerEnumValue)) {
+    kmp_stats_list *sink = stats_ptr ? stats_ptr : __kmp_stats_thread_ptr;
+    // NOTE(review): nest value is read from __kmp_stats_thread_ptr, not sink.
+    sink->push_event(startTime.getValue() - __kmp_stats_start_time.getValue(),
+                     tick.getValue() - __kmp_stats_start_time.getValue(),
+                     __kmp_stats_thread_ptr->getNestValue(), timerEnumValue);
+    sink->decrementNestValue();
+  }
+  startTime = 0;
+}
+
+/* ************* partitionedTimers member functions ************* */
+// Reserve room for eight timers up front.
+partitionedTimers::partitionedTimers() {
+  timer_stack.reserve(8);
+}
+
+// Initialize the partitioned timers: the stack must be empty on entry, and
+// `timer` becomes its first entry and is started immediately.
+void partitionedTimers::init(explicitTimer timer) {
+  KMP_DEBUG_ASSERT(timer_stack.size() == 0);
+  timer_stack.push_back(timer);
+  explicitTimer &initial = timer_stack.back();
+  initial.start(tsc_tick_count::now());
+}
+
+// Pause the timer currently on top of the stack, then push `timer` and start
+// it. The pause and the start share a single tick reading.
+// Note that no special case is made when `timer` is the same timer as the one
+// already running: it is paused and the new copy is started like any other.
+void partitionedTimers::push(explicitTimer timer) {
+  KMP_DEBUG_ASSERT(timer_stack.size() > 0);
+  timer_stack.push_back(timer);
+
+  // Look the two entries up only after push_back(), in case it moved storage.
+  const size_t top = timer_stack.size() - 1;
+  explicitTimer &suspended = timer_stack[top - 1];
+  explicitTimer &started = timer_stack[top];
+
+  tsc_tick_count tick = tsc_tick_count::now();
+  suspended.pause(tick);
+  started.start(tick);
+}
+
+// Stop the timer on top of the stack, resume the one beneath it using the same
+// tick, and then discard the top entry. At least two timers must be present.
+void partitionedTimers::pop() {
+  const size_t depth = timer_stack.size();
+  KMP_DEBUG_ASSERT(depth > 1);
+
+  explicitTimer &finished = timer_stack[depth - 1];
+  explicitTimer &resumed = timer_stack[depth - 2];
+
+  tsc_tick_count tick = tsc_tick_count::now();
+  finished.stop(tick);
+  resumed.resume(tick);
+  timer_stack.pop_back();
+}
+
+// Replace the timer on top of the stack with `timer`: the current top is
+// stopped and removed, then `timer` is pushed in its place and started with
+// the same tick. The stack depth is unchanged.
+void partitionedTimers::exchange(explicitTimer timer) {
+  KMP_DEBUG_ASSERT(timer_stack.size() > 0);
+  tsc_tick_count tick = tsc_tick_count::now();
+
+  timer_stack.back().stop(tick);
+  timer_stack.pop_back();
+
+  timer_stack.push_back(timer);
+  timer_stack.back().start(tick);
+}
+
+// Wind up all the currently running timers, leaving the stack empty.
+// init() has to be called again before the timers can be used afterwards.
+void partitionedTimers::windup() {
+  // Unwind the nested timers one by one until only the init() timer is left.
+  while (timer_stack.size() > 1)
+    pop();
+
+  if (timer_stack.size() == 0)
+    return;
+
+  // The timer from init() has no parent to resume, so stop it directly.
+  timer_stack.back().stop(tsc_tick_count::now());
+  timer_stack.pop_back();
+}
+
+/* ************* kmp_stats_event_vector member functions ************* */
+
+// Release the event storage and put the vector back into its empty state.
+void kmp_stats_event_vector::deallocate() {
+  __kmp_free(events);
+  events = NULL;
+  allocated_size = 0;
+  internal_size = 0;
+}
+
+// qsort() comparator ordering events by ascending start time: yields -1 when
+// event1 starts earlier than event2, 1 when it starts later, and 0 when both
+// start at the same time.
+int compare_two_events(const void *event1, const void *event2) {
+  const kmp_stats_event *lhs = RCAST(const kmp_stats_event *, event1);
+  const kmp_stats_event *rhs = RCAST(const kmp_stats_event *, event2);
+
+  if (lhs->getStart() < rhs->getStart())
+    return -1;
+  return (lhs->getStart() > rhs->getStart()) ? 1 : 0;
+}
+
+// Sort the stored events in place, earliest start time first.
+void kmp_stats_event_vector::sort() {
+  qsort(events, internal_size, sizeof(kmp_stats_event), compare_two_events);
+}
+
+/* ************* kmp_stats_list member functions ************* */
+
+// Create a node for `gtid`, link it into the list directly before this node,
+// and return a pointer to it.
+kmp_stats_list *kmp_stats_list::push_back(int gtid) {
+  // The memory comes from __kmp_allocate rather than operator new, so the
+  // object is constructed in place.
+  void *storage = __kmp_allocate(sizeof(kmp_stats_list));
+  kmp_stats_list *node = new (storage) kmp_stats_list();
+  node->setGtid(gtid);
+
+  // Splice the node between the old predecessor and this node.
+  kmp_stats_list *before = this->prev;
+  node->prev = before;
+  node->next = this;
+  before->next = node;
+  this->prev = node;
+  return node;
+}
+// Destroy and free every node in the list other than this one. This node's
+// own next/prev links are not updated.
+void kmp_stats_list::deallocate() {
+  for (kmp_stats_list *node = this->next; node != this;) {
+    kmp_stats_list *doomed = node;
+    node = node->next; // step forward before the node's memory is released
+
+    doomed->_event_vector.deallocate();
+    // Nodes are built with placement new, so the destructor is run by hand.
+    doomed->~kmp_stats_list();
+    __kmp_free(doomed);
+  }
+}
+// Iterator referring to the node that follows this one.
+kmp_stats_list::iterator kmp_stats_list::begin() {
+  kmp_stats_list::iterator first;
+  first.ptr = next;
+  return first;
+}
+// Iterator referring to this node itself, which marks the end of a traversal.
+kmp_stats_list::iterator kmp_stats_list::end() {
+  kmp_stats_list::iterator sentinel;
+  sentinel.ptr = this;
+  return sentinel;
+}
+// Count the nodes between begin() and end() by walking the list.
+int kmp_stats_list::size() {
+  int count = 0;
+  kmp_stats_list::iterator it = begin();
+  while (it != end()) {
+    ++it;
+    ++count;
+  }
+  return count;
+}
+
+/* ************* kmp_stats_list::iterator member functions ************* */
+
+// A default-constructed iterator refers to no node.
+kmp_stats_list::iterator::iterator() : ptr(NULL) {}
+kmp_stats_list::iterator::~iterator() {}
+// Prefix ++: step to the next node and return the stepped iterator.
+kmp_stats_list::iterator kmp_stats_list::iterator::operator++() {
+  this->ptr = this->ptr->next;
+  return *this;
+}
+// Postfix ++: step to the next node but return the position held before the
+// step, as the postfix form is expected to.
+kmp_stats_list::iterator kmp_stats_list::iterator::operator++(int dummy) {
+  (void)dummy; // only distinguishes the postfix overload
+  kmp_stats_list::iterator previous = *this;
+  this->ptr = this->ptr->next;
+  return previous;
+}
+// Prefix --: step to the previous node and return the stepped iterator.
+kmp_stats_list::iterator kmp_stats_list::iterator::operator--() {
+  this->ptr = this->ptr->prev;
+  return *this;
+}
+// Postfix --: step to the previous node but return the position held before
+// the step.
+kmp_stats_list::iterator kmp_stats_list::iterator::operator--(int dummy) {
+  (void)dummy; // only distinguishes the postfix overload
+  kmp_stats_list::iterator previous = *this;
+  this->ptr = this->ptr->prev;
+  return previous;
+}
+// Two iterators are equal exactly when they refer to the same node.
+bool kmp_stats_list::iterator::operator!=(const kmp_stats_list::iterator &rhs) {
+  return this->ptr != rhs.ptr;
+}
+bool kmp_stats_list::iterator::operator==(const kmp_stats_list::iterator &rhs) {
+  return this->ptr == rhs.ptr;
+}
+// Dereferencing yields the node pointer itself.
+kmp_stats_list *kmp_stats_list::iterator::operator*() const {
+  return this->ptr;
+}
+
+/* *************  kmp_stats_output_module functions ************** */
+
+const char *kmp_stats_output_module::eventsFileName = NULL; // assigned in init()
+const char *kmp_stats_output_module::plotFileName = NULL; // assigned in init()
+int kmp_stats_output_module::printPerThreadFlag = 0; // assigned in init()
+int kmp_stats_output_module::printPerThreadEventsFlag = 0; // assigned in init()
+
+// Return the part of `name` that follows its last '/', or all of `name` when
+// it contains none. The string is modified in place: every '.' in the
+// returned part is overwritten with '_'.
+static char const *lastName(char *name) {
+  // Walk backwards from the terminator towards the start of the string.
+  for (char *cursor = name + strlen(name); cursor != name;) {
+    --cursor;
+    if (*cursor == '.')
+      *cursor = '_';
+    else if (*cursor == '/')
+      return cursor + 1;
+  }
+  return name;
+}
+
+/* Read the name of the executable from /proc/self/cmdline.
+   `buffer` (of `buflen` bytes) is used as scratch space; the result points
+   into it. An empty string is returned when the file cannot be opened. */
+static char const *getImageName(char *buffer, size_t buflen) {
+  FILE *f = fopen("/proc/self/cmdline", "r");
+  buffer[0] = char(0);
+  if (!f)
+    return buffer;
+
+  // The file contains char(0) delimited words from the commandline.
+  // This just returns the last filename component of the first word on the
+  // line.
+  size_t n = fread(buffer, 1, buflen, f);
+  // Close the file exactly once, before the read result is examined, so no
+  // path can close it twice.
+  fclose(f);
+  if (n == 0) {
+    KMP_CHECK_SYSFAIL("fread", 1)
+  }
+  // Terminate directly after the bytes that were read (or at the last byte
+  // when the buffer was filled), so nothing uninitialised is ever scanned.
+  buffer[n < buflen ? n : buflen - 1] = char(0);
+  return lastName(buffer);
+}
+
+// Write the current local date and time into `buffer` (capacity `buflen`) as
+// "YYYY-MM-DD HHMMSS", or with '_' in place of the space when `underscores`
+// is true. If the local time cannot be obtained or the text does not fit, the
+// buffer is left holding an empty string.
+static void getTime(char *buffer, size_t buflen, bool underscores = false) {
+  if (buflen == 0)
+    return;
+  buffer[0] = char(0);
+
+  time_t timer;
+
+  time(&timer);
+
+  // localtime() returns a null pointer on failure.
+  struct tm *tm_info = localtime(&timer);
+  if (!tm_info)
+    return;
+
+  size_t written;
+  if (underscores)
+    written = strftime(buffer, buflen, "%Y-%m-%d_%H%M%S", tm_info);
+  else
+    written = strftime(buffer, buflen, "%Y-%m-%d %H%M%S", tm_info);
+  // A zero return means the result did not fit and the buffer contents are
+  // indeterminate, so reset it to a well-defined empty string.
+  if (written == 0)
+    buffer[0] = char(0);
+}
+
+/* Build a stats file name from `prototype`, expanding its escapes:
+     %t  current date and time (underscore-separated form from getTime)
+     %e  `imageName`
+     %p  process id
+   Any other character after '%' is copied through as-is, and a '%' that ends
+   the prototype is dropped. */
+static std::string generateFilename(char const *prototype,
+                                    char const *imageName) {
+  std::string expanded;
+
+  for (char const *p = prototype; *p != char(0); ++p) {
+    if (*p != '%') {
+      expanded += *p;
+      continue;
+    }
+
+    // '%' starts an escape; the character after it selects the expansion.
+    ++p;
+    if (*p == char(0))
+      break;
+
+    if (*p == 't') { // Insert time and date
+      char stamp[26];
+      getTime(stamp, sizeof(stamp), true);
+      expanded += stamp;
+    } else if (*p == 'e') { // Insert executable name
+      expanded += imageName;
+    } else if (*p == 'p') { // Insert pid
+      std::stringstream pid;
+      pid << getpid();
+      expanded += pid.str();
+    } else {
+      expanded += *p;
+    }
+  }
+  return expanded;
+}
+
+// init() is called very near the beginning of execution time in the constructor
+// of __kmp_stats_global_output. It reads the KMP_STATS_* environment variables
+// and sets the output file names and printing flags from them.
+void kmp_stats_output_module::init() {
+  // Stats file: only set when KMP_STATS_FILE is given; any escapes in it
+  // (e.g., %p, %e, %t) are expanded.
+  if (char const *statsFileName = getenv("KMP_STATS_FILE")) {
+    char imageName[1024];
+    outputFileName = generateFilename(
+        statsFileName, getImageName(&imageName[0], sizeof(imageName)));
+  }
+
+  // Events and plot files fall back to fixed default names.
+  eventsFileName = getenv("KMP_STATS_EVENTS_FILE");
+  if (!eventsFileName)
+    eventsFileName = "events.dat";
+  plotFileName = getenv("KMP_STATS_PLOT_FILE");
+  if (!plotFileName)
+    plotFileName = "events.plt";
+
+  // set the flags based on environment variables matching: true, on, 1, .true.
+  // , .t. , yes
+  printPerThreadFlag = __kmp_str_match_true(getenv("KMP_STATS_THREADS"));
+  printPerThreadEventsFlag = __kmp_str_match_true(getenv("KMP_STATS_EVENTS"));
+
+  if (!printPerThreadEventsFlag) {
+    // will clear flag so that no event will be logged
+    timeStat::clearEventFlags();
+    return;
+  }
+  // assigns a color to each timer for printing
+  setupEventColors();
+}
+
+// Give every timer whose events are logged a color from globalColorArray,
+// handing the colors out in order and wrapping around when they run out.
+void kmp_stats_output_module::setupEventColors() {
+  const int paletteSize = sizeof(globalColorArray) / sizeof(rgb_color);
+  int nextColor = 0;
+  for (int t = 0; t < TIMER_LAST; ++t) {
+    if (!timeStat::logEvent((timer_e)t))
+      continue;
+    timerColorInfo[t] = globalColorArray[nextColor];
+    nextColor = (nextColor + 1) % paletteSize;
+  }
+}
+
+// Print one row per timer from `theStats`, then a "Total_" row for each timer
+// that is not flagged noTotal (only when `totalStats` is non-null), and
+// finally the per-timer histograms when the statistics carry them.
+void kmp_stats_output_module::printTimerStats(FILE *statsOut,
+                                              statistic const *theStats,
+                                              statistic const *totalStats) {
+  fprintf(statsOut,
+          "Timer,                             SampleCount,    Min,      "
+          "Mean,       Max,     Total,        SD\n");
+  for (int idx = 0; idx < TIMER_LAST; ++idx) {
+    const timer_e which = timer_e(idx);
+    const char unitTag = timeStat::noUnits(which) ? ' ' : 'T';
+
+    fprintf(statsOut, "%-35s, %s\n", timeStat::name(which),
+            theStats[idx].format(unitTag, true).c_str());
+  }
+
+  // Also print the Total_ versions of times.
+  if (totalStats) {
+    for (int idx = 0; idx < TIMER_LAST; ++idx) {
+      const timer_e which = timer_e(idx);
+      if (timeStat::noTotal(which))
+        continue;
+      const char unitTag = timeStat::noUnits(which) ? ' ' : 'T';
+      fprintf(statsOut, "Total_%-29s, %s\n", timeStat::name(which),
+              totalStats[idx].format(unitTag, true).c_str());
+    }
+  }
+
+  // Print histogram of statistics
+  if (!theStats[0].haveHist())
+    return;
+
+  fprintf(statsOut, "\nTimer distributions\n");
+  for (int idx = 0; idx < TIMER_LAST; ++idx) {
+    statistic const &current = theStats[idx];
+    if (current.getCount() == 0)
+      continue;
+
+    const timer_e which = timer_e(idx);
+    const char unitTag = timeStat::noUnits(which) ? ' ' : 'T';
+    fprintf(statsOut, "%s\n", timeStat::name(which));
+    fprintf(statsOut, "%s\n", current.getHist()->format(unitTag).c_str());
+  }
+}
+
+// Print one row per counter from `theStats`, followed by the per-counter
+// histograms when the statistics carry them.
+void kmp_stats_output_module::printCounterStats(FILE *statsOut,
+                                                statistic const *theStats) {
+  fprintf(statsOut, "Counter,                 ThreadCount,    Min,      Mean,  "
+                    "     Max,     Total,        SD\n");
+  for (int idx = 0; idx < COUNTER_LAST; ++idx) {
+    fprintf(statsOut, "%-25s, %s\n", counter::name(counter_e(idx)),
+            theStats[idx].format(' ', true).c_str());
+  }
+
+  // Print histogram of counters
+  if (!theStats[0].haveHist())
+    return;
+
+  fprintf(statsOut, "\nCounter distributions\n");
+  for (int idx = 0; idx < COUNTER_LAST; ++idx) {
+    statistic const &current = theStats[idx];
+    if (current.getCount() == 0)
+      continue;
+
+    fprintf(statsOut, "%s\n", counter::name(counter_e(idx)));
+    fprintf(statsOut, "%s\n", current.getHist()->format(' ').c_str());
+  }
+}
+
+// Print the raw value of every counter in `theCounters`, one per row.
+void kmp_stats_output_module::printCounters(FILE *statsOut,
+                                            counter const *theCounters) {
+  // Counters that are zero are printed too: a fixed set of rows is easier to
+  // slice into a spreadsheet if you need to.
+  fprintf(statsOut, "\nCounter,                    Count\n");
+  for (int idx = 0; idx < COUNTER_LAST; ++idx) {
+    counter const &current = theCounters[idx];
+    fprintf(statsOut, "%-25s, %s\n", counter::name(counter_e(idx)),
+            formatSI(current.getValue(), 9, ' ').c_str());
+  }
+}
+
+// Sort `theEvents` by start time and write one line per event to `eventsOut`:
+// gtid, start, stop, a width derived from the nest level, the timer's color
+// and the timer's name.
+void kmp_stats_output_module::printEvents(FILE *eventsOut,
+                                          kmp_stats_event_vector *theEvents,
+                                          int gtid) {
+  // sort by start time before printing
+  theEvents->sort();
+
+  for (int idx = 0; idx < theEvents->size(); ++idx) {
+    kmp_stats_event ev = theEvents->at(idx);
+    rgb_color color = getEventColor(ev.getTimerName());
+
+    const unsigned long long startTick =
+        static_cast<unsigned long long>(ev.getStart());
+    const unsigned long long stopTick =
+        static_cast<unsigned long long>(ev.getStop());
+
+    fprintf(eventsOut, "%d %llu %llu %1.1f rgb(%1.1f,%1.1f,%1.1f) %s\n", gtid,
+            startTick, stopTick, 1.2 - (ev.getNestLevel() * 0.2), color.r,
+            color.g, color.b, timeStat::name(ev.getTimerName()));
+  }
+}
+
+// Wind up the partitioned timers of every entry in __kmp_stats_list and call
+// endLife() on each entry.
+void kmp_stats_output_module::windupExplicitTimers() {
+  // We assume that it's fair at this point to just walk all the explicit
+  // timers in all threads and say "it's over".
+  // If the timer wasn't running, this won't record anything anyway.
+  for (kmp_stats_list::iterator it = __kmp_stats_list->begin();
+       it != __kmp_stats_list->end(); it++) {
+    kmp_stats_list *node = *it;
+    node->getPartitionedTimers()->windup();
+    node->endLife();
+  }
+}
+
+// Write the ploticus script to plotFileName. The script reads the data in
+// eventsFileName and draws it as a timeline with one row per entry in
+// __kmp_stats_list and a legend entry for every timer whose events are logged.
+// If the file cannot be opened, a note is printed on stderr and nothing is
+// written.
+void kmp_stats_output_module::printPloticusFile() {
+  int i;
+  int size = __kmp_stats_list->size();
+  FILE *plotOut = fopen(plotFileName, "w+");
+  // fopen() returns NULL on failure; writing through it would crash.
+  if (!plotOut) {
+    fprintf(stderr, "KMP_STATS: unable to open plot file \"%s\"\n",
+            plotFileName);
+    return;
+  }
+
+  fprintf(plotOut, "#proc page\n"
+                   "   pagesize: 15 10\n"
+                   "   scale: 1.0\n\n");
+
+  fprintf(plotOut, "#proc getdata\n"
+                   "   file: %s\n\n",
+          eventsFileName);
+
+  fprintf(plotOut, "#proc areadef\n"
+                   "   title: OpenMP Sampling Timeline\n"
+                   "   titledetails: align=center size=16\n"
+                   "   rectangle: 1 1 13 9\n"
+                   "   xautorange: datafield=2,3\n"
+                   "   yautorange: -1 %d\n\n",
+          size);
+
+  fprintf(plotOut, "#proc xaxis\n"
+                   "   stubs: inc\n"
+                   "   stubdetails: size=12\n"
+                   "   label: Time (ticks)\n"
+                   "   labeldetails: size=14\n\n");
+
+  fprintf(plotOut, "#proc yaxis\n"
+                   "   stubs: inc 1\n"
+                   "   stubrange: 0 %d\n"
+                   "   stubdetails: size=12\n"
+                   "   label: Thread #\n"
+                   "   labeldetails: size=14\n\n",
+          size - 1);
+
+  fprintf(plotOut, "#proc bars\n"
+                   "   exactcolorfield: 5\n"
+                   "   axis: x\n"
+                   "   locfield: 1\n"
+                   "   segmentfields: 2 3\n"
+                   "   barwidthfield: 4\n\n");
+
+  // create legend entries corresponding to the timer color
+  for (i = 0; i < TIMER_LAST; i++) {
+    if (timeStat::logEvent((timer_e)i)) {
+      rgb_color c = getEventColor((timer_e)i);
+      fprintf(plotOut, "#proc legendentry\n"
+                       "   sampletype: color\n"
+                       "   label: %s\n"
+                       "   details: rgb(%1.1f,%1.1f,%1.1f)\n\n",
+              timeStat::name((timer_e)i), c.r, c.g, c.b);
+    }
+  }
+
+  fprintf(plotOut, "#proc legend\n"
+                   "   format: down\n"
+                   "   location: max max\n\n");
+  fclose(plotOut);
+}
+
+// Write "# NAME = VALUE" for the environment variable `name` to `statsOut`,
+// printing "*unspecified*" as the value when the variable is not set.
+static void outputEnvVariable(FILE *statsOut, char const *name) {
+  char const *value = getenv(name);
+  if (!value)
+    value = "*unspecified*";
+  fprintf(statsOut, "# %s = %s\n", name, value);
+}
+
+/* Print some useful information about
+   * the date and time this experiment ran.
+   * the machine on which it ran.
+   We output all of this as stylised comments, though we may decide to parse
+   some of it.
+   The CPU description, the listed KMP_* environment variables and the build
+   stamp are only printed on x86 / x86_64 builds. */
+void kmp_stats_output_module::printHeaderInfo(FILE *statsOut) {
+  std::time_t now = std::time(0);
+  char buffer[40];
+  char hostName[80];
+
+  // localtime() may return a null pointer, and strftime() returns 0 (leaving
+  // the buffer indeterminate) when the text does not fit; print an empty time
+  // rather than reading an undefined buffer in either case.
+  std::tm *local = std::localtime(&now);
+  if (!local || std::strftime(&buffer[0], sizeof(buffer), "%c", local) == 0)
+    buffer[0] = char(0);
+  fprintf(statsOut, "# Time of run: %s\n", &buffer[0]);
+  if (gethostname(&hostName[0], sizeof(hostName)) == 0) {
+    // A truncated host name is not guaranteed to be null-terminated.
+    hostName[sizeof(hostName) - 1] = char(0);
+    fprintf(statsOut, "# Hostname: %s\n", &hostName[0]);
+  }
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+  fprintf(statsOut, "# CPU:  %s\n", &__kmp_cpuinfo.name[0]);
+  fprintf(statsOut, "# Family: %d, Model: %d, Stepping: %d\n",
+          __kmp_cpuinfo.family, __kmp_cpuinfo.model, __kmp_cpuinfo.stepping);
+  if (__kmp_cpuinfo.frequency == 0)
+    fprintf(statsOut, "# Nominal frequency: Unknown\n");
+  else
+    fprintf(statsOut, "# Nominal frequency: %sz\n",
+            formatSI(double(__kmp_cpuinfo.frequency), 9, 'H').c_str());
+  outputEnvVariable(statsOut, "KMP_HW_SUBSET");
+  outputEnvVariable(statsOut, "KMP_AFFINITY");
+  outputEnvVariable(statsOut, "KMP_BLOCKTIME");
+  outputEnvVariable(statsOut, "KMP_LIBRARY");
+  fprintf(statsOut, "# Production runtime built " __DATE__ " " __TIME__ "\n");
+#endif
+}
+
+// Stop all timers and write the statistics report under `heading`.
+//
+// The report goes to outputFileName (opened for append) when one is set and
+// can be opened, otherwise to stderr. Per-thread sections are printed when
+// printPerThreadFlag is set. When event printing is enabled, each thread's
+// events are written to eventsFileName and the ploticus script is generated;
+// if the events file cannot be opened, a note is printed on stderr and the
+// events are skipped.
+void kmp_stats_output_module::outputStats(const char *heading) {
+  // Stop all the explicit timers in all threads
+  // Do this before declaring the local statistics because they have
+  // constructors so will take time to create.
+  windupExplicitTimers();
+
+  statistic allStats[TIMER_LAST];
+  statistic totalStats[TIMER_LAST]; /* Synthesized, cross threads versions of
+                                       normal timer stats */
+  statistic allCounters[COUNTER_LAST];
+
+  FILE *statsOut =
+      !outputFileName.empty() ? fopen(outputFileName.c_str(), "a+") : stderr;
+  if (!statsOut)
+    statsOut = stderr;
+
+  // Stays NULL unless event printing is enabled and the file could be opened,
+  // so it is never used or closed uninitialised.
+  FILE *eventsOut = NULL;
+  if (eventPrintingEnabled()) {
+    eventsOut = fopen(eventsFileName, "w+");
+    if (!eventsOut)
+      fprintf(stderr, "KMP_STATS: unable to open events file \"%s\"\n",
+              eventsFileName);
+  }
+
+  printHeaderInfo(statsOut);
+  fprintf(statsOut, "%s\n", heading);
+  // Accumulate across threads.
+  kmp_stats_list::iterator it;
+  for (it = __kmp_stats_list->begin(); it != __kmp_stats_list->end(); it++) {
+    int t = (*it)->getGtid();
+    // Output per thread stats if requested.
+    if (printPerThreadFlag) {
+      fprintf(statsOut, "Thread %d\n", t);
+      printTimerStats(statsOut, (*it)->getTimers(), 0);
+      printCounters(statsOut, (*it)->getCounters());
+      fprintf(statsOut, "\n");
+    }
+    // Output per thread events if requested (and the file is available).
+    if (eventsOut) {
+      kmp_stats_event_vector events = (*it)->getEventVector();
+      printEvents(eventsOut, &events, t);
+    }
+
+    // Accumulate timers.
+    for (timer_e s = timer_e(0); s < TIMER_LAST; s = timer_e(s + 1)) {
+      // See if we should ignore this timer when aggregating
+      if ((timeStat::masterOnly(s) && (t != 0)) || // Timer only valid on master
+          // and this thread is worker
+          (timeStat::workerOnly(s) && (t == 0)) // Timer only valid on worker
+          // and this thread is the master
+          ) {
+        continue;
+      }
+
+      statistic *threadStat = (*it)->getTimer(s);
+      allStats[s] += *threadStat;
+
+      // Add Total stats for timers that are valid in more than one thread
+      if (!timeStat::noTotal(s))
+        totalStats[s].addSample(threadStat->getTotal());
+    }
+
+    // Accumulate counters.
+    for (counter_e c = counter_e(0); c < COUNTER_LAST; c = counter_e(c + 1)) {
+      if (counter::masterOnly(c) && t != 0)
+        continue;
+      allCounters[c].addSample((*it)->getCounter(c)->getValue());
+    }
+  }
+
+  if (eventPrintingEnabled()) {
+    printPloticusFile();
+    if (eventsOut)
+      fclose(eventsOut);
+  }
+
+  fprintf(statsOut, "Aggregate for all threads\n");
+  printTimerStats(statsOut, &allStats[0], &totalStats[0]);
+  fprintf(statsOut, "\n");
+  printCounterStats(statsOut, &allCounters[0]);
+
+  if (statsOut != stderr)
+    fclose(statsOut);
+}
+
+/* *************  exported C functions ************** */
+
+// These functions are given C linkage (no name mangling) so that the c files
+// are able to get at them.
+extern "C" {
+
+// Reset every timer and counter of every entry in __kmp_stats_list and clear
+// each entry's recorded events.
+void __kmp_reset_stats() {
+  for (kmp_stats_list::iterator it = __kmp_stats_list->begin();
+       it != __kmp_stats_list->end(); it++) {
+    kmp_stats_list *node = *it;
+
+    timeStat *timers = node->getTimers();
+    for (int t = 0; t < TIMER_LAST; t++)
+      timers[t].reset();
+
+    counter *counters = node->getCounters();
+    for (int c = 0; c < COUNTER_LAST; c++)
+      counters[c].reset();
+
+    // reset the event vector so all previous events are "erased"
+    node->resetEventVector();
+  }
+}
+
+// Print the statistics under `heading` (which also stops any explicit timers
+// that are still running) and then reset all of them.
+void __kmp_output_stats(const char *heading) {
+  __kmp_stats_global_output->outputStats(heading);
+  __kmp_reset_stats();
+}
+
+// Print the exit statistics; only the first call has any effect.
+void __kmp_accumulate_stats_at_exit(void) {
+  const bool alreadyPrinted = KMP_XCHG_FIXED32(&statsPrinted, 1) != 0;
+  if (!alreadyPrinted)
+    __kmp_output_stats("Statistics on exit");
+}
+
+// Set up the statistics globals: the lock, the reference start time, the
+// output module and the stats list.
+void __kmp_stats_init(void) {
+  __kmp_init_tas_lock(&__kmp_stats_lock);
+  __kmp_stats_start_time = tsc_tick_count::now();
+  __kmp_stats_global_output = new kmp_stats_output_module();
+  __kmp_stats_list = new kmp_stats_list();
+}
+
+// Print the exit statistics (if not printed yet), then release the list nodes,
+// the output module and the list itself.
+void __kmp_stats_fini(void) {
+  __kmp_accumulate_stats_at_exit();
+  __kmp_stats_list->deallocate();
+  delete __kmp_stats_global_output;
+  delete __kmp_stats_list;
+}
+
+} // extern "C"
diff --git a/final/runtime/src/kmp_stats.h b/final/runtime/src/kmp_stats.h
new file mode 100644
index 0000000..ee95658
--- /dev/null
+++ b/final/runtime/src/kmp_stats.h
@@ -0,0 +1,1010 @@
+#ifndef KMP_STATS_H
+#define KMP_STATS_H
+
+/** @file kmp_stats.h
+ * Functions for collecting statistics.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp_config.h"
+#include "kmp_debug.h"
+
+#if KMP_STATS_ENABLED
+/* Statistics accumulator.
+   Accumulates number of samples and computes min, max, mean, standard deviation
+   on the fly.
+
+   Online variance calculation algorithm from
+   http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
+ */
+
+#include "kmp_stats_timing.h"
+#include <limits>
+#include <math.h>
+#include <new> // placement new
+#include <stdint.h>
+#include <string>
+#include <vector>
+
+/* Enable developer statistics here if you want them. They are more detailed
+   than is useful for application characterisation and are intended for the
+   runtime library developer. */
+#define KMP_DEVELOPER_STATS 0
+
+/* Enable/Disable histogram output */
+#define KMP_STATS_HIST 0
+
+/*!
+ * @ingroup STATS_GATHERING
+ * \brief flags to describe the statistic (timer or counter)
+ *
+ * Each flag occupies its own bit, so several flags can be combined with a
+ * bitwise OR when they are passed as the flags argument of an entry in
+ * KMP_FOREACH_COUNTER() or KMP_FOREACH_TIMER().
+ */
+enum stats_flags_e {
+  noTotal = 1 << 0, //!< do not show a Total_ aggregation for this statistic
+  onlyInMaster = 1 << 1, //!< statistic is valid only for master
+  noUnits = 1 << 2, //!< statistic doesn't need units printed next to it
+  notInMaster = 1 << 3, //!< statistic is valid only for non-master threads
+  logEvent = 1 << 4 //!< statistic can be logged on the event timeline when
+  //!< KMP_STATS_EVENTS is on (valid only for timers)
+};
+
+/*!
+ * @ingroup STATS_GATHERING
+ * \brief the states which a thread can be in
+ *
+ * The enumerators carry no explicit values, so they are numbered
+ * consecutively from IDLE = 0 up to TEAMS_REGION = 9.
+ */
+enum stats_state_e {
+  IDLE,
+  SERIAL_REGION,
+  FORK_JOIN_BARRIER,
+  PLAIN_BARRIER,
+  TASKWAIT,
+  TASKYIELD,
+  TASKGROUP,
+  IMPLICIT_TASK,
+  EXPLICIT_TASK,
+  TEAMS_REGION
+};
+
+/*!
+ * \brief Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h
+ *
+ * @param macro a user defined macro that takes three arguments -
+ * macro(COUNTER_NAME, flags, arg)
+ * @param arg a user defined argument to send to the user defined macro
+ *
+ * \details A counter counts the occurrence of some event. Each thread
+ * accumulates its own count, at the end of execution the counts are aggregated
+ * treating each thread as a separate measurement. (Unless onlyInMaster is set,
+ * in which case there's only a single measurement). The min,mean,max are
+ * therefore the values for the threads. Adding the counter here and then
+ * putting a KMP_BLOCK_COUNTER(name) at the point you want to count is all you
+ * need to do. All of the tables and printing is generated from this macro.
+ * Format is "macro(name, flags, arg)"
+ *
+ * The flags of an entry are either 0 or a bitwise OR of stats_flags_e values;
+ * OMP_PARALLEL, for example, is marked onlyInMaster | noTotal.
+ *
+ * @ingroup STATS_GATHERING
+ */
+// clang-format off
+#define KMP_FOREACH_COUNTER(macro, arg)                                        \
+  macro(OMP_PARALLEL,stats_flags_e::onlyInMaster|stats_flags_e::noTotal,arg)   \
+  macro(OMP_NESTED_PARALLEL, 0, arg)                                           \
+  macro(OMP_LOOP_STATIC, 0, arg)                                               \
+  macro(OMP_LOOP_STATIC_STEAL, 0, arg)                                         \
+  macro(OMP_LOOP_DYNAMIC, 0, arg)                                              \
+  macro(OMP_DISTRIBUTE, 0, arg)                                                \
+  macro(OMP_BARRIER, 0, arg)                                                   \
+  macro(OMP_CRITICAL, 0, arg)                                                  \
+  macro(OMP_SINGLE, 0, arg)                                                    \
+  macro(OMP_MASTER, 0, arg)                                                    \
+  macro(OMP_TEAMS, 0, arg)                                                     \
+  macro(OMP_set_lock, 0, arg)                                                  \
+  macro(OMP_test_lock, 0, arg)                                                 \
+  macro(REDUCE_wait, 0, arg)                                                   \
+  macro(REDUCE_nowait, 0, arg)                                                 \
+  macro(OMP_TASKYIELD, 0, arg)                                                 \
+  macro(OMP_TASKLOOP, 0, arg)                                                  \
+  macro(TASK_executed, 0, arg)                                                 \
+  macro(TASK_cancelled, 0, arg)                                                \
+  macro(TASK_stolen, 0, arg)
+// clang-format on
+
+/*!
+ * \brief Add new timers under KMP_FOREACH_TIMER() macro in kmp_stats.h
+ *
+ * @param macro a user defined macro that takes three arguments -
+ * macro(TIMER_NAME, flags, arg)
+ * @param arg a user defined argument to send to the user defined macro
+ *
+ * \details A timer collects multiple samples of some count in each thread and
+ * then finally aggregates all of the samples from all of the threads. For most
+ * timers the printing code also provides an aggregation over the thread totals.
+ * These are printed as Total_foo. The count is normally a time (in ticks),
+ * hence the name "timer". (But can be any value, so we use this for "number of
+ * arguments passed to fork" as well). For timers the threads are not
+ * significant, it's the individual observations that count, so the statistics
+ * are at that level. Format is "macro(name, flags, arg)"
+ *
+ * The list ends by expanding KMP_FOREACH_DEVELOPER_TIMER(macro, arg), so the
+ * entries of that macro follow the ones written out here.
+ *
+ * @ingroup STATS_GATHERING2
+ */
+// clang-format off
+#define KMP_FOREACH_TIMER(macro, arg)                                          \
+  macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg)                 \
+  macro (OMP_parallel, stats_flags_e::logEvent, arg)                           \
+  macro (OMP_parallel_overhead, stats_flags_e::logEvent, arg)                  \
+  macro (OMP_teams, stats_flags_e::logEvent, arg)                              \
+  macro (OMP_teams_overhead, stats_flags_e::logEvent, arg)                     \
+  macro (OMP_loop_static, 0, arg)                                              \
+  macro (OMP_loop_static_scheduling, 0, arg)                                   \
+  macro (OMP_loop_dynamic, 0, arg)                                             \
+  macro (OMP_loop_dynamic_scheduling, 0, arg)                                  \
+  macro (OMP_distribute, 0, arg)                                               \
+  macro (OMP_distribute_scheduling, 0, arg)                                    \
+  macro (OMP_critical, 0, arg)                                                 \
+  macro (OMP_critical_wait, 0, arg)                                            \
+  macro (OMP_single, 0, arg)                                                   \
+  macro (OMP_master, 0, arg)                                                   \
+  macro (OMP_task_immediate, 0, arg)                                           \
+  macro (OMP_task_taskwait, 0, arg)                                            \
+  macro (OMP_task_taskyield, 0, arg)                                           \
+  macro (OMP_task_taskgroup, 0, arg)                                           \
+  macro (OMP_task_join_bar, 0, arg)                                            \
+  macro (OMP_task_plain_bar, 0, arg)                                           \
+  macro (OMP_taskloop_scheduling, 0, arg)                                      \
+  macro (OMP_plain_barrier, stats_flags_e::logEvent, arg)                      \
+  macro (OMP_idle, stats_flags_e::logEvent, arg)                               \
+  macro (OMP_fork_barrier, stats_flags_e::logEvent, arg)                       \
+  macro (OMP_join_barrier, stats_flags_e::logEvent, arg)                       \
+  macro (OMP_serial, stats_flags_e::logEvent, arg)                             \
+  macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal,  \
+         arg)                                                                  \
+  macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal,   \
+         arg)                                                                  \
+  macro (OMP_loop_static_iterations,                                           \
+         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
+  macro (OMP_loop_static_total_iterations,                                     \
+         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
+  macro (OMP_loop_dynamic_iterations,                                          \
+         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
+  macro (OMP_loop_dynamic_total_iterations,                                    \
+         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
+  macro (OMP_distribute_iterations,                                            \
+         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
+  KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
+// clang-format on
+
+// OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either
+//                           initializing OpenMP or being created by a master)
+//                           until the thread is destroyed
+// OMP_parallel           -- Time thread spends executing work directly
+//                           within a #pragma omp parallel
+// OMP_parallel_overhead  -- Time thread spends setting up a parallel region
+// OMP_loop_static        -- Time thread spends executing loop iterations from
+//                           a statically scheduled loop
+// OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations
+//                               from a statically scheduled loop
+// OMP_loop_dynamic       -- Time thread spends executing loop iterations from
+//                           a dynamically scheduled loop
+// OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations
+//                                from a dynamically scheduled loop
+// OMP_critical           -- Time thread spends executing critical section
+// OMP_critical_wait      -- Time thread spends waiting to enter
+//                           a critical section
+// OMP_single             -- Time spent executing a "single" region
+// OMP_master             -- Time spent executing a "master" region
+// OMP_task_immediate     -- Time spent executing non-deferred tasks
+// OMP_task_taskwait      -- Time spent executing tasks inside a taskwait
+//                           construct
+// OMP_task_taskyield     -- Time spent executing tasks inside a taskyield
+//                           construct
+// OMP_task_taskgroup     -- Time spent executing tasks inside a taskgroup
+//                           construct
+// OMP_task_join_bar      -- Time spent executing tasks inside a join barrier
+// OMP_task_plain_bar     -- Time spent executing tasks inside a barrier
+//                           construct
+// OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop
+//                            construct
+// OMP_plain_barrier      -- Time spent in a #pragma omp barrier construct or
+//                           inside implicit barrier at end of worksharing
+//                           construct
+// OMP_idle               -- Time worker threads spend waiting for next
+//                           parallel region
+// OMP_fork_barrier       -- Time spent in the fork barrier surrounding a
+//                           parallel region
+// OMP_join_barrier       -- Time spent in the join barrier surrounding a
+//                           parallel region
+// OMP_serial             -- Time thread zero spends executing serial code
+// OMP_set_numthreads     -- Values passed to omp_set_num_threads
+// OMP_PARALLEL_args      -- Number of arguments passed to a parallel region
+// OMP_loop_static_iterations -- Number of iterations thread is assigned for
+//                               statically scheduled loops
+// OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for
+//                                dynamically scheduled loops
+
+#if (KMP_DEVELOPER_STATS)
+// Timers which are of interest to runtime library developers, not end users.
+// These have to be explicitly enabled in addition to the other stats.
+
+// KMP_fork_barrier       -- time in __kmp_fork_barrier
+// KMP_join_barrier       -- time in __kmp_join_barrier
+// KMP_barrier            -- time in __kmp_barrier
+// KMP_end_split_barrier  -- time in __kmp_end_split_barrier
+// KMP_setup_icv_copy     -- time in __kmp_setup_icv_copy
+// KMP_icv_copy           -- start/stop timer for any ICV copying
+// KMP_linear_gather      -- time in __kmp_linear_barrier_gather
+// KMP_linear_release     -- time in __kmp_linear_barrier_release
+// KMP_tree_gather        -- time in __kmp_tree_barrier_gather
+// KMP_tree_release       -- time in __kmp_tree_barrier_release
+// KMP_hyper_gather       -- time in __kmp_hyper_barrier_gather
+// KMP_hyper_release      -- time in __kmp_hyper_barrier_release
+//
+// NOTE(review): the descriptions above and the names listed in the macro
+// below are not in one-to-one correspondence (KMP_fork_barrier,
+// KMP_join_barrier, KMP_barrier and KMP_icv_copy are described but not listed;
+// KMP_fork_call, KMP_join_call, KMP_hier_*, USER_*, KMP_allocate_team and
+// FOR_static_steal_* are listed but not described) -- confirm which is
+// current.
+//
+// KMP_FOREACH_DEVELOPER_TIMER(macro, arg) expands to one
+// macro(name, flags, arg) invocation per developer timer listed below. When
+// KMP_DEVELOPER_STATS is false it expands to nothing.
+// clang-format off
+#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)                                \
+  macro(KMP_fork_call, 0, arg)                                                 \
+  macro(KMP_join_call, 0, arg)                                                 \
+  macro(KMP_end_split_barrier, 0, arg)                                         \
+  macro(KMP_hier_gather, 0, arg)                                               \
+  macro(KMP_hier_release, 0, arg)                                              \
+  macro(KMP_hyper_gather, 0, arg)                                              \
+  macro(KMP_hyper_release, 0, arg)                                             \
+  macro(KMP_linear_gather, 0, arg)                                             \
+  macro(KMP_linear_release, 0, arg)                                            \
+  macro(KMP_tree_gather, 0, arg)                                               \
+  macro(KMP_tree_release, 0, arg)                                              \
+  macro(USER_resume, 0, arg)                                                   \
+  macro(USER_suspend, 0, arg)                                                  \
+  macro(KMP_allocate_team, 0, arg)                                             \
+  macro(KMP_setup_icv_copy, 0, arg)                                            \
+  macro(USER_icv_copy, 0, arg)                                                 \
+  macro (FOR_static_steal_stolen,                                              \
+         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
+  macro (FOR_static_steal_chunks,                                              \
+         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)
+#else
+#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
+#endif
+// clang-format on
+
+/*!
+ * \brief Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro.
+ *
+ * @param macro a user defined macro that takes three arguments -
+ * macro(TIMER_NAME, flags, arg)
+ * @param arg a user defined argument to send to the user defined macro
+ *
+ * \warning YOU MUST HAVE THE SAME NAMED TIMER UNDER KMP_FOREACH_TIMER() OR ELSE
+ * BAD THINGS WILL HAPPEN!
+ *
+ * \details Explicit timers are ones where we need to allocate a timer itself
+ * (as well as the accumulated timing statistics). We allocate these on a
+ * per-thread basis, and explicitly start and stop them. Block timers just
+ * allocate the timer itself on the stack, and use the destructor to notice
+ * block exit; they don't need to be defined here. The name here should be the
+ * same as that of a timer above.
+ *
+ * @ingroup STATS_GATHERING
+*/
+#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg)
+
+// ENUMERATE(name, ignore, prefix) pastes prefix and name into a single
+// enumerator followed by a comma; the second (flags) argument is unused.
+// It is only needed to build the enums below and is #undef'd afterwards.
+#define ENUMERATE(name, ignore, prefix) prefix##name,
+// One TIMER_<name> enumerator per KMP_FOREACH_TIMER entry. TIMER_LAST comes
+// last, so its value equals the number of timers.
+enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST };
+
+// One EXPLICIT_TIMER_<name> enumerator per KMP_FOREACH_EXPLICIT_TIMER entry,
+// terminated by EXPLICIT_TIMER_LAST.
+enum explicit_timer_e {
+  KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST
+};
+
+// One COUNTER_<name> enumerator per KMP_FOREACH_COUNTER entry, terminated by
+// COUNTER_LAST.
+enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST };
+#undef ENUMERATE
+
+/*
+ * A logarithmic histogram. It accumulates the number of values in each power of
+ * ten bin.  So 1<=x<10, 10<=x<100, ...
+ * Mostly useful where we have some big outliers and want to see information
+ * about them.
+ */
+class logHistogram {
+  enum {
+    numBins = 31, /* Number of powers of 10. If this changes you need to change
+                   * the initializer for binMax */
+
+    /*
+     * If you want to use this to analyse values that may be less than 1, (for
+     * instance times in s), then the logOffset gives you negative powers.
+     * In our case here, we're just looking at times in ticks, or counts, so we
+     * can never see values with magnitude < 1 (other than zero), so we can set
+     * it to 0.  As above change the initializer if you change this.
+     */
+    logOffset = 0
+  };
+  // Count kept separately from the power-of-ten bins; presumably the number of
+  // zero-valued samples (addSample() is defined elsewhere -- confirm).
+  uint32_t KMP_ALIGN_CACHE zeroCount;
+  // Sample count and running total for each bin.
+  struct {
+    uint32_t count;
+    double total;
+  } bins[numBins];
+
+  // Defined out of line; see the numBins/logOffset notes about its
+  // initializer.
+  static double binMax[numBins];
+
+#ifdef KMP_DEBUG
+  // Debug-only grand total, cross-checked against the individual counts.
+  uint64_t _total;
+
+  // Assert that zeroCount plus all bin counts equals _total.
+  void check() const {
+    uint64_t t = zeroCount;
+    for (int i = 0; i < numBins; i++)
+      t += bins[i].count;
+    KMP_DEBUG_ASSERT(t == _total);
+  }
+#else
+  void check() const {}
+#endif
+
+public:
+  // Start with every count and total at zero.
+  logHistogram() { reset(); }
+
+  // Copy all accumulated state from o. zeroCount has to be copied along with
+  // the bins: leaving it uninitialized would feed an indeterminate value into
+  // later operator+= calls and into check() in debug builds.
+  logHistogram(logHistogram const &o) {
+    zeroCount = o.zeroCount;
+    for (int i = 0; i < numBins; i++)
+      bins[i] = o.bins[i];
+#ifdef KMP_DEBUG
+    _total = o._total;
+#endif
+  }
+
+  // Zero zeroCount, every bin and (in debug builds) _total.
+  void reset() {
+    zeroCount = 0;
+    for (int i = 0; i < numBins; i++) {
+      bins[i].count = 0;
+      bins[i].total = 0;
+    }
+
+#ifdef KMP_DEBUG
+    _total = 0;
+#endif
+  }
+  // Accessors take a bin number relative to logOffset; b is not range checked.
+  uint32_t count(int b) const { return bins[b + logOffset].count; }
+  double total(int b) const { return bins[b + logOffset].total; }
+  static uint32_t findBin(double sample);
+
+  // Accumulate the counts and totals of o into this histogram, bin by bin.
+  logHistogram &operator+=(logHistogram const &o) {
+    zeroCount += o.zeroCount;
+    for (int i = 0; i < numBins; i++) {
+      bins[i].count += o.bins[i].count;
+      bins[i].total += o.bins[i].total;
+    }
+#ifdef KMP_DEBUG
+    _total += o._total;
+    check();
+#endif
+
+    return *this;
+  }
+
+  // Defined out of line.
+  void addSample(double sample);
+  int minBin() const;
+  int maxBin() const;
+
+  std::string format(char) const;
+};
+
+// Running summary of a stream of samples: minimum, maximum, mean, m2 and
+// sample count, plus a logHistogram member.
+class statistic {
+  double KMP_ALIGN_CACHE minVal;
+  double maxVal;
+  double meanVal;
+  double m2; // getSD() computes sqrt(m2 / sampleCount)
+  uint64_t sampleCount;
+  double offset; // set via setOffset(); its use is not visible in this header
+  bool collectingHist; // reported by haveHist()
+  logHistogram hist;
+
+public:
+  // Start from the reset() state and store doHist as the collectingHist flag
+  // (defaults to KMP_STATS_HIST).
+  statistic(bool doHist = bool(KMP_STATS_HIST)) {
+    reset();
+    collectingHist = doHist;
+  }
+  // Member-wise copy, including the histogram.
+  statistic(statistic const &o)
+      : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2),
+        sampleCount(o.sampleCount), offset(o.offset),
+        collectingHist(o.collectingHist), hist(o.hist) {}
+  // Build a statistic from already-computed summary values; m2 is derived
+  // from the standard deviation sd as sd * sd * sc. collectingHist is false
+  // and hist is default-constructed.
+  statistic(double minv, double maxv, double meanv, uint64_t sc, double sd)
+      : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc),
+        sampleCount(sc), offset(0.0), collectingHist(false) {}
+  bool haveHist() const { return collectingHist; }
+  double getMin() const { return minVal; }
+  double getMean() const { return meanVal; }
+  double getMax() const { return maxVal; }
+  uint64_t getCount() const { return sampleCount; }
+  // NOTE(review): divides by sampleCount, so the reset state (m2 == 0,
+  // sampleCount == 0) evaluates 0/0 -- confirm callers guard against empty
+  // statistics.
+  double getSD() const { return sqrt(m2 / sampleCount); }
+  double getTotal() const { return sampleCount * meanVal; }
+  logHistogram const *getHist() const { return &hist; }
+  void setOffset(double d) { offset = d; }
+
+  // Return to the empty state: minVal/maxVal become +/- the largest finite
+  // double, the remaining numeric fields become zero and the histogram is
+  // reset. collectingHist is left unchanged.
+  void reset() {
+    minVal = std::numeric_limits<double>::max();
+    maxVal = -minVal;
+    meanVal = 0.0;
+    m2 = 0.0;
+    sampleCount = 0;
+    offset = 0.0;
+    hist.reset();
+  }
+  // Defined out of line.
+  void addSample(double sample);
+  void scale(double factor);
+  // Divide by f, implemented as scale(1 / f).
+  void scaleDown(double f) { scale(1. / f); }
+  // Overwrite the sample count without touching any other field.
+  void forceCount(uint64_t count) { sampleCount = count; }
+  statistic &operator+=(statistic const &other);
+
+  std::string format(char unit, bool total = false) const;
+  // Format the histogram only; forwards to logHistogram::format().
+  std::string formatHist(char unit) const { return hist.format(unit); }
+};
+
+// Static description of one timer or counter: its printable name and a flags
+// word that timeStat and counter test against stats_flags_e bits.
+struct statInfo {
+  const char *name;
+  uint32_t flags;
+};
+
+// A statistic that belongs to a timer; adds static lookups into the
+// per-timer timerInfo[] table (name and stats_flags_e bits), indexed by
+// timer_e.
+class timeStat : public statistic {
+  static statInfo timerInfo[];
+
+  // True when at least one bit of mask is set in the flags stored for timer e.
+  static bool hasFlag(timer_e e, uint32_t mask) {
+    return (timerInfo[e].flags & mask) != 0;
+  }
+
+public:
+  timeStat() : statistic() {}
+
+  // Printable name of timer e.
+  static const char *name(timer_e e) {
+    const statInfo &info = timerInfo[e];
+    return info.name;
+  }
+
+  // One predicate per stats_flags_e bit.
+  static bool noTotal(timer_e e) { return hasFlag(e, stats_flags_e::noTotal); }
+  static bool masterOnly(timer_e e) {
+    return hasFlag(e, stats_flags_e::onlyInMaster);
+  }
+  static bool workerOnly(timer_e e) {
+    return hasFlag(e, stats_flags_e::notInMaster);
+  }
+  static bool noUnits(timer_e e) { return hasFlag(e, stats_flags_e::noUnits); }
+  static bool logEvent(timer_e e) {
+    return hasFlag(e, stats_flags_e::logEvent);
+  }
+
+  // Clear the logEvent bit of every timer, leaving all other flags intact.
+  static void clearEventFlags() {
+    const uint32_t keepMask = ~(stats_flags_e::logEvent);
+    for (statInfo *info = timerInfo; info != timerInfo + TIMER_LAST; ++info)
+      info->flags &= keepMask;
+  }
+};
+
+// Where we need explicitly to start and end the timer, this version can be used
+// Since these timers normally aren't nicely scoped, so don't have a good place
+// to live on the stack of the thread, they're more work to use.
+class explicitTimer {
+  // Statistic supplied at construction; presumably updated by stop(), which
+  // is defined elsewhere -- confirm.
+  timeStat *stat;
+  timer_e timerEnumValue; // returned by get_type()
+  tsc_tick_count startTime;
+  // Tick recorded by the most recent pause().
+  tsc_tick_count pauseStartTime;
+  // Sum of the (resume tick - pause tick) intervals seen so far.
+  tsc_tick_count::tsc_interval_t totalPauseTime;
+
+public:
+  // Bind the timer to statistic s and timer id te; all time fields start out
+  // default/zero initialized.
+  explicitTimer(timeStat *s, timer_e te)
+      : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0),
+        totalPauseTime() {}
+
+  // void setStat(timeStat *s) { stat = s; }
+  // start() and stop() are defined out of line.
+  void start(tsc_tick_count tick);
+  // Remember when the pause began.
+  void pause(tsc_tick_count tick) { pauseStartTime = tick; }
+  // Add the time elapsed since the matching pause() to totalPauseTime.
+  void resume(tsc_tick_count tick) {
+    totalPauseTime += (tick - pauseStartTime);
+  }
+  void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr);
+  // Zero the three time fields; stat and timerEnumValue are kept.
+  void reset() {
+    startTime = 0;
+    pauseStartTime = 0;
+    totalPauseTime = 0;
+  }
+  timer_e get_type() const { return timerEnumValue; }
+};
+
+// Where you need to partition a thread's clock ticks into separate states
+// e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and
+// DOING_NOTHING would render these conditions:
+// time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive
+// No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice
+// versa
+class partitionedTimers {
+private:
+  // Timers managed by this object. All member functions are defined out of
+  // line, so the notes below are hedged from names and call sites only.
+  std::vector<explicitTimer> timer_stack;
+
+public:
+  partitionedTimers();
+  // Used by KMP_INIT_PARTITIONED_TIMERS; presumably installs the first timer
+  // -- confirm against the implementation.
+  void init(explicitTimer timer);
+  // Used by KMP_EXCHANGE_PARTITIONED_TIMER; presumably replaces the current
+  // timer with the given one -- confirm.
+  void exchange(explicitTimer timer);
+  // push()/pop() are paired by blockPartitionedTimer (constructor pushes,
+  // destructor pops).
+  void push(explicitTimer timer);
+  void pop();
+  // NOTE(review): semantics not visible here; presumably stops the timers
+  // still on the stack -- confirm.
+  void windup();
+};
+
+// Special wrapper around the partitioned timers to aid timing code blocks
+// It avoids the need to have an explicit end, leaving the scope suffices.
+class blockPartitionedTimer {
+  partitionedTimers *part_timers;
+
+public:
+  // Remember the partitioned timers and push `timer` onto them right away.
+  blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer) {
+    part_timers = pt;
+    part_timers->push(timer);
+  }
+
+  // Pop the timer pushed by the constructor when the scope is left.
+  ~blockPartitionedTimer() {
+    part_timers->pop();
+  }
+};
+
+// Special wrapper around the thread state to aid in keeping state in code
+// blocks It avoids the need to have an explicit end, leaving the scope
+// suffices.
+class blockThreadState {
+  stats_state_e *state_pointer;
+  stats_state_e old_state;
+
+public:
+  // Save the state currently stored at thread_state_pointer, then overwrite
+  // it with new_state.
+  blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state)
+      : state_pointer(thread_state_pointer) {
+    old_state = *state_pointer;
+    *state_pointer = new_state;
+  }
+
+  // Put the saved state back when the scope is left.
+  ~blockThreadState() {
+    *state_pointer = old_state;
+  }
+};
+
+// If all you want is a count, then you can use this...
+// The individual per-thread counts will be aggregated into a statistic at
+// program exit.
+class counter {
+  uint64_t value;
+  static const statInfo counterInfo[];
+
+public:
+  // A new counter starts at zero.
+  counter() : value(0) {}
+
+  // Add one to the count.
+  void increment() { ++value; }
+
+  uint64_t getValue() const { return value; }
+
+  // Go back to zero.
+  void reset() { value = 0; }
+
+  // Printable name of counter e, taken from counterInfo[].
+  static const char *name(counter_e e) {
+    const statInfo &info = counterInfo[e];
+    return info.name;
+  }
+
+  // True when counter e carries the onlyInMaster flag.
+  static bool masterOnly(counter_e e) {
+    return (counterInfo[e].flags & stats_flags_e::onlyInMaster) != 0;
+  }
+};
+
+/* ****************************************************************
+    Class to implement an event
+
+    There are four components to an event: start time, stop time
+    nest_level, and timer_name.
+    The start and stop time should be obvious (recorded in clock ticks).
+    The nest_level relates to the bar width in the timeline graph.
+    The timer_name is used to determine which timer event triggered this event.
+
+    the interface to this class is through four read-only operations:
+    1) getStart()     -- returns the start time as 64 bit integer
+    2) getStop()      -- returns the stop time as 64 bit integer
+    3) getNestLevel() -- returns the nest level of the event
+    4) getTimerName() -- returns the timer name that triggered event
+
+    *MORE ON NEST_LEVEL*
+    The nest level is used in the bar graph that represents the timeline.
+    Its main purpose is for showing how events are nested inside each other.
+    For example, say events, A, B, and C are recorded.  If the timeline
+    looks like this:
+
+Begin -------------------------------------------------------------> Time
+         |    |          |        |          |              |
+         A    B          C        C          B              A
+       start start     start     end        end            end
+
+       Then A, B, C will have a nest level of 1, 2, 3 respectively.
+       These values are then used to calculate the barwidth so you can
+       see that inside A, B has occurred, and inside B, C has occurred.
+       Currently, this is shown with A's bar width being larger than B's
+       bar width, and B's bar width being larger than C's bar width.
+
+**************************************************************** */
+class kmp_stats_event {
+  uint64_t start;
+  uint64_t stop;
+  int nest_level;
+  timer_e timer_name;
+
+public:
+  // Empty event: zero times, nest level 0, and TIMER_LAST as the timer.
+  kmp_stats_event()
+      : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {}
+
+  // Event covering [strt, stp] at nest level nst, triggered by timer nme.
+  kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme)
+      : start(strt), stop(stp), nest_level(nst), timer_name(nme) {}
+
+  // Read-only accessors (member functions defined in-class are implicitly
+  // inline).
+  uint64_t getStart() const { return start; }
+  uint64_t getStop() const { return stop; }
+  int getNestLevel() const { return nest_level; }
+  timer_e getTimerName() const { return timer_name; }
+};
+
+/* ****************************************************************
+    Class to implement a dynamically expandable array of events
+
+    ---------------------------------------------------------
+    | event 1 | event 2 | event 3 | event 4 | ... | event N |
+    ---------------------------------------------------------
+
+    An event is pushed onto the back of this array at every
+    explicitTimer->stop() call.  The event records the start time,
+    stop time, nest level (related to the bar width), and timer name.
+
+    The event vector starts at size INIT_SIZE and grows (doubles in size)
+    if needed.  An implication of this behavior is that log(N)
+    reallocations are needed (where N is number of events).  If you want
+    to avoid reallocations, then set INIT_SIZE to a large value.
+
+    the interface to this class is through six operations:
+    1) reset() -- sets the internal_size back to 0 but does not deallocate any
+       memory
+    2) size()  -- returns the number of valid elements in the vector
+    3) push_back(start, stop, nest, timer_name) -- pushes an event onto
+       the back of the array
+    4) deallocate() -- frees all memory associated with the vector
+    5) sort() -- sorts the vector by start time
+    6) operator[index] or at(index) -- returns event reference at that index
+**************************************************************** */
+class kmp_stats_event_vector {
+  kmp_stats_event *events;
+  int internal_size;
+  int allocated_size;
+  static const int INIT_SIZE = 1024;
+
+public:
+  // Allocate room for INIT_SIZE events; the vector starts out empty.
+  kmp_stats_event_vector()
+      : events((kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) *
+                                                 INIT_SIZE)),
+        internal_size(0), allocated_size(INIT_SIZE) {}
+
+  // Nothing is released here; deallocate() is declared separately below.
+  ~kmp_stats_event_vector() {}
+
+  // Forget all stored events but keep the allocated buffer.
+  void reset() { internal_size = 0; }
+
+  // Number of valid events.
+  int size() const { return internal_size; }
+
+  // Append one event, doubling the buffer first when it is full.
+  void push_back(uint64_t start_time, uint64_t stop_time, int nest_level,
+                 timer_e name) {
+    if (internal_size == allocated_size) {
+      // Full: copy the existing events into a buffer twice as large.
+      kmp_stats_event *grown = (kmp_stats_event *)__kmp_allocate(
+          sizeof(kmp_stats_event) * allocated_size * 2);
+      for (int idx = 0; idx != internal_size; ++idx)
+        grown[idx] = events[idx];
+      __kmp_free(events);
+      events = grown;
+      allocated_size *= 2;
+    }
+    kmp_stats_event &slot = events[internal_size];
+    slot = kmp_stats_event(start_time, stop_time, nest_level, name);
+    ++internal_size;
+  }
+
+  // Defined out of line.
+  void deallocate();
+  void sort();
+
+  // Unchecked element access; at() is the same as operator[].
+  const kmp_stats_event &operator[](int index) const { return events[index]; }
+  kmp_stats_event &operator[](int index) { return events[index]; }
+  const kmp_stats_event &at(int index) const { return (*this)[index]; }
+  kmp_stats_event &at(int index) { return (*this)[index]; }
+};
+
+/* ****************************************************************
+    Class to implement a doubly-linked, circular, statistics list
+
+    |---| ---> |---| ---> |---| ---> |---| ---> ... next
+    |   |      |   |      |   |      |   |
+    |---| <--- |---| <--- |---| <--- |---| <--- ... prev
+    Sentinel   first      second     third
+    Node       node       node       node
+
+    The Sentinel Node is the user handle on the list.
+    The first node corresponds to thread 0's statistics.
+    The second node corresponds to thread 1's statistics and so on...
+
+    Each node has a _timers, _counters, and _explicitTimers array to hold that
+    thread's statistics. The _explicitTimers point to the correct _timer and
+    update its statistics at every stop() call. The explicitTimers' pointers are
+    set up in the constructor. Each node also has an event vector to hold that
+    thread's timing events. The event vector expands as necessary and records
+    the start-stop times for each timer.
+
+    The nestLevel variable is for plotting events and is related
+    to the bar width in the timeline graph.
+
+    Every thread will have a thread local pointer to its node in
+    the list.  The sentinel node is used by the master thread to
+    store "dummy" statistics before __kmp_create_worker() is called.
+**************************************************************** */
+class kmp_stats_list {
+  // NOTE(review): gtid is not initialized by the constructor; it is only
+  // written by setGtid() in this header -- confirm every node gets one before
+  // getGtid() is used.
+  int gtid;
+  timeStat _timers[TIMER_LAST + 1];
+  counter _counters[COUNTER_LAST + 1];
+  // Bound to _timers[TIMER_OMP_worker_thread_life]; driven by startLife() and
+  // endLife().
+  explicitTimer thread_life_timer;
+  partitionedTimers _partitionedTimers;
+  int _nestLevel; // one per thread
+  kmp_stats_event_vector _event_vector;
+  kmp_stats_list *next;
+  kmp_stats_list *prev;
+  stats_state_e state;
+  int thread_is_idle_flag; // 1 when idle, 0 otherwise (see isIdle())
+
+public:
+  // A fresh node links to itself in both directions, starts in the IDLE
+  // state with nest level 0 and the idle flag cleared.
+  kmp_stats_list()
+      : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life],
+                          TIMER_OMP_worker_thread_life),
+        _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE),
+        thread_is_idle_flag(0) {}
+  ~kmp_stats_list() {}
+  // Pointer to a single timer / counter; idx is not range checked.
+  inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; }
+  inline counter *getCounter(counter_e idx) { return &_counters[idx]; }
+  inline partitionedTimers *getPartitionedTimers() {
+    return &_partitionedTimers;
+  }
+  // Pointers to the first element of the whole timer / counter arrays.
+  inline timeStat *getTimers() { return _timers; }
+  inline counter *getCounters() { return _counters; }
+  inline kmp_stats_event_vector &getEventVector() { return _event_vector; }
+  // Start / stop the thread-life timer at the current tick.
+  inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); }
+  inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); }
+  // Drops the recorded events; the event buffer itself stays allocated.
+  inline void resetEventVector() { _event_vector.reset(); }
+  inline void incrementNestValue() { _nestLevel++; }
+  inline int getNestValue() { return _nestLevel; }
+  inline void decrementNestValue() { _nestLevel--; }
+  inline int getGtid() const { return gtid; }
+  inline void setGtid(int newgtid) { gtid = newgtid; }
+  inline void setState(stats_state_e newstate) { state = newstate; }
+  inline stats_state_e getState() const { return state; }
+  // Address of the state field, as taken by blockThreadState.
+  inline stats_state_e *getStatePointer() { return &state; }
+  inline bool isIdle() { return thread_is_idle_flag == 1; }
+  inline void setIdleFlag() { thread_is_idle_flag = 1; }
+  inline void resetIdleFlag() { thread_is_idle_flag = 0; }
+  kmp_stats_list *push_back(int gtid); // returns newly created list node
+  // Append one event to this node's event vector.
+  inline void push_event(uint64_t start_time, uint64_t stop_time,
+                         int nest_level, timer_e name) {
+    _event_vector.push_back(start_time, stop_time, nest_level, name);
+  }
+  // The remaining members are defined out of line.
+  void deallocate();
+  class iterator;
+  kmp_stats_list::iterator begin();
+  kmp_stats_list::iterator end();
+  int size();
+  // Bidirectional iterator over list nodes; dereferencing yields a
+  // kmp_stats_list pointer. Only begin() and end() may set ptr directly.
+  class iterator {
+    kmp_stats_list *ptr;
+    friend kmp_stats_list::iterator kmp_stats_list::begin();
+    friend kmp_stats_list::iterator kmp_stats_list::end();
+
+  public:
+    iterator();
+    ~iterator();
+    iterator operator++();
+    iterator operator++(int dummy);
+    iterator operator--();
+    iterator operator--(int dummy);
+    bool operator!=(const iterator &rhs);
+    bool operator==(const iterator &rhs);
+    kmp_stats_list *operator*() const; // dereference operator
+  };
+};
+
+/* ****************************************************************
+   Class to encapsulate all output functions and the environment variables
+
+   This module holds filenames for various outputs (normal stats, events, plot
+   file), as well as coloring information for the plot file.
+
+   The filenames and flags variables are read from environment variables.
+   These are read once by the constructor of the global variable
+   __kmp_stats_output which calls init().
+
+   During this init() call, event flags for the timeStat::timerInfo[] global
+   array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes).
+
+   The only interface function that is public is outputStats(heading).  This
+   function should print out everything it needs to, either to files or stderr,
+   depending on the environment variables described below
+
+   ENVIRONMENT VARIABLES:
+   KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this
+                     file, otherwise, print to stderr
+   KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to
+                        either KMP_STATS_FILE or stderr
+   KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename,
+                          otherwise, the plot file is sent to "events.plt"
+   KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log
+                       events
+   KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file,
+                            otherwise, output is sent to "events.dat"
+**************************************************************** */
+class kmp_stats_output_module {
+
+public:
+  // A color as three float components: red, green, blue.
+  struct rgb_color {
+    float r;
+    float g;
+    float b;
+  };
+
+private:
+  // Output destinations and flags; per the class comment they are filled in
+  // from environment variables by init().
+  std::string outputFileName;
+  static const char *eventsFileName;
+  static const char *plotFileName;
+  static int printPerThreadFlag;
+  static int printPerThreadEventsFlag;
+  static const rgb_color globalColorArray[];
+  // Indexed by timer_e; see getEventColor().
+  static rgb_color timerColorInfo[];
+
+  // All of the helpers below except the two inline ones are defined out of
+  // line.
+  void init();
+  static void setupEventColors();
+  static void printPloticusFile();
+  static void printHeaderInfo(FILE *statsOut);
+  static void printTimerStats(FILE *statsOut, statistic const *theStats,
+                              statistic const *totalStats);
+  static void printCounterStats(FILE *statsOut, statistic const *theStats);
+  static void printCounters(FILE *statsOut, counter const *theCounters);
+  static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents,
+                          int gtid);
+  // Color assigned to timer e.
+  static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; }
+  static void windupExplicitTimers();
+  // True when printPerThreadEventsFlag is non-zero.
+  bool eventPrintingEnabled() const { return printPerThreadEventsFlag; }
+
+public:
+  // Construction runs init().
+  kmp_stats_output_module() { init(); }
+  void outputStats(const char *heading);
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+void __kmp_stats_init();
+void __kmp_stats_fini();
+void __kmp_reset_stats();
+void __kmp_output_stats(const char *);
+void __kmp_accumulate_stats_at_exit(void);
+// thread local pointer to stats node within list
+extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr;
+// head to stats list.
+extern kmp_stats_list *__kmp_stats_list;
+// lock for __kmp_stats_list
+extern kmp_tas_lock_t __kmp_stats_lock;
+// reference start time
+extern tsc_tick_count __kmp_stats_start_time;
+// interface to output
+extern kmp_stats_output_module __kmp_stats_output;
+
+#ifdef __cplusplus
+}
+#endif
+
+// Simple, standard interfaces that drop out completely if stats aren't enabled
+
+/*!
+ * \brief Adds value to specified timer (name).
+ *
+ * @param name timer name as specified under the KMP_FOREACH_TIMER() macro
+ * @param value double precision sample value to add to statistics for the timer
+ *
+ * \details Use KMP_COUNT_VALUE(name, value) macro to add a particular value to
+ * a timer statistics.
+ *
+ * @ingroup STATS_GATHERING
+*/
+#define KMP_COUNT_VALUE(name, value)                                           \
+  __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample(value)
+
+/*!
+ * \brief Increments specified counter (name).
+ *
+ * @param name counter name as specified under the KMP_FOREACH_COUNTER() macro
+ *
+ * \details Use the KMP_COUNT_BLOCK(name) macro to increment a statistics
+ * counter for the executing thread.
+ *
+ * @ingroup STATS_GATHERING
+*/
+#define KMP_COUNT_BLOCK(name)                                                  \
+  __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment()
+
+/*!
+ * \brief Outputs the current thread statistics and reset them.
+ *
+ * @param heading_string heading put above the final stats output
+ *
+ * \details Explicitly stops all timers and outputs all stats. Environment
+ * variable, `KMP_STATS_FILE=filename`, can be used to output the stats to a
+ * file instead of stderr. Environment variable, `KMP_STATS_THREADS`, can be
+ * used to output thread specific stats. As described in the
+ * kmp_stats_output_module notes, setting `KMP_STATS_THREADS` to "on" prints
+ * thread specific stats; if it is undefined (not specified in the
+ * environment), thread specific stats won't be printed. It should be noted
+ * that all statistics are reset when this macro is called.
+ *
+ * @ingroup STATS_GATHERING
+*/
+#define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string)
+
+/*!
+ * \brief Initializes the partitioned timers to begin with name.
+ *
+ * @param name timer which you want this thread to begin with
+ *
+ * @ingroup STATS_GATHERING
+*/
+#define KMP_INIT_PARTITIONED_TIMERS(name)                                      \
+  __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer(          \
+      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
+
+// Declares a blockPartitionedTimer local named __PBLOCKTIME__ for timer
+// `name`: the timer is pushed on the calling thread's partitioned timers here
+// and popped when the enclosing scope is left.
+#define KMP_TIME_PARTITIONED_BLOCK(name)                                       \
+  blockPartitionedTimer __PBLOCKTIME__(                                        \
+      __kmp_stats_thread_ptr->getPartitionedTimers(),                          \
+      explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name),            \
+                    TIMER_##name))
+
+// Pushes timer `name` on the calling thread's partitioned timers; pair with
+// KMP_POP_PARTITIONED_TIMER().
+#define KMP_PUSH_PARTITIONED_TIMER(name)                                       \
+  __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer(          \
+      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
+
+// Calls pop() on the calling thread's partitioned timers. Takes no arguments.
+#define KMP_POP_PARTITIONED_TIMER()                                            \
+  __kmp_stats_thread_ptr->getPartitionedTimers()->pop()
+
+// Passes timer `name` to exchange() on the calling thread's partitioned
+// timers.
+#define KMP_EXCHANGE_PARTITIONED_TIMER(name)                                   \
+  __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer(      \
+      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
+
+// Sets the calling thread's stats state to state_name.
+#define KMP_SET_THREAD_STATE(state_name)                                       \
+  __kmp_stats_thread_ptr->setState(state_name)
+
+// Evaluates to the calling thread's current stats state.
+#define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState()
+
+// Declares a blockThreadState local named __BTHREADSTATE__: the state is set
+// to state_name here and the previous state is restored when the enclosing
+// scope is left.
+#define KMP_SET_THREAD_STATE_BLOCK(state_name)                                 \
+  blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \
+                                    state_name)
+
+/*!
+ * \brief resets all stats (counters to 0, timers to 0 elapsed ticks)
+ *
+ * \details Reset all stats for all threads.
+ *
+ * @ingroup STATS_GATHERING
+*/
+#define KMP_RESET_STATS() __kmp_reset_stats()
+
+// Developer-only variants of the stats macros: they forward to the regular
+// macros when KMP_DEVELOPER_STATS is enabled and compile to nothing otherwise.
+#if (KMP_DEVELOPER_STATS)
+#define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v)
+#define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n)
+#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n)
+#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) KMP_PUSH_PARTITIONED_TIMER(n)
+// KMP_POP_PARTITIONED_TIMER() takes no arguments, so n must not be forwarded:
+// passing a non-empty n to it would be a preprocessing error.
+#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) KMP_POP_PARTITIONED_TIMER()
+#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n)                            \
+  KMP_EXCHANGE_PARTITIONED_TIMER(n)
+#else
+// Null definitions
+#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
+#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
+#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
+#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
+#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
+#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
+#endif
+
+#else // KMP_STATS_ENABLED
+
+// Null definitions: with stats disabled every stats macro expands to a no-op
+// expression, so call sites need no #if guards. Each macro defined in the
+// stats-enabled branch should have a counterpart here.
+#define KMP_COUNT_VALUE(n, v) ((void)0)
+#define KMP_COUNT_BLOCK(n) ((void)0)
+
+#define KMP_OUTPUT_STATS(heading_string) ((void)0)
+#define KMP_RESET_STATS() ((void)0)
+
+#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
+#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
+#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
+#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
+#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
+#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
+#define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0)
+#define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0)
+#define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0)
+#define KMP_POP_PARTITIONED_TIMER() ((void)0)
+// Previously missing: without it, any use of KMP_EXCHANGE_PARTITIONED_TIMER
+// fails to compile when stats are disabled.
+#define KMP_EXCHANGE_PARTITIONED_TIMER(name) ((void)0)
+#define KMP_SET_THREAD_STATE(state_name) ((void)0)
+#define KMP_GET_THREAD_STATE() ((void)0)
+#define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0)
+#endif // KMP_STATS_ENABLED
+
+#endif // KMP_STATS_H
diff --git a/final/runtime/src/kmp_stats_timing.cpp b/final/runtime/src/kmp_stats_timing.cpp
new file mode 100644
index 0000000..bdfe68c
--- /dev/null
+++ b/final/runtime/src/kmp_stats_timing.cpp
@@ -0,0 +1,130 @@
+/** @file kmp_stats_timing.cpp
+ * Timing functions
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+
+#include "kmp.h"
+#include "kmp_stats_timing.h"
+
+using namespace std;
+
+#if KMP_HAVE_TICK_TIME
+#if KMP_MIC
+double tsc_tick_count::tick_time() {
+  // No frequency probe on MIC: the clock is simply assumed to run at 1GHz
+  // (a pretty bad assumption), so one tick is taken to last 1/1e9 seconds.
+  const double assumed_hz = (double)1000 * 1.e6;
+  return 1 / assumed_hz;
+}
+#elif KMP_ARCH_X86 || KMP_ARCH_X86_64
+#include <string.h>
+// Return the TSC period in seconds, derived from the nominal frequency that
+// the CPUID processor brand string advertises (e.g. "... @ 3.40GHz"). The
+// value is computed once and cached. Exits the process if the brand string
+// does not end in a recognisable "<number><M|G|T>Hz" suffix.
+double tsc_tick_count::tick_time() {
+  static double result = 0.0; // Cached seconds per tick; 0.0 = not computed.
+
+  if (result == 0.0) {
+    kmp_cpuid_t cpuinfo;
+    char brand[256];
+
+    // Leaf 0x80000000 returns the highest extended leaf in eax. The brand
+    // string occupies leaves 0x80000002..0x80000004 only, so read just those;
+    // walking every extended leaf overruns brand[] on CPUs with many leaves.
+    __kmp_x86_cpuid(0x80000000, 0, &cpuinfo);
+    memset(brand, 0, sizeof(brand));
+    unsigned int max_leaf = (unsigned int)cpuinfo.eax;
+
+    for (unsigned int i = 2; i <= 4 && (i | 0x80000000) <= max_leaf; i++)
+      __kmp_x86_cpuid(i | 0x80000000, 0,
+                      (kmp_cpuid_t *)(brand + (i - 2) * sizeof(kmp_cpuid_t)));
+
+    // The unit letter sits three characters from the end ("GHz"). Guard the
+    // arithmetic so a short or empty string cannot point before the buffer.
+    size_t len = KMP_STRLEN(brand);
+    char *end = (len >= 3) ? brand + len - 3 : brand;
+    uint64_t multiplier;
+
+    if (*end == 'M')
+      multiplier = 1000LL * 1000LL;
+    else if (*end == 'G')
+      multiplier = 1000LL * 1000LL * 1000LL;
+    else if (*end == 'T')
+      multiplier = 1000LL * 1000LL * 1000LL * 1000LL;
+    else {
+      cout << "Error determining multiplier '" << *end << "'\n";
+      exit(-1);
+    }
+    *end = 0; // Drop the unit so the number is the last token.
+    while (end > brand && *end != ' ') // Stop at the front of the buffer.
+      end--;
+    if (*end == ' ')
+      end++;
+
+    double freq = strtod(end, NULL);
+    if (freq == 0.0) {
+      cout << "Error calculating frequency " << end << "\n";
+      exit(-1);
+    }
+
+    result = ((double)1.0) / (freq * multiplier);
+  }
+  return result;
+}
+#endif
+#endif
+
+static bool useSI = true;
+
+// Format interval in engineering style: the number with two decimals,
+// right-aligned in width - 3 columns, followed by a two-column SI prefix
+// (e.g. m, u, n) and the one-character unit. Values outside the prefix table,
+// or any value when SI scaling is disabled, are printed unscaled.
+std::string formatSI(double interval, int width, char unit) {
+  std::stringstream os;
+
+  if (useSI) {
+    // Preserve accuracy for small numbers, since we only multiply and the
+    // positive powers of ten are precisely representable.
+    static const struct {
+      double scale;
+      char prefix;
+    } ranges[] = {{1.e21, 'y'},  {1.e18, 'z'},  {1.e15, 'a'},  {1.e12, 'f'},
+                  {1.e9, 'p'},   {1.e6, 'n'},   {1.e3, 'u'},   {1.0, 'm'},
+                  {1.e-3, ' '},  {1.e-6, 'k'},  {1.e-9, 'M'},  {1.e-12, 'G'},
+                  {1.e-15, 'T'}, {1.e-18, 'P'}, {1.e-21, 'E'}, {1.e-24, 'Z'},
+                  {1.e-27, 'Y'}};
+
+    if (interval == 0.0) {
+      os << std::setw(width - 3) << std::right << "0.00" << std::setw(3)
+         << unit;
+      return os.str();
+    }
+
+    // Scale the magnitude only; interval keeps its sign so that a value too
+    // large for the table still prints as negative in the fallback below.
+    const bool negative = interval < 0.0;
+    const double magnitude = negative ? -interval : interval;
+
+    for (int i = 0; i < (int)(sizeof(ranges) / sizeof(ranges[0])); i++) {
+      // The first range whose scaled magnitude drops below 1 picks the prefix.
+      if (magnitude * ranges[i].scale < 1.e0) {
+        const double scaled = magnitude * 1000.e0 * ranges[i].scale;
+        os << std::fixed << std::setprecision(2) << std::setw(width - 3)
+           << std::right << (negative ? -scaled : scaled) << std::setw(2)
+           << ranges[i].prefix << std::setw(1) << unit;
+
+        return os.str();
+      }
+    }
+  }
+  // Plain fixed-point output: SI scaling disabled or value beyond the table.
+  os << std::setprecision(2) << std::fixed << std::right << std::setw(width - 3)
+     << interval << std::setw(3) << unit;
+
+  return os.str();
+}
diff --git a/final/runtime/src/kmp_stats_timing.h b/final/runtime/src/kmp_stats_timing.h
new file mode 100644
index 0000000..f3428b3
--- /dev/null
+++ b/final/runtime/src/kmp_stats_timing.h
@@ -0,0 +1,115 @@
+#ifndef KMP_STATS_TIMING_H
+#define KMP_STATS_TIMING_H
+
+/** @file kmp_stats_timing.h
+ * Access to real time clock and timers.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp_os.h"
+#include <limits>
+#include <stdint.h>
+#include <string>
+#if KMP_HAVE_X86INTRIN_H
+#include <x86intrin.h>
+#endif
+
+// A single reading of the CPU cycle counter, with helpers for ordering two
+// readings and for measuring the interval between them.
+class tsc_tick_count {
+private:
+  int64_t my_count; // Raw counter value held by this object.
+
+public:
+  // A signed difference between two readings, measured in ticks.
+  class tsc_interval_t {
+    int64_t value;
+    // Private: only the friends declared below build an interval from a raw
+    // tick count.
+    explicit tsc_interval_t(int64_t _value) : value(_value) {}
+
+  public:
+    tsc_interval_t() : value(0) {} // Construct 0 time duration
+#if KMP_HAVE_TICK_TIME
+    double seconds() const; // Return the length of a time interval in seconds
+#endif
+    double ticks() const { return double(value); } // Length in ticks.
+    int64_t getValue() const { return value; }
+    // Overwrite the interval with a raw tick count.
+    tsc_interval_t &operator=(int64_t nvalue) {
+      value = nvalue;
+      return *this;
+    }
+
+    friend class tsc_tick_count;
+
+    friend tsc_interval_t operator-(const tsc_tick_count &t1,
+                                    const tsc_tick_count &t0);
+    friend tsc_interval_t operator-(const tsc_tick_count::tsc_interval_t &i1,
+                                    const tsc_tick_count::tsc_interval_t &i0);
+    friend tsc_interval_t &operator+=(tsc_tick_count::tsc_interval_t &i1,
+                                      const tsc_tick_count::tsc_interval_t &i0);
+  };
+
+  // The default constructor samples the counter, using whichever intrinsic the
+  // build configuration reports as available.
+#if KMP_HAVE___BUILTIN_READCYCLECOUNTER
+  tsc_tick_count()
+      : my_count(static_cast<int64_t>(__builtin_readcyclecounter())) {}
+#elif KMP_HAVE___RDTSC
+  tsc_tick_count() : my_count(static_cast<int64_t>(__rdtsc())) {}
+#else
+#error Must have high resolution timer defined
+#endif
+  // Wrap an already-known counter value instead of sampling the counter.
+  tsc_tick_count(int64_t value) : my_count(value) {}
+  int64_t getValue() const { return my_count; }
+  // Return whichever of *this and other holds the larger counter value.
+  tsc_tick_count later(tsc_tick_count const other) const {
+    return my_count > other.my_count ? (*this) : other;
+  }
+  // Return whichever of *this and other holds the smaller counter value.
+  tsc_tick_count earlier(tsc_tick_count const other) const {
+    return my_count < other.my_count ? (*this) : other;
+  }
+#if KMP_HAVE_TICK_TIME
+  static double tick_time(); // returns seconds per cycle (period) of clock
+#endif
+  static tsc_tick_count now() {
+    return tsc_tick_count();
+  } // returns the rdtsc register value
+  friend tsc_tick_count::tsc_interval_t operator-(const tsc_tick_count &t1,
+                                                  const tsc_tick_count &t0);
+};
+
+// Ticks elapsed between two counter readings (t1 minus t0).
+inline tsc_tick_count::tsc_interval_t operator-(const tsc_tick_count &t1,
+                                                const tsc_tick_count &t0) {
+  const int64_t elapsed = t1.my_count - t0.my_count;
+  return tsc_tick_count::tsc_interval_t(elapsed);
+}
+
+// Difference of two intervals (i1 minus i0).
+inline tsc_tick_count::tsc_interval_t
+operator-(const tsc_tick_count::tsc_interval_t &i1,
+          const tsc_tick_count::tsc_interval_t &i0) {
+  const int64_t delta = i1.value - i0.value;
+  return tsc_tick_count::tsc_interval_t(delta);
+}
+
+// Accumulate i0 into i1 and return i1.
+inline tsc_tick_count::tsc_interval_t &
+operator+=(tsc_tick_count::tsc_interval_t &i1,
+           const tsc_tick_count::tsc_interval_t &i0) {
+  i1.value = i1.value + i0.value;
+  return i1;
+}
+
+#if KMP_HAVE_TICK_TIME
+// Convert the interval from ticks to seconds using the clock period.
+inline double tsc_tick_count::tsc_interval_t::seconds() const {
+  const double seconds_per_tick = tick_time();
+  return value * seconds_per_tick;
+}
+#endif
+
+extern std::string formatSI(double interval, int width, char unit);
+
+// formatSI() with the unit column fixed to 'S' (seconds).
+inline std::string formatSeconds(double interval, int width) {
+  const char unit = 'S';
+  return formatSI(interval, width, unit);
+}
+
+// formatSI() with the unit column fixed to 'T' (ticks).
+inline std::string formatTicks(double interval, int width) {
+  const char unit = 'T';
+  return formatSI(interval, width, unit);
+}
+
+#endif // KMP_STATS_TIMING_H
diff --git a/final/runtime/src/kmp_str.cpp b/final/runtime/src/kmp_str.cpp
new file mode 100644
index 0000000..fb748d1
--- /dev/null
+++ b/final/runtime/src/kmp_str.cpp
@@ -0,0 +1,751 @@
+/*
+ * kmp_str.cpp -- String manipulation routines.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp_str.h"
+
+#include <stdarg.h> // va_*
+#include <stdio.h> // vsnprintf()
+#include <stdlib.h> // malloc(), realloc()
+
+#include "kmp.h"
+#include "kmp_i18n.h"
+
+/* String buffer.
+
+   Usage:
+
+   // Declare buffer and initialize it.
+   kmp_str_buf_t  buffer;
+   __kmp_str_buf_init( & buffer );
+
+   // Print to buffer.
+   __kmp_str_buf_print(& buffer, "Error in file \"%s\" line %d\n", "foo.c", 12);
+   __kmp_str_buf_print(& buffer, "    <%s>\n", line);
+
+   // Use buffer contents. buffer.str is a pointer to data, buffer.used is a
+   // number of printed characters (not including terminating zero).
+   write( fd, buffer.str, buffer.used );
+
+   // Free buffer.
+   __kmp_str_buf_free( & buffer );
+
+   // Alternatively, you can detach allocated memory from buffer:
+   __kmp_str_buf_detach( & buffer );
+   return buffer.str;    // That memory should be freed eventually.
+
+   Notes:
+
+   * Buffer users may use buffer.str and buffer.used. Users should not change
+     any fields of buffer directly.
+   * buffer.str is never NULL. If buffer is empty, buffer.str points to empty
+     string ("").
+   * For performance reasons, buffer uses stack memory (buffer.bulk) first. If
+     stack memory is exhausted, buffer allocates memory on heap by malloc(), and
+     reallocates it by realloc() as amount of used memory grows.
+   * Buffer doubles amount of allocated memory each time it is exhausted.
+*/
+
+// TODO: __kmp_str_buf_print() can use thread local memory allocator.
+
+// Consistency checks for a kmp_str_buf_t, expressed with KMP_DEBUG_ASSERT:
+// str is non-NULL, size is a non-zero multiple of sizeof(bulk), used is
+// strictly below size, and str points at bulk exactly when size equals
+// sizeof(bulk).
+#define KMP_STR_BUF_INVARIANT(b)                                               \
+  {                                                                            \
+    KMP_DEBUG_ASSERT((b)->str != NULL);                                        \
+    KMP_DEBUG_ASSERT((b)->size >= sizeof((b)->bulk));                          \
+    KMP_DEBUG_ASSERT((b)->size % sizeof((b)->bulk) == 0);                      \
+    KMP_DEBUG_ASSERT((unsigned)(b)->used < (b)->size);                         \
+    KMP_DEBUG_ASSERT(                                                          \
+        (b)->size == sizeof((b)->bulk) ? (b)->str == &(b)->bulk[0] : 1);       \
+    KMP_DEBUG_ASSERT((b)->size > sizeof((b)->bulk) ? (b)->str != &(b)->bulk[0] \
+                                                   : 1);                       \
+  }
+
+// Empty the buffer in place; its current storage and capacity are kept.
+void __kmp_str_buf_clear(kmp_str_buf_t *buffer) {
+  KMP_STR_BUF_INVARIANT(buffer);
+  if (buffer->used > 0) { // An already-empty buffer is left untouched.
+    buffer->str[0] = 0;
+    buffer->used = 0;
+  }
+  KMP_STR_BUF_INVARIANT(buffer);
+} // __kmp_str_buf_clear
+
+// Ensure the buffer can hold at least `size` bytes. Capacity is doubled until
+// it suffices; the first growth copies the text out of the inline bulk array
+// into malloc'ed memory, later growths realloc that memory. A failed
+// allocation is reported through KMP_FATAL(MemoryAllocFailed).
+void __kmp_str_buf_reserve(kmp_str_buf_t *buffer, int size) {
+  KMP_STR_BUF_INVARIANT(buffer);
+  KMP_DEBUG_ASSERT(size >= 0);
+
+  unsigned int const wanted = (unsigned int)size;
+
+  if (buffer->size < wanted) {
+    // Work out the new capacity.
+    while (buffer->size < wanted) {
+      buffer->size *= 2;
+    }
+
+    // Obtain the larger storage.
+    bool const on_bulk = (buffer->str == &buffer->bulk[0]);
+    if (on_bulk) {
+      buffer->str = (char *)KMP_INTERNAL_MALLOC(buffer->size);
+    } else {
+      buffer->str = (char *)KMP_INTERNAL_REALLOC(buffer->str, buffer->size);
+    }
+    if (buffer->str == NULL) {
+      KMP_FATAL(MemoryAllocFailed);
+    }
+    if (on_bulk) {
+      // Fresh memory: carry over the text together with its terminating zero.
+      KMP_MEMCPY_S(buffer->str, buffer->size, buffer->bulk, buffer->used + 1);
+    }
+  }
+
+  KMP_DEBUG_ASSERT(buffer->size > 0);
+  KMP_DEBUG_ASSERT(buffer->size >= wanted);
+  KMP_STR_BUF_INVARIANT(buffer);
+} // __kmp_str_buf_reserve
+
+// Make buffer->str point at malloc'ed memory the caller can keep. Text held in
+// the inline bulk array is copied to a new allocation of buffer->size bytes; a
+// buffer that has outgrown the bulk array is left as is.
+void __kmp_str_buf_detach(kmp_str_buf_t *buffer) {
+  KMP_STR_BUF_INVARIANT(buffer);
+
+  if (buffer->size > sizeof(buffer->bulk)) {
+    return; // Nothing to copy.
+  }
+
+  // Internal bulk is in use: allocate memory and copy the text over.
+  buffer->str = (char *)KMP_INTERNAL_MALLOC(buffer->size);
+  if (buffer->str == NULL) {
+    KMP_FATAL(MemoryAllocFailed);
+  }
+  KMP_MEMCPY_S(buffer->str, buffer->size, buffer->bulk, buffer->used + 1);
+} // __kmp_str_buf_detach
+
+// Release heap storage, if any, and point the buffer back at its inline bulk
+// array with the capacity reset to sizeof(bulk) and used reset to 0.
+void __kmp_str_buf_free(kmp_str_buf_t *buffer) {
+  KMP_STR_BUF_INVARIANT(buffer);
+  bool const grown = buffer->size > sizeof(buffer->bulk);
+  if (grown) {
+    KMP_INTERNAL_FREE(buffer->str);
+  }
+  buffer->used = 0;
+  buffer->size = sizeof(buffer->bulk);
+  buffer->str = buffer->bulk;
+  KMP_STR_BUF_INVARIANT(buffer);
+} // __kmp_str_buf_free
+
+// Append the first len bytes of str to the buffer, growing it as needed, and
+// keep the contents zero-terminated.
+void __kmp_str_buf_cat(kmp_str_buf_t *buffer, char const *str, int len) {
+  KMP_STR_BUF_INVARIANT(buffer);
+  KMP_DEBUG_ASSERT(str != NULL);
+  KMP_DEBUG_ASSERT(len >= 0);
+
+  int const new_used = buffer->used + len;
+  __kmp_str_buf_reserve(buffer, new_used + 1);
+
+  // Take the write position only now: reserve may have changed buffer->str.
+  char *tail = buffer->str + buffer->used;
+  KMP_MEMCPY(tail, str, len);
+  tail[len] = 0;
+  buffer->used = new_used;
+  KMP_STR_BUF_INVARIANT(buffer);
+} // __kmp_str_buf_cat
+
+// Append the contents of src to dest. An empty src leaves dest unchanged.
+void __kmp_str_buf_catbuf(kmp_str_buf_t *dest, const kmp_str_buf_t *src) {
+  KMP_DEBUG_ASSERT(dest);
+  KMP_DEBUG_ASSERT(src);
+  KMP_STR_BUF_INVARIANT(dest);
+  KMP_STR_BUF_INVARIANT(src);
+
+  bool const nothing_to_add = !src->str || !src->used;
+  if (nothing_to_add)
+    return;
+
+  int const new_used = dest->used + src->used;
+  __kmp_str_buf_reserve(dest, new_used + 1);
+
+  // Take the write position only now: reserve may have changed dest->str.
+  char *tail = dest->str + dest->used;
+  KMP_MEMCPY(tail, src->str, src->used);
+  tail[src->used] = 0;
+  dest->used = new_used;
+  KMP_STR_BUF_INVARIANT(dest);
+} // __kmp_str_buf_catbuf
+
+// Append printf-style formatted text to the buffer, enlarging the buffer and
+// retrying until the output fits.
+// Returns the result of the final, successful KMP_VSNPRINTF call, which is
+// the number of characters added to buffer->used.
+int __kmp_str_buf_vprint(kmp_str_buf_t *buffer, char const *format,
+                         va_list args) {
+  int rc;
+  KMP_STR_BUF_INVARIANT(buffer);
+
+  for (;;) {
+    // Bytes still available after the text already in the buffer.
+    int const free = buffer->size - buffer->used;
+    int size;
+
+    // Try to format string.
+    {
+/* On Linux* OS Intel(R) 64, vsnprintf() modifies args argument, so vsnprintf()
+   crashes if it is called for the second time with the same args. To prevent
+   the crash, we have to pass a fresh intact copy of args to vsnprintf() on each
+   iteration.
+
+   Unfortunately, standard va_copy() macro is not available on Windows* OS.
+   However, it seems vsnprintf() does not modify args argument on Windows* OS.
+*/
+
+#if !KMP_OS_WINDOWS
+      va_list _args;
+      va_copy(_args, args); // Make copy of args.
+#define args _args // Substitute args with its copy, _args.
+#endif // KMP_OS_WINDOWS
+      rc = KMP_VSNPRINTF(buffer->str + buffer->used, free, format, args);
+#if !KMP_OS_WINDOWS
+#undef args // Remove substitution.
+      va_end(_args);
+#endif // KMP_OS_WINDOWS
+    }
+
+    // No errors, string has been formatted.
+    if (rc >= 0 && rc < free) {
+      buffer->used += rc;
+      break;
+    }
+
+    // Error occurred, buffer is too small.
+    if (rc >= 0) {
+      // C99-conforming implementation of vsnprintf returns required buffer size
+      size = buffer->used + rc + 1;
+    } else {
+      // Older implementations just return -1. Double buffer size.
+      size = buffer->size * 2;
+    }
+
+    // Enlarge buffer.
+    __kmp_str_buf_reserve(buffer, size);
+
+    // And try again.
+  }
+
+  KMP_DEBUG_ASSERT(buffer->size > 0);
+  KMP_STR_BUF_INVARIANT(buffer);
+  return rc;
+} // __kmp_str_buf_vprint
+
+// Variadic front end to __kmp_str_buf_vprint(): appends formatted text to the
+// buffer and returns whatever __kmp_str_buf_vprint() returns.
+int __kmp_str_buf_print(kmp_str_buf_t *buffer, char const *format, ...) {
+  va_list args;
+  va_start(args, format);
+  int const written = __kmp_str_buf_vprint(buffer, format, args);
+  va_end(args);
+  return written;
+} // __kmp_str_buf_print
+
+/* Print size to the buffer using the largest binary unit that divides it
+   exactly, for example 1024 is printed as "1k" while 1025 stays "1025". */
+void __kmp_str_buf_print_size(kmp_str_buf_t *buf, size_t size) {
+  char const *suffixes[] = {"", "k", "M", "G", "T", "P", "E", "Z", "Y"};
+  int const count = sizeof(suffixes) / sizeof(char const *);
+  int idx = 0;
+
+  // Zero is divisible by 1024 forever, so it is printed without any scaling.
+  for (; size != 0 && size % 1024 == 0 && idx + 1 < count; ++idx) {
+    size /= 1024;
+  }
+
+  __kmp_str_buf_print(buf, "%" KMP_SIZE_T_SPEC "%s", size, suffixes[idx]);
+} // __kmp_str_buf_print_size
+
+// Split path into a directory and a base name, storing separately allocated
+// copies in fname->path, fname->dir and fname->base (all three stay NULL when
+// path is NULL). dir keeps its trailing '/'. On Windows* OS backslashes are
+// converted to '/' first, and a path without slashes that starts with a drive
+// prefix such as "c:" uses that prefix as the directory.
+void __kmp_str_fname_init(kmp_str_fname_t *fname, char const *path) {
+  fname->path = NULL;
+  fname->dir = NULL;
+  fname->base = NULL;
+
+  if (path != NULL) {
+    char *slash = NULL; // Pointer to the last character of dir.
+    char *base = NULL; // Pointer to the beginning of basename.
+    fname->path = __kmp_str_format("%s", path);
+    // Original code used strdup() function to copy a string, but on Windows* OS
+    // Intel(R) 64 it causes an assertion in the debug heap, so I had to replace
+    // strdup with __kmp_str_format().
+    if (KMP_OS_WINDOWS) {
+      __kmp_str_replace(fname->path, '\\', '/');
+    }
+    fname->dir = __kmp_str_format("%s", fname->path);
+    slash = strrchr(fname->dir, '/');
+    if (KMP_OS_WINDOWS &&
+        slash == NULL) { // On Windows* OS, if slash not found,
+      char first = TOLOWER(fname->dir[0]); // look for drive.
+      if ('a' <= first && first <= 'z' && fname->dir[1] == ':') {
+        slash = &fname->dir[1];
+      }
+    }
+    // Without a separator the whole string is the base name and dir becomes "".
+    base = (slash == NULL ? fname->dir : slash + 1);
+    fname->base = __kmp_str_format("%s", base); // Copy basename
+    *base = 0; // and truncate dir.
+  }
+
+} // kmp_str_fname_init
+
+// Free the three strings held by fname and reset each pointer to NULL.
+void __kmp_str_fname_free(kmp_str_fname_t *fname) {
+  char **owned[] = {&fname->path, &fname->dir, &fname->base};
+  for (int i = 0; i < (int)(sizeof(owned) / sizeof(owned[0])); ++i) {
+    __kmp_str_free(owned[i]);
+  }
+} // kmp_str_fname_free
+
+// Return non-zero if fname matches pattern. The pattern is split like a file
+// name; its directory part matches when it is "*/" or equals fname->dir, and
+// its base part matches when it is "*" or equals fname->base (comparison by
+// __kmp_str_eqf). A NULL pattern matches everything.
+int __kmp_str_fname_match(kmp_str_fname_t const *fname, char const *pattern) {
+  if (pattern == NULL) {
+    return 1;
+  }
+
+  kmp_str_fname_t ptrn;
+  __kmp_str_fname_init(&ptrn, pattern);
+
+  int dir_ok = 1;
+  if (strcmp(ptrn.dir, "*/") != 0) {
+    dir_ok = (fname->dir != NULL && __kmp_str_eqf(fname->dir, ptrn.dir));
+  }
+
+  int base_ok = 1;
+  if (strcmp(ptrn.base, "*") != 0) {
+    base_ok = (fname->base != NULL && __kmp_str_eqf(fname->base, ptrn.base));
+  }
+
+  __kmp_str_fname_free(&ptrn);
+  return dir_ok && base_ok;
+} // __kmp_str_fname_match
+
+// Parse a source-location string of the form ";file;func;line;col;;" into a
+// kmp_str_loc_t. loc.file and loc.func point into loc._bulk, which is a
+// private copy of psource. Missing or negative line/col values become 0. When
+// init_fname is non-zero the file name is additionally split into loc.fname;
+// otherwise loc.fname is initialized empty.
+kmp_str_loc_t __kmp_str_loc_init(char const *psource, int init_fname) {
+  kmp_str_loc_t loc;
+
+  loc._bulk = NULL;
+  loc.file = NULL;
+  loc.func = NULL;
+  loc.line = 0;
+  loc.col = 0;
+
+  if (psource != NULL) {
+    char *str = NULL;
+    char *dummy = NULL;
+    char *line = NULL;
+    char *col = NULL;
+
+    // Copy psource to keep it intact.
+    loc._bulk = __kmp_str_format("%s", psource);
+
+    // Parse psource string: ";file;func;line;col;;"
+    // Each split cuts the copy at the next ';' and advances str past it; the
+    // first head (the text before the leading ';') is discarded.
+    str = loc._bulk;
+    __kmp_str_split(str, ';', &dummy, &str);
+    __kmp_str_split(str, ';', &loc.file, &str);
+    __kmp_str_split(str, ';', &loc.func, &str);
+    __kmp_str_split(str, ';', &line, &str);
+    __kmp_str_split(str, ';', &col, &str);
+
+    // Convert line and col into numeric values.
+    if (line != NULL) {
+      loc.line = atoi(line);
+      if (loc.line < 0) {
+        loc.line = 0;
+      }
+    }
+    if (col != NULL) {
+      loc.col = atoi(col);
+      if (loc.col < 0) {
+        loc.col = 0;
+      }
+    }
+  }
+
+  __kmp_str_fname_init(&loc.fname, init_fname ? loc.file : NULL);
+
+  return loc;
+} // kmp_str_loc_init
+
+// Release everything owned by loc. file and func point into _bulk, so they
+// are only cleared, never freed on their own.
+void __kmp_str_loc_free(kmp_str_loc_t *loc) {
+  loc->func = NULL;
+  loc->file = NULL;
+  __kmp_str_free(&(loc->_bulk));
+  __kmp_str_fname_free(&loc->fname);
+} // kmp_str_loc_free
+
+/* This function is intended to compare file names. On Windows* OS file names
+   are case-insensitive, so functions performs case-insensitive comparison. On
+   Linux* OS it performs case-sensitive comparison. Note: The function returns
+   *true* if strings are *equal*. */
+int __kmp_str_eqf( // True, if strings are equal, false otherwise.
+    char const *lhs, // First string.
+    char const *rhs // Second string.
+    ) {
+#if KMP_OS_WINDOWS
+  // File names are case-insensitive here, so ignore case.
+  return (_stricmp(lhs, rhs) == 0);
+#else
+  return (strcmp(lhs, rhs) == 0);
+#endif
+} // __kmp_str_eqf
+
+/* This function is like sprintf, but it *allocates* new buffer, which must be
+   freed eventually by __kmp_str_free(). The function is very convenient for
+   constructing strings, it successfully replaces strdup(), strcat(), it frees
+   programmer from buffer allocations and helps to avoid buffer overflows.
+   Examples:
+
+   str = __kmp_str_format("%s", orig); //strdup() doesn't care about buffer size
+   __kmp_str_free( & str );
+   str = __kmp_str_format( "%s%s", orig1, orig2 ); // strcat(), doesn't care
+                                                   // about buffer size.
+   __kmp_str_free( & str );
+   str = __kmp_str_format( "%s/%s.txt", path, file ); // constructing string.
+   __kmp_str_free( & str );
+
+   Performance note:
+   This function allocates memory with malloc() calls, so do not call it from
+   performance-critical code. In performance-critical code consider using
+   kmp_str_buf_t instead, since it uses stack-allocated buffer for short
+   strings.
+
+   Why does this function use malloc()?
+   1. __kmp_allocate() returns cache-aligned memory allocated with malloc().
+      There are no reasons in using __kmp_allocate() for strings due to extra
+      overhead while cache-aligned memory is not necessary.
+   2. __kmp_thread_malloc() cannot be used because it requires pointer to thread
+      structure. We need to perform string operations during library startup
+      (for example, in __kmp_register_library_startup()) when no thread
+      structures are allocated yet.
+   So standard malloc() is the only available option.
+*/
+
+// Format into a newly malloc'ed string, growing the allocation until the
+// result fits. The caller owns the returned string and releases it with
+// __kmp_str_free().
+char *__kmp_str_format( // Allocated string.
+    char const *format, // Format string.
+    ... // Other parameters.
+    ) {
+  va_list args;
+  int size = 512; // Initial allocation; enlarged below if it proves too small.
+  char *buffer = NULL;
+  int rc;
+
+  // Allocate buffer.
+  buffer = (char *)KMP_INTERNAL_MALLOC(size);
+  if (buffer == NULL) {
+    KMP_FATAL(MemoryAllocFailed);
+  }
+
+  for (;;) {
+    // Try to format string.
+    // The argument list is restarted on every attempt.
+    va_start(args, format);
+    rc = KMP_VSNPRINTF(buffer, size, format, args);
+    va_end(args);
+
+    // No errors, string has been formatted.
+    if (rc >= 0 && rc < size) {
+      break;
+    }
+
+    // Error occurred, buffer is too small.
+    if (rc >= 0) {
+      // C99-conforming implementation of vsnprintf returns required buffer
+      // size.
+      size = rc + 1;
+    } else {
+      // Older implementations just return -1.
+      size = size * 2;
+    }
+
+    // Enlarge buffer and try again.
+    buffer = (char *)KMP_INTERNAL_REALLOC(buffer, size);
+    if (buffer == NULL) {
+      KMP_FATAL(MemoryAllocFailed);
+    }
+  }
+
+  return buffer;
+} // func __kmp_str_format
+
+// Free the string *str and reset the caller's pointer to NULL.
+void __kmp_str_free(char **str) {
+  KMP_DEBUG_ASSERT(str != NULL);
+  char *owned = *str;
+  *str = NULL;
+  KMP_INTERNAL_FREE(owned);
+} // func __kmp_str_free
+
+/* Case-insensitive comparison of target against data, controlled by len:
+   - len == 0: true iff the two strings are equal;
+   - len < 0:  true iff target is a prefix of data;
+   - len > 0:  true iff one string is a prefix of the other and the shorter
+               one is at least len characters long.
+   Returns FALSE if either pointer is NULL. */
+int __kmp_str_match(char const *target, int len, char const *data) {
+  if (target == NULL || data == NULL) {
+    return FALSE;
+  }
+
+  int i = 0;
+  while (target[i] && data[i]) {
+    if (TOLOWER(target[i]) != TOLOWER(data[i])) {
+      return FALSE;
+    }
+    ++i;
+  }
+
+  // At least one string ended at i, and everything before i matched.
+  if (len > 0) {
+    return i >= len;
+  }
+  return (!target[i] && (len || !data[i]));
+} // __kmp_str_match
+
+// Return 1 if data is one of the accepted spellings of "false", else 0. Each
+// table entry is a word and the len argument handed to __kmp_str_match().
+int __kmp_str_match_false(char const *data) {
+  static const struct {
+    char const *word;
+    int len;
+  } spellings[] = {{"false", 1}, {"off", 2}, {"0", 1},       {".false.", 2},
+                   {".f.", 2},   {"no", 1},  {"disabled", 0}};
+
+  for (int i = 0; i < (int)(sizeof(spellings) / sizeof(spellings[0])); ++i) {
+    if (__kmp_str_match(spellings[i].word, spellings[i].len, data)) {
+      return 1;
+    }
+  }
+  return 0;
+} // __kmp_str_match_false
+
+// Return 1 if data is one of the accepted spellings of "true", else 0. Each
+// table entry is a word and the len argument handed to __kmp_str_match().
+int __kmp_str_match_true(char const *data) {
+  static const struct {
+    char const *word;
+    int len;
+  } spellings[] = {{"true", 1}, {"on", 2},  {"1", 1},      {".true.", 2},
+                   {".t.", 2},  {"yes", 1}, {"enabled", 0}};
+
+  for (int i = 0; i < (int)(sizeof(spellings) / sizeof(spellings[0])); ++i) {
+    if (__kmp_str_match(spellings[i].word, spellings[i].len, data)) {
+      return 1;
+    }
+  }
+  return 0;
+} // __kmp_str_match_true
+
+// Replace, in place, every occurrence of search_for in str with replace_with.
+void __kmp_str_replace(char *str, char search_for, char replace_with) {
+  for (char *hit = strchr(str, search_for); hit != NULL;
+       hit = strchr(hit + 1, search_for)) {
+    *hit = replace_with;
+  }
+} // __kmp_str_replace
+
+// Split str in place at the first occurrence of delim: the delimiter is
+// overwritten with a zero, *head receives str and *tail the text after the
+// delimiter. If str is NULL or contains no delimiter, *tail is set to NULL.
+void __kmp_str_split(char *str, // I: String to split.
+                     char delim, // I: Character to split on.
+                     char **head, // O: Pointer to head (may be NULL).
+                     char **tail // O: Pointer to tail (may be NULL).
+                     ) {
+  char *rest = NULL;
+  char *cut = (str != NULL) ? strchr(str, delim) : NULL;
+
+  if (cut != NULL) {
+    *cut = 0;
+    rest = cut + 1;
+  }
+
+  if (head != NULL) {
+    *head = str;
+  }
+  if (tail != NULL) {
+    *tail = rest;
+  }
+} // __kmp_str_split
+
+/* strtok_r() is not available on Windows* OS. This function reimplements
+   strtok_r() there and forwards to strtok_r() elsewhere. Pass the string on
+   the first call and NULL on later calls; *buf carries the scan position
+   between calls. Returns the next token, or NULL when none is left. */
+char *__kmp_str_token(
+    char *str, // String to split into tokens. Note: String *is* modified!
+    char const *delim, // Delimiters.
+    char **buf // Internal buffer.
+    ) {
+  char *token = NULL;
+#if KMP_OS_WINDOWS
+  // On Windows* OS there is no strtok_r() function. Let us implement it.
+  if (str != NULL) {
+    *buf = str; // First call, initialize buf.
+  }
+  *buf += strspn(*buf, delim); // Skip leading delimiters.
+  if (**buf != 0) { // Rest of the string is not yet empty.
+    token = *buf; // Use it as result.
+    *buf += strcspn(*buf, delim); // Skip non-delimiters.
+    if (**buf != 0) { // Rest of the string is not yet empty.
+      **buf = 0; // Terminate token here.
+      *buf += 1; // Advance buf to start with the next token next time.
+    }
+  }
+#else
+  // On Linux* OS and OS X*, strtok_r() is available. Let us use it.
+  token = strtok_r(str, delim, buf);
+#endif
+  return token;
+} // __kmp_str_token
+
+// Parse a non-negative decimal number with an optional one-letter unit suffix
+// (b = bytes, k = 1024, m = 1024 * 1024; either case). The digits may instead
+// be followed directly by `sentinel`, which ends parsing.
+// Returns the scaled value clamped to INT_MAX, -1 if the digits are followed
+// by a character that is neither a suffix nor `sentinel`, and 0 if any text
+// follows the suffix.
+int __kmp_str_to_int(char const *str, char sentinel) {
+  int result = 0;
+  int factor;
+  char const *t;
+
+  for (t = str; *t >= '0' && *t <= '9'; ++t) {
+    int d = *t - '0';
+    // Saturate rather than overflow: signed overflow is undefined behavior.
+    result = (result > (INT_MAX - d) / 10) ? INT_MAX : result * 10 + d;
+  }
+
+  switch (*t) {
+  case '\0': /* the current default for no suffix is bytes */
+    factor = 1;
+    break;
+  case 'b':
+  case 'B': /* bytes */
+    ++t;
+    factor = 1;
+    break;
+  case 'k':
+  case 'K': /* kilo-bytes */
+    ++t;
+    factor = 1024;
+    break;
+  case 'm':
+  case 'M': /* mega-bytes */
+    ++t;
+    factor = (1024 * 1024);
+    break;
+  default:
+    if (*t != sentinel)
+      return (-1);
+    t = ""; // Sentinel reached: treat the remainder as fully consumed.
+    factor = 1;
+  }
+
+  // Apply the unit, again clamping to INT_MAX.
+  if (result > (INT_MAX / factor))
+    result = INT_MAX;
+  else
+    result *= factor;
+
+  return (*t != 0 ? 0 : result);
+} // __kmp_str_to_int
+
+/* The routine parses input string. It is expected it is a unsigned integer with
+   optional unit. Units are: "b" for bytes, "kb" or just "k" for kilobytes, "mb"
+   or "m" for megabytes, ..., "yb" or "y" for yottabytes. :-) Unit name is
+   case-insensitive. The routine returns nothing; the outcome is reported
+   through *error: NULL on success, in which case *out holds the parsed value,
+   or an error message otherwise. In case of overflow *out is set to
+   KMP_SIZE_T_MAX; for every other error *out is left untouched. */
+void __kmp_str_to_size( // R: Nothing; status is reported through *error.
+    char const *str, // I: String of characters, unsigned number and unit ("b",
+    // "kb", etc).
+    size_t *out, // O: Parsed number.
+    size_t dfactor, // I: The factor if none of the letters specified.
+    char const **error // O: Null if everything is ok, error message otherwise.
+    ) {
+
+  size_t value = 0;
+  size_t factor = 0; // 0 means "no unit seen yet".
+  int overflow = 0;
+  int i = 0;
+  int digit;
+
+  KMP_DEBUG_ASSERT(str != NULL);
+
+  // Skip spaces.
+  while (str[i] == ' ' || str[i] == '\t') {
+    ++i;
+  }
+
+  // Parse number.
+  if (str[i] < '0' || str[i] > '9') {
+    *error = KMP_I18N_STR(NotANumber);
+    return;
+  }
+  do {
+    digit = str[i] - '0';
+    overflow = overflow || (value > (KMP_SIZE_T_MAX - digit) / 10);
+    value = (value * 10) + digit;
+    ++i;
+  } while (str[i] >= '0' && str[i] <= '9');
+
+  // Skip spaces.
+  while (str[i] == ' ' || str[i] == '\t') {
+    ++i;
+  }
+
+// Parse unit.
+// Each _case accepts the lower- and upper-case letter and sets factor to
+// 2^(10 * exp), or flags overflow when that does not fit in size_t.
+#define _case(ch, exp)                                                         \
+  case ch:                                                                     \
+  case ch - ('a' - 'A'): {                                                     \
+    size_t shift = (exp)*10;                                                   \
+    ++i;                                                                       \
+    if (shift < sizeof(size_t) * 8) {                                          \
+      factor = (size_t)(1) << shift;                                           \
+    } else {                                                                   \
+      overflow = 1;                                                            \
+    }                                                                          \
+  } break;
+  switch (str[i]) {
+    _case('k', 1); // Kilo
+    _case('m', 2); // Mega
+    _case('g', 3); // Giga
+    _case('t', 4); // Tera
+    _case('p', 5); // Peta
+    _case('e', 6); // Exa
+    _case('z', 7); // Zetta
+    _case('y', 8); // Yotta
+    // Oops. No more units...
+  }
+#undef _case
+  if (str[i] == 'b' || str[i] == 'B') { // Skip optional "b".
+    if (factor == 0) {
+      factor = 1;
+    }
+    ++i;
+  }
+  if (!(str[i] == ' ' || str[i] == '\t' || str[i] == 0)) { // Bad unit
+    *error = KMP_I18N_STR(BadUnit);
+    return;
+  }
+
+  // No unit given: fall back to the caller-supplied default factor.
+  if (factor == 0) {
+    factor = dfactor;
+  }
+
+  // Apply factor.
+  overflow = overflow || (value > (KMP_SIZE_T_MAX / factor));
+  value *= factor;
+
+  // Skip spaces.
+  while (str[i] == ' ' || str[i] == '\t') {
+    ++i;
+  }
+
+  if (str[i] != 0) {
+    *error = KMP_I18N_STR(IllegalCharacters);
+    return;
+  }
+
+  if (overflow) {
+    *error = KMP_I18N_STR(ValueTooLarge);
+    *out = KMP_SIZE_T_MAX;
+    return;
+  }
+
+  *error = NULL;
+  *out = value;
+} // __kmp_str_to_size
+
+// Parse an unsigned decimal number, optionally surrounded by spaces or tabs.
+// The outcome is reported through *error: NULL on success, in which case *out
+// holds the value, or an error message otherwise. On overflow *out is set to
+// the largest kmp_uint64; for every other error *out is left untouched.
+void __kmp_str_to_uint( // R: Nothing; status is reported through *error.
+    char const *str, // I: String of characters, unsigned number.
+    kmp_uint64 *out, // O: Parsed number.
+    char const **error // O: Null if everything is ok, error message otherwise.
+    ) {
+  // Accumulate in the output type: size_t is narrower than kmp_uint64 on
+  // 32-bit targets and would reject values that *out can represent.
+  kmp_uint64 const max_value = (kmp_uint64)-1;
+  kmp_uint64 value = 0;
+  int overflow = 0;
+  int i = 0;
+  int digit;
+
+  KMP_DEBUG_ASSERT(str != NULL);
+
+  // Skip spaces.
+  while (str[i] == ' ' || str[i] == '\t') {
+    ++i;
+  }
+
+  // Parse number.
+  if (str[i] < '0' || str[i] > '9') {
+    *error = KMP_I18N_STR(NotANumber);
+    return;
+  }
+  do {
+    digit = str[i] - '0';
+    overflow = overflow || (value > (max_value - digit) / 10);
+    value = (value * 10) + digit;
+    ++i;
+  } while (str[i] >= '0' && str[i] <= '9');
+
+  // Skip spaces.
+  while (str[i] == ' ' || str[i] == '\t') {
+    ++i;
+  }
+
+  if (str[i] != 0) {
+    *error = KMP_I18N_STR(IllegalCharacters);
+    return;
+  }
+
+  if (overflow) {
+    *error = KMP_I18N_STR(ValueTooLarge);
+    *out = max_value;
+    return;
+  }
+
+  *error = NULL;
+  *out = value;
+} // __kmp_str_to_uint
+
+// end of file //
diff --git a/final/runtime/src/kmp_str.h b/final/runtime/src/kmp_str.h
new file mode 100644
index 0000000..09faadb
--- /dev/null
+++ b/final/runtime/src/kmp_str.h
@@ -0,0 +1,125 @@
+/*
+ * kmp_str.h -- String manipulation routines.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_STR_H
+#define KMP_STR_H
+
+#include <stdarg.h>
+#include <string.h>
+
+#include "kmp_os.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+#if KMP_OS_WINDOWS
+#define strdup _strdup
+#endif
+
+/*  some macros to replace ctype.h functions  */
+#define TOLOWER(c) ((((c) >= 'A') && ((c) <= 'Z')) ? ((c) + 'a' - 'A') : (c))
+
+struct kmp_str_buf {
+  char *str; // Pointer to buffer content, read only.
+  unsigned int size; // Do not change this field!
+  int used; // Number of characters printed to buffer, read only.
+  char bulk[512]; // Do not use this field!
+}; // struct kmp_str_buf
+typedef struct kmp_str_buf kmp_str_buf_t;
+
+#define __kmp_str_buf_init(b)                                                  \
+  {                                                                            \
+    (b)->str = (b)->bulk;                                                      \
+    (b)->size = sizeof((b)->bulk);                                             \
+    (b)->used = 0;                                                             \
+    (b)->bulk[0] = 0;                                                          \
+  }
+
+void __kmp_str_buf_clear(kmp_str_buf_t *buffer);
+void __kmp_str_buf_reserve(kmp_str_buf_t *buffer, int size);
+void __kmp_str_buf_detach(kmp_str_buf_t *buffer);
+void __kmp_str_buf_free(kmp_str_buf_t *buffer);
+void __kmp_str_buf_cat(kmp_str_buf_t *buffer, char const *str, int len);
+void __kmp_str_buf_catbuf(kmp_str_buf_t *dest, const kmp_str_buf_t *src);
+int __kmp_str_buf_vprint(kmp_str_buf_t *buffer, char const *format,
+                         va_list args);
+int __kmp_str_buf_print(kmp_str_buf_t *buffer, char const *format, ...);
+void __kmp_str_buf_print_size(kmp_str_buf_t *buffer, size_t size);
+
+/* File name parser.
+   Usage:
+
+   kmp_str_fname_t fname = __kmp_str_fname_init( path );
+   // Use fname.path (copy of original path ), fname.dir, fname.base.
+   // Note fname.dir concatenated with fname.base gives exact copy of path.
+   __kmp_str_fname_free( & fname );
+*/
+struct kmp_str_fname {
+  char *path;
+  char *dir;
+  char *base;
+}; // struct kmp_str_fname
+typedef struct kmp_str_fname kmp_str_fname_t;
+void __kmp_str_fname_init(kmp_str_fname_t *fname, char const *path);
+void __kmp_str_fname_free(kmp_str_fname_t *fname);
+// Compares file name with specified pattern. If pattern is NULL, any fname
+// matches.
+int __kmp_str_fname_match(kmp_str_fname_t const *fname, char const *pattern);
+
+/* The compiler provides source locations in string form
+   ";file;func;line;col;;". It is not convenient for manipulation. This
+   structure keeps source location in more convenient form.
+   Usage:
+
+   kmp_str_loc_t loc = __kmp_str_loc_init( ident->psource, 0 );
+   // use loc.file, loc.func, loc.line, loc.col.
+   // loc.fname is available if second argument of __kmp_str_loc_init is true.
+   __kmp_str_loc_free( & loc );
+
+   If psource is NULL or does not follow format above, file and/or func may be
+   NULL pointers.
+*/
+struct kmp_str_loc {
+  char *_bulk; // Do not use this field.
+  kmp_str_fname_t fname; // Will be initialized if init_fname is true.
+  char *file; // May be NULL if psource is NULL or malformed.
+  char *func; // May be NULL if psource is NULL or malformed.
+  int line;
+  int col;
+}; // struct kmp_str_loc
+typedef struct kmp_str_loc kmp_str_loc_t;
+kmp_str_loc_t __kmp_str_loc_init(char const *psource, int init_fname);
+void __kmp_str_loc_free(kmp_str_loc_t *loc);
+
+int __kmp_str_eqf(char const *lhs, char const *rhs);
+char *__kmp_str_format(char const *format, ...);
+void __kmp_str_free(char **str);
+int __kmp_str_match(char const *target, int len, char const *data);
+int __kmp_str_match_false(char const *data);
+int __kmp_str_match_true(char const *data);
+void __kmp_str_replace(char *str, char search_for, char replace_with);
+void __kmp_str_split(char *str, char delim, char **head, char **tail);
+char *__kmp_str_token(char *str, char const *delim, char **buf);
+int __kmp_str_to_int(char const *str, char sentinel);
+
+void __kmp_str_to_size(char const *str, size_t *out, size_t dfactor,
+                       char const **error);
+void __kmp_str_to_uint(char const *str, kmp_uint64 *out, char const **error);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // KMP_STR_H
+
+// end of file //
diff --git a/final/runtime/src/kmp_stub.cpp b/final/runtime/src/kmp_stub.cpp
new file mode 100644
index 0000000..badbbde
--- /dev/null
+++ b/final/runtime/src/kmp_stub.cpp
@@ -0,0 +1,385 @@
+/*
+ * kmp_stub.cpp -- stub versions of user-callable OpenMP RT functions.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <errno.h>
+#include <limits.h>
+#include <stdlib.h>
+
+#define __KMP_IMP
+#include "omp.h" // omp_* declarations, must be included before "kmp.h"
+#include "kmp.h" // KMP_DEFAULT_STKSIZE
+#include "kmp_stub.h"
+
+#if KMP_OS_WINDOWS
+#include <windows.h>
+#else
+#include <sys/time.h>
+#endif
+
+// Moved from omp.h
+#define omp_set_max_active_levels ompc_set_max_active_levels
+#define omp_set_schedule ompc_set_schedule
+#define omp_get_ancestor_thread_num ompc_get_ancestor_thread_num
+#define omp_get_team_size ompc_get_team_size
+
+#define omp_set_num_threads ompc_set_num_threads
+#define omp_set_dynamic ompc_set_dynamic
+#define omp_set_nested ompc_set_nested
+#define omp_set_affinity_format ompc_set_affinity_format
+#define omp_get_affinity_format ompc_get_affinity_format
+#define omp_display_affinity ompc_display_affinity
+#define omp_capture_affinity ompc_capture_affinity
+#define kmp_set_stacksize kmpc_set_stacksize
+#define kmp_set_stacksize_s kmpc_set_stacksize_s
+#define kmp_set_blocktime kmpc_set_blocktime
+#define kmp_set_library kmpc_set_library
+#define kmp_set_defaults kmpc_set_defaults
+#define kmp_set_disp_num_buffers kmpc_set_disp_num_buffers
+#define kmp_malloc kmpc_malloc
+#define kmp_aligned_malloc kmpc_aligned_malloc
+#define kmp_calloc kmpc_calloc
+#define kmp_realloc kmpc_realloc
+#define kmp_free kmpc_free
+
+#if KMP_OS_WINDOWS
+static double frequency = 0.0;
+#endif
+
+// Helper functions.
+static size_t __kmps_init() {
+  static int initialized = 0;
+  static size_t dummy = 0;
+  if (!initialized) {
+    // TODO: Analyze KMP_VERSION environment variable, print
+    // __kmp_version_copyright and __kmp_version_build_time.
+    // WARNING: Do not use "fprintf(stderr, ...)" because it will cause
+    // unresolved "__iob" symbol (see C70080). We need to extract __kmp_printf()
+    // stuff from kmp_runtime.cpp and use it.
+
+    // Trick with dummy variable forces linker to keep __kmp_version_copyright
+    // and __kmp_version_build_time strings in executable file (in case of
+    // static linkage). When KMP_VERSION analysis is implemented, dummy
+    // variable should be deleted, function should return void.
+    dummy = __kmp_version_copyright - __kmp_version_build_time;
+
+#if KMP_OS_WINDOWS
+    LARGE_INTEGER freq;
+    BOOL status = QueryPerformanceFrequency(&freq);
+    if (status) {
+      frequency = double(freq.QuadPart);
+    }
+#endif
+
+    initialized = 1;
+  }
+  return dummy;
+} // __kmps_init
+
+#define i __kmps_init();
+
+/* set API functions */
+void omp_set_num_threads(omp_int_t num_threads) { i; }
+void omp_set_dynamic(omp_int_t dynamic) {
+  i;
+  __kmps_set_dynamic(dynamic);
+}
+void omp_set_nested(omp_int_t nested) {
+  i;
+  __kmps_set_nested(nested);
+}
+void omp_set_max_active_levels(omp_int_t max_active_levels) { i; }
+void omp_set_schedule(omp_sched_t kind, omp_int_t modifier) {
+  i;
+  __kmps_set_schedule((kmp_sched_t)kind, modifier);
+}
+int omp_get_ancestor_thread_num(omp_int_t level) {
+  i;
+  return (level) ? (-1) : (0);
+}
+int omp_get_team_size(omp_int_t level) {
+  i;
+  return (level) ? (-1) : (1);
+}
+int kmpc_set_affinity_mask_proc(int proc, void **mask) {
+  i;
+  return -1;
+}
+int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
+  i;
+  return -1;
+}
+int kmpc_get_affinity_mask_proc(int proc, void **mask) {
+  i;
+  return -1;
+}
+
+/* kmp API functions */
+void kmp_set_stacksize(omp_int_t arg) {
+  i;
+  __kmps_set_stacksize(arg);
+}
+void kmp_set_stacksize_s(size_t arg) {
+  i;
+  __kmps_set_stacksize(arg);
+}
+void kmp_set_blocktime(omp_int_t arg) {
+  i;
+  __kmps_set_blocktime(arg);
+}
+void kmp_set_library(omp_int_t arg) {
+  i;
+  __kmps_set_library(arg);
+}
+void kmp_set_defaults(char const *str) { i; }
+void kmp_set_disp_num_buffers(omp_int_t arg) { i; }
+
+/* KMP memory management functions. */
+void *kmp_malloc(size_t size) {
+  i; // expands to __kmps_init();
+  void *res;
+#if KMP_OS_WINDOWS
+  // If successful, returns a pointer to the memory block, otherwise returns
+  // NULL.
+  // Sets errno to ENOMEM or EINVAL if memory allocation failed or parameter
+  // validation failed.
+  res = _aligned_malloc(size, 1);
+#else
+  res = malloc(size);
+#endif
+  return res;
+}
+void *kmp_aligned_malloc(size_t sz, size_t a) {
+  i; // expands to __kmps_init();
+  void *res;
+#if KMP_OS_WINDOWS
+  res = _aligned_malloc(sz, a);
+#else
+  int err = posix_memalign(&res, a, sz); // returns 0 or an error number
+  if (err != 0) {
+    errno = err; // can be EINVAL or ENOMEM
+    res = NULL;
+  }
+#endif
+  return res;
+}
+void *kmp_calloc(size_t nelem, size_t elsize) {
+  i;
+  void *res;
+#if KMP_OS_WINDOWS
+  res = _aligned_recalloc(NULL, nelem, elsize, 1);
+#else
+  res = calloc(nelem, elsize);
+#endif
+  return res;
+}
+void *kmp_realloc(void *ptr, size_t size) {
+  i;
+  void *res;
+#if KMP_OS_WINDOWS
+  res = _aligned_realloc(ptr, size, 1);
+#else
+  res = realloc(ptr, size);
+#endif
+  return res;
+}
+void kmp_free(void *ptr) {
+  i;
+#if KMP_OS_WINDOWS
+  _aligned_free(ptr);
+#else
+  free(ptr);
+#endif
+}
+
+static int __kmps_blocktime = INT_MAX;
+
+void __kmps_set_blocktime(int arg) {
+  i;
+  __kmps_blocktime = arg;
+} // __kmps_set_blocktime
+
+int __kmps_get_blocktime(void) {
+  i;
+  return __kmps_blocktime;
+} // __kmps_get_blocktime
+
+static int __kmps_dynamic = 0;
+
+void __kmps_set_dynamic(int arg) {
+  i;
+  __kmps_dynamic = arg;
+} // __kmps_set_dynamic
+
+int __kmps_get_dynamic(void) {
+  i;
+  return __kmps_dynamic;
+} // __kmps_get_dynamic
+
+static int __kmps_library = 1000;
+
+void __kmps_set_library(int arg) {
+  i;
+  __kmps_library = arg;
+} // __kmps_set_library
+
+int __kmps_get_library(void) {
+  i;
+  return __kmps_library;
+} // __kmps_get_library
+
+static int __kmps_nested = 0;
+
+void __kmps_set_nested(int arg) {
+  i;
+  __kmps_nested = arg;
+} // __kmps_set_nested
+
+int __kmps_get_nested(void) {
+  i;
+  return __kmps_nested;
+} // __kmps_get_nested
+
+static size_t __kmps_stacksize = KMP_DEFAULT_STKSIZE;
+
+void __kmps_set_stacksize(int arg) {
+  i;
+  __kmps_stacksize = arg;
+} // __kmps_set_stacksize
+
+int __kmps_get_stacksize(void) {
+  i;
+  return __kmps_stacksize;
+} // __kmps_get_stacksize
+
+static kmp_sched_t __kmps_sched_kind = kmp_sched_default;
+static int __kmps_sched_modifier = 0;
+
+void __kmps_set_schedule(kmp_sched_t kind, int modifier) {
+  i;
+  __kmps_sched_kind = kind;
+  __kmps_sched_modifier = modifier;
+} // __kmps_set_schedule
+
+void __kmps_get_schedule(kmp_sched_t *kind, int *modifier) {
+  i;
+  *kind = __kmps_sched_kind;
+  *modifier = __kmps_sched_modifier;
+} // __kmps_get_schedule
+
+kmp_proc_bind_t __kmps_get_proc_bind(void) {
+  i; // expands to __kmps_init();
+  return (kmp_proc_bind_t)0; // explicit cast: int -> enum is not implicit in C++
+} // __kmps_get_proc_bind
+
+double __kmps_get_wtime(void) {
+  // Elapsed wall clock time (in seconds) from "sometime in the past".
+  double wtime = 0.0; // returned unchanged if the clock query fails
+  i;
+#if KMP_OS_WINDOWS
+  if (frequency > 0.0) { // set by __kmps_init() if the frequency query worked
+    LARGE_INTEGER now;
+    BOOL status = QueryPerformanceCounter(&now);
+    if (status) {
+      wtime = double(now.QuadPart) / frequency;
+    }
+  }
+#else
+  // gettimeofday() returns seconds and microseconds since the Epoch.
+  struct timeval tval;
+  int rc;
+  rc = gettimeofday(&tval, NULL);
+  if (rc == 0) {
+    wtime = (double)(tval.tv_sec) + 1.0E-06 * (double)(tval.tv_usec);
+  } else {
+    // TODO: Assert or abort here.
+  }
+#endif
+  return wtime;
+} // __kmps_get_wtime
+
+double __kmps_get_wtick(void) {
+  // Number of seconds between successive clock ticks.
+  double wtick = 0.0;
+  i;
+#if KMP_OS_WINDOWS
+  {
+    DWORD increment;
+    DWORD adjustment;
+    BOOL disabled;
+    BOOL rc;
+    rc = GetSystemTimeAdjustment(&adjustment, &increment, &disabled);
+    if (rc) { // both values are reported in 100 ns units, hence 1.0E-07
+      wtick = 1.0E-07 * (double)(disabled ? increment : adjustment);
+    } else {
+      // TODO: Assert or abort here.
+      wtick = 1.0E-03; // fallback when the query fails: 1 ms
+    }
+  }
+#else
+  // TODO: gettimeofday() returns microseconds, but what is the precision?
+  wtick = 1.0E-06;
+#endif
+  return wtick;
+} // __kmps_get_wtick
+
+/* OpenMP 5.0 Memory Management */
+#if KMP_OS_WINDOWS
+omp_allocator_handle_t const omp_null_allocator = 0;
+omp_allocator_handle_t const omp_default_mem_alloc =
+    (omp_allocator_handle_t const)1;
+omp_allocator_handle_t const omp_large_cap_mem_alloc =
+    (omp_allocator_handle_t const)2;
+omp_allocator_handle_t const omp_const_mem_alloc =
+    (omp_allocator_handle_t const)3;
+omp_allocator_handle_t const omp_high_bw_mem_alloc =
+    (omp_allocator_handle_t const)4;
+omp_allocator_handle_t const omp_low_lat_mem_alloc =
+    (omp_allocator_handle_t const)5;
+omp_allocator_handle_t const omp_cgroup_mem_alloc =
+    (omp_allocator_handle_t const)6;
+omp_allocator_handle_t const omp_pteam_mem_alloc =
+    (omp_allocator_handle_t const)7;
+omp_allocator_handle_t const omp_thread_mem_alloc =
+    (omp_allocator_handle_t const)8;
+
+omp_memspace_handle_t const omp_default_mem_space =
+    (omp_memspace_handle_t const)0;
+omp_memspace_handle_t const omp_large_cap_mem_space =
+    (omp_memspace_handle_t const)1;
+omp_memspace_handle_t const omp_const_mem_space =
+    (omp_memspace_handle_t const)2;
+omp_memspace_handle_t const omp_high_bw_mem_space =
+    (omp_memspace_handle_t const)3;
+omp_memspace_handle_t const omp_low_lat_mem_space =
+    (omp_memspace_handle_t const)4;
+#endif /* KMP_OS_WINDOWS */
+void *omp_alloc(size_t size, const omp_allocator_handle_t allocator) {
+  i;
+  return malloc(size);
+}
+void omp_free(void *ptr, const omp_allocator_handle_t allocator) {
+  i;
+  free(ptr);
+}
+/* OpenMP 5.0 Affinity Format */
+void omp_set_affinity_format(char const *format) { i; }
+size_t omp_get_affinity_format(char *buffer, size_t size) {
+  i;
+  return 0;
+}
+void omp_display_affinity(char const *format) { i; }
+size_t omp_capture_affinity(char *buffer, size_t buf_size, char const *format) {
+  i;
+  return 0;
+}
+
+// end of file //
diff --git a/final/runtime/src/kmp_stub.h b/final/runtime/src/kmp_stub.h
new file mode 100644
index 0000000..679c07b
--- /dev/null
+++ b/final/runtime/src/kmp_stub.h
@@ -0,0 +1,55 @@
+/*
+ * kmp_stub.h
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_STUB_H
+#define KMP_STUB_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+void __kmps_set_blocktime(int arg);
+int __kmps_get_blocktime(void);
+void __kmps_set_dynamic(int arg);
+int __kmps_get_dynamic(void);
+void __kmps_set_library(int arg);
+int __kmps_get_library(void);
+void __kmps_set_nested(int arg);
+int __kmps_get_nested(void);
+void __kmps_set_stacksize(int arg);
+int __kmps_get_stacksize();
+
+#ifndef KMP_SCHED_TYPE_DEFINED
+#define KMP_SCHED_TYPE_DEFINED
+typedef enum kmp_sched {
+  kmp_sched_static = 1, // mapped to kmp_sch_static_chunked           (33)
+  kmp_sched_dynamic = 2, // mapped to kmp_sch_dynamic_chunked          (35)
+  kmp_sched_guided = 3, // mapped to kmp_sch_guided_chunked           (36)
+  kmp_sched_auto = 4, // mapped to kmp_sch_auto                     (38)
+  kmp_sched_default = kmp_sched_static // default scheduling
+} kmp_sched_t;
+#endif
+void __kmps_set_schedule(kmp_sched_t kind, int modifier);
+void __kmps_get_schedule(kmp_sched_t *kind, int *modifier);
+
+kmp_proc_bind_t __kmps_get_proc_bind(void);
+
+double __kmps_get_wtime();
+double __kmps_get_wtick();
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // KMP_STUB_H
+
+// end of file //
diff --git a/final/runtime/src/kmp_taskdeps.cpp b/final/runtime/src/kmp_taskdeps.cpp
new file mode 100644
index 0000000..db79dea
--- /dev/null
+++ b/final/runtime/src/kmp_taskdeps.cpp
@@ -0,0 +1,655 @@
+/*
+ * kmp_taskdeps.cpp
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//#define KMP_SUPPORT_GRAPH_OUTPUT 1
+
+#include "kmp.h"
+#include "kmp_io.h"
+#include "kmp_wait_release.h"
+#include "kmp_taskdeps.h"
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
+// TODO: Improve memory allocation? keep a list of pre-allocated structures?
+// allocate in blocks? re-use list finished list entries?
+// TODO: don't use atomic ref counters for stack-allocated nodes.
+// TODO: find an alternate to atomic refs for heap-allocated nodes?
+// TODO: Finish graph output support
+// TODO: kmp_lock_t seems a tad too big (and heavyweight) for this. Check other
+// runtime locks
+// TODO: Any ITT support needed?
+
+#ifdef KMP_SUPPORT_GRAPH_OUTPUT
+static std::atomic<kmp_int32> kmp_node_id_seed = ATOMIC_VAR_INIT(0);
+#endif
+
+static void __kmp_init_node(kmp_depnode_t *node) {
+  node->dn.successors = NULL; // no successors linked yet
+  node->dn.task = NULL; // will point to the right task
+  // once dependences have been processed
+  for (int i = 0; i < MAX_MTX_DEPS; ++i)
+    node->dn.mtx_locks[i] = NULL;
+  node->dn.mtx_num_locks = 0;
+  __kmp_init_lock(&node->dn.lock);
+  KMP_ATOMIC_ST_RLX(&node->dn.nrefs, 1); // init creates the first reference
+#ifdef KMP_SUPPORT_GRAPH_OUTPUT
+  node->dn.id = KMP_ATOMIC_INC(&kmp_node_id_seed);
+#endif
+}
+
+static inline kmp_depnode_t *__kmp_node_ref(kmp_depnode_t *node) {
+  KMP_ATOMIC_INC(&node->dn.nrefs); // take one more reference on the node
+  return node; // same pointer, so the call can be used inside an expression
+}
+
+enum { KMP_DEPHASH_OTHER_SIZE = 97, KMP_DEPHASH_MASTER_SIZE = 997 };
+
+static inline kmp_int32 __kmp_dephash_hash(kmp_intptr_t addr, size_t hsize) {
+  // TODO alternate to try: set = (((Addr64)(addrUsefulBits * 9.618)) %
+  // m_num_sets );
+  return ((addr >> 6) ^ (addr >> 2)) % hsize; // bucket index for addr
+}
+
+static kmp_dephash_t *__kmp_dephash_create(kmp_info_t *thread,
+                                           kmp_taskdata_t *current_task) {
+  kmp_dephash_t *h;
+
+  size_t h_size; // number of buckets
+
+  if (current_task->td_flags.tasktype == TASK_IMPLICIT)
+    h_size = KMP_DEPHASH_MASTER_SIZE; // implicit tasks get the larger table
+  else
+    h_size = KMP_DEPHASH_OTHER_SIZE;
+
+  kmp_int32 size = // bytes for the header plus h_size bucket pointers
+      h_size * sizeof(kmp_dephash_entry_t *) + sizeof(kmp_dephash_t);
+
+#if USE_FAST_MEMORY
+  h = (kmp_dephash_t *)__kmp_fast_allocate(thread, size);
+#else
+  h = (kmp_dephash_t *)__kmp_thread_malloc(thread, size);
+#endif
+  h->size = h_size;
+
+#ifdef KMP_DEBUG
+  h->nelements = 0;
+  h->nconflicts = 0;
+#endif
+  h->buckets = (kmp_dephash_entry **)(h + 1); // array follows the header
+
+  for (size_t i = 0; i < h_size; i++)
+    h->buckets[i] = 0; // every bucket starts empty
+
+  return h;
+}
+
+#define ENTRY_LAST_INS 0
+#define ENTRY_LAST_MTXS 1
+
+static kmp_dephash_entry *
+__kmp_dephash_find(kmp_info_t *thread, kmp_dephash_t *h, kmp_intptr_t addr) {
+  kmp_int32 bucket = __kmp_dephash_hash(addr, h->size);
+
+  kmp_dephash_entry_t *entry; // existing or newly created entry for addr
+  for (entry = h->buckets[bucket]; entry; entry = entry->next_in_bucket)
+    if (entry->addr == addr)
+      break;
+
+  if (entry == NULL) {
+// Create entry. This is only done by one thread so no locking is required.
+#if USE_FAST_MEMORY
+    entry = (kmp_dephash_entry_t *)__kmp_fast_allocate(
+        thread, sizeof(kmp_dephash_entry_t));
+#else
+    entry = (kmp_dephash_entry_t *)__kmp_thread_malloc(
+        thread, sizeof(kmp_dephash_entry_t));
+#endif
+    entry->addr = addr;
+    entry->last_out = NULL;
+    entry->last_ins = NULL;
+    entry->last_mtxs = NULL;
+    entry->last_flag = ENTRY_LAST_INS;
+    entry->mtx_lock = NULL;
+    entry->next_in_bucket = h->buckets[bucket]; // push at the bucket head
+    h->buckets[bucket] = entry;
+#ifdef KMP_DEBUG
+    h->nelements++;
+    if (entry->next_in_bucket) // bucket was not empty: count a conflict
+      h->nconflicts++;
+#endif
+  }
+  return entry;
+}
+
+static kmp_depnode_list_t *__kmp_add_node(kmp_info_t *thread,
+                                          kmp_depnode_list_t *list,
+                                          kmp_depnode_t *node) {
+  kmp_depnode_list_t *new_head; // list cell that is prepended to list
+
+#if USE_FAST_MEMORY
+  new_head = (kmp_depnode_list_t *)__kmp_fast_allocate(
+      thread, sizeof(kmp_depnode_list_t));
+#else
+  new_head = (kmp_depnode_list_t *)__kmp_thread_malloc(
+      thread, sizeof(kmp_depnode_list_t));
+#endif
+
+  new_head->node = __kmp_node_ref(node); // the cell takes its own reference
+  new_head->next = list;
+
+  return new_head; // new head of the list
+}
+
+static inline void __kmp_track_dependence(kmp_depnode_t *source,
+                                          kmp_depnode_t *sink,
+                                          kmp_task_t *sink_task) {
+#ifdef KMP_SUPPORT_GRAPH_OUTPUT
+  kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task);
+  // do not use sink->dn.task as that is only filled after the dependencies
+  // are already processed!
+  kmp_taskdata_t *task_sink = KMP_TASK_TO_TASKDATA(sink_task);
+
+  __kmp_printf("%d(%s) -> %d(%s)\n", source->dn.id,
+               task_source->td_ident->psource, sink->dn.id,
+               task_sink->td_ident->psource);
+#endif
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  /* OMPT tracks dependences between task (a=source, b=sink) in which
+     task a blocks the execution of b through the ompt_new_dependence_callback
+     */
+  if (ompt_enabled.ompt_callback_task_dependence) {
+    kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task);
+    kmp_taskdata_t *task_sink = KMP_TASK_TO_TASKDATA(sink_task);
+
+    ompt_callbacks.ompt_callback(ompt_callback_task_dependence)(
+        &(task_source->ompt_task_info.task_data),
+        &(task_sink->ompt_task_info.task_data));
+  }
+#endif /* OMPT_SUPPORT && OMPT_OPTIONAL */
+}
+
+static inline kmp_int32
+__kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread,
+                             kmp_task_t *task, kmp_depnode_t *node,
+                             kmp_depnode_list_t *plist) {
+  if (!plist)
+    return 0; // empty list: nothing to link
+  kmp_int32 npredecessors = 0;
+  // link node as successor of list elements
+  for (kmp_depnode_list_t *p = plist; p; p = p->next) {
+    kmp_depnode_t *dep = p->node;
+    if (dep->dn.task) { // first check, before KMP_ACQUIRE_DEPNODE
+      KMP_ACQUIRE_DEPNODE(gtid, dep);
+      if (dep->dn.task) { // re-check after KMP_ACQUIRE_DEPNODE
+        __kmp_track_dependence(dep, node, task);
+        dep->dn.successors = __kmp_add_node(thread, dep->dn.successors, node);
+        KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
+                      "%p\n",
+                      gtid, KMP_TASK_TO_TASKDATA(dep->dn.task),
+                      KMP_TASK_TO_TASKDATA(task)));
+        npredecessors++;
+      }
+      KMP_RELEASE_DEPNODE(gtid, dep);
+    }
+  }
+  return npredecessors; // number of list nodes that node was linked to
+}
+
+static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid,
+                                                     kmp_info_t *thread,
+                                                     kmp_task_t *task,
+                                                     kmp_depnode_t *source,
+                                                     kmp_depnode_t *sink) {
+  if (!sink)
+    return 0; // nothing to link against
+  kmp_int32 npredecessors = 0;
+  if (sink->dn.task) { // first check, before KMP_ACQUIRE_DEPNODE
+    // synchronously add source to sink's list of successors
+    KMP_ACQUIRE_DEPNODE(gtid, sink);
+    if (sink->dn.task) { // re-check after KMP_ACQUIRE_DEPNODE
+      __kmp_track_dependence(sink, source, task); // sink is the predecessor
+      sink->dn.successors = __kmp_add_node(thread, sink->dn.successors, source);
+      KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
+                    "%p\n",
+                    gtid, KMP_TASK_TO_TASKDATA(sink->dn.task),
+                    KMP_TASK_TO_TASKDATA(task)));
+      npredecessors++;
+    }
+    KMP_RELEASE_DEPNODE(gtid, sink);
+  }
+  return npredecessors; // 1 if source was linked after sink, otherwise 0
+}
+
+template <bool filter>
+static inline kmp_int32
+__kmp_process_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t *hash,
+                   bool dep_barrier, kmp_int32 ndeps,
+                   kmp_depend_info_t *dep_list, kmp_task_t *task) {
+  KA_TRACE(30, ("__kmp_process_deps<%d>: T#%d processing %d dependencies : "
+                "dep_barrier = %d\n",
+                filter, gtid, ndeps, dep_barrier));
+
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_int32 npredecessors = 0;
+  for (kmp_int32 i = 0; i < ndeps; i++) {
+    const kmp_depend_info_t *dep = &dep_list[i];
+
+    if (filter && dep->base_addr == 0)
+      continue; // skip filtered entries
+
+    kmp_dephash_entry_t *info =
+        __kmp_dephash_find(thread, hash, dep->base_addr);
+    kmp_depnode_t *last_out = info->last_out;
+    kmp_depnode_list_t *last_ins = info->last_ins;
+    kmp_depnode_list_t *last_mtxs = info->last_mtxs;
+
+    if (dep->flags.out) { // out --> clean lists of ins and mtxs if any
+      if (last_ins || last_mtxs) {
+        if (info->last_flag == ENTRY_LAST_INS) { // INS were last
+          npredecessors +=
+              __kmp_depnode_link_successor(gtid, thread, task, node, last_ins);
+        } else { // MTXS were last
+          npredecessors +=
+              __kmp_depnode_link_successor(gtid, thread, task, node, last_mtxs);
+        }
+        __kmp_depnode_list_free(thread, last_ins);
+        __kmp_depnode_list_free(thread, last_mtxs);
+        info->last_ins = NULL;
+        info->last_mtxs = NULL;
+      } else {
+        npredecessors +=
+            __kmp_depnode_link_successor(gtid, thread, task, node, last_out);
+      }
+      __kmp_node_deref(thread, last_out);
+      if (dep_barrier) {
+        // if this is a sync point in the serial sequence, then the previous
+        // outputs are guaranteed to be completed after the execution of this
+        // task so the previous output nodes can be cleared.
+        info->last_out = NULL;
+      } else {
+        info->last_out = __kmp_node_ref(node);
+      }
+    } else if (dep->flags.in) {
+      // in --> link node to either last_out or last_mtxs, clean earlier deps
+      if (last_mtxs) {
+        npredecessors +=
+            __kmp_depnode_link_successor(gtid, thread, task, node, last_mtxs);
+        __kmp_node_deref(thread, last_out);
+        info->last_out = NULL;
+        if (info->last_flag == ENTRY_LAST_MTXS && last_ins) { // MTXS were last
+          // clean old INS before creating new list
+          __kmp_depnode_list_free(thread, last_ins);
+          info->last_ins = NULL;
+        }
+      } else {
+        // link node as successor of the last_out if any
+        npredecessors +=
+            __kmp_depnode_link_successor(gtid, thread, task, node, last_out);
+      }
+      info->last_flag = ENTRY_LAST_INS;
+      info->last_ins = __kmp_add_node(thread, info->last_ins, node);
+    } else {
+      KMP_DEBUG_ASSERT(dep->flags.mtx == 1);
+      // mtx --> link node to either last_out or last_ins, clean earlier deps
+      if (last_ins) {
+        npredecessors +=
+            __kmp_depnode_link_successor(gtid, thread, task, node, last_ins);
+        __kmp_node_deref(thread, last_out);
+        info->last_out = NULL;
+        if (info->last_flag == ENTRY_LAST_INS && last_mtxs) { // INS were last
+          // clean old MTXS before creating new list
+          __kmp_depnode_list_free(thread, last_mtxs);
+          info->last_mtxs = NULL;
+        }
+      } else {
+        // link node as successor of the last_out if any
+        npredecessors +=
+            __kmp_depnode_link_successor(gtid, thread, task, node, last_out);
+      }
+      info->last_flag = ENTRY_LAST_MTXS;
+      info->last_mtxs = __kmp_add_node(thread, info->last_mtxs, node);
+      if (info->mtx_lock == NULL) {
+        info->mtx_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
+        __kmp_init_lock(info->mtx_lock);
+      }
+      KMP_DEBUG_ASSERT(node->dn.mtx_num_locks < MAX_MTX_DEPS);
+      kmp_int32 m;
+      // Save lock in node's array
+      for (m = 0; m < MAX_MTX_DEPS; ++m) {
+        // sort pointers in decreasing order to avoid potential livelock
+        if (node->dn.mtx_locks[m] < info->mtx_lock) {
+          KMP_DEBUG_ASSERT(node->dn.mtx_locks[node->dn.mtx_num_locks] == NULL);
+          for (int n = node->dn.mtx_num_locks; n > m; --n) {
+            // shift right all lesser non-NULL pointers
+            KMP_DEBUG_ASSERT(node->dn.mtx_locks[n - 1] != NULL);
+            node->dn.mtx_locks[n] = node->dn.mtx_locks[n - 1];
+          }
+          node->dn.mtx_locks[m] = info->mtx_lock;
+          break;
+        }
+      }
+      KMP_DEBUG_ASSERT(m < MAX_MTX_DEPS); // must break from loop
+      node->dn.mtx_num_locks++;
+    }
+  }
+  KA_TRACE(30, ("__kmp_process_deps<%d>: T#%d found %d predecessors\n", filter,
+                gtid, npredecessors));
+  return npredecessors;
+}
+
+#define NO_DEP_BARRIER (false)
+#define DEP_BARRIER (true)
+
+// Returns true if the task still has any outstanding dependence.
+static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node,
+                             kmp_task_t *task, kmp_dephash_t *hash,
+                             bool dep_barrier, kmp_int32 ndeps,
+                             kmp_depend_info_t *dep_list,
+                             kmp_int32 ndeps_noalias,
+                             kmp_depend_info_t *noalias_dep_list) {
+  int i, n_mtxs = 0;
+#if KMP_DEBUG
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+#endif
+  KA_TRACE(20, ("__kmp_check_deps: T#%d checking dependencies for task %p : %d "
+                "possibly aliased dependencies, %d non-aliased depedencies : "
+                "dep_barrier=%d .\n",
+                gtid, taskdata, ndeps, ndeps_noalias, dep_barrier));
+
+  // Filter deps in dep_list: merge entries with equal base_addr into the first
+  // TODO: Different algorithm for large dep_list ( > 10 ? )
+  for (i = 0; i < ndeps; i++) {
+    if (dep_list[i].base_addr != 0) {
+      for (int j = i + 1; j < ndeps; j++) {
+        if (dep_list[i].base_addr == dep_list[j].base_addr) {
+          dep_list[i].flags.in |= dep_list[j].flags.in;
+          dep_list[i].flags.out |=
+              (dep_list[j].flags.out ||
+               (dep_list[i].flags.in && dep_list[j].flags.mtx) ||
+               (dep_list[i].flags.mtx && dep_list[j].flags.in));
+          dep_list[i].flags.mtx = // parsed as (mtx_i | mtx_j) && !out_i
+              dep_list[i].flags.mtx | dep_list[j].flags.mtx &&
+              !dep_list[i].flags.out;
+          dep_list[j].base_addr = 0; // Mark j element as void
+        }
+      }
+      if (dep_list[i].flags.mtx) {
+        // limit number of mtx deps to MAX_MTX_DEPS per node
+        if (n_mtxs < MAX_MTX_DEPS && task != NULL) {
+          ++n_mtxs;
+        } else {
+          dep_list[i].flags.in = 1; // downgrade mutexinoutset to inout
+          dep_list[i].flags.out = 1;
+          dep_list[i].flags.mtx = 0;
+        }
+      }
+    }
+  }
+
+  // doesn't need to be atomic as no other thread is going to be accessing this
+  // node just yet.
+  // npredecessors is set -1 to ensure that none of the releasing tasks queues
+  // this task before we have finished processing all the dependencies
+  node->dn.npredecessors = -1;
+
+  // used to pack all npredecessors additions into a single atomic operation at
+  // the end
+  int npredecessors;
+
+  npredecessors = __kmp_process_deps<true>(gtid, node, hash, dep_barrier, ndeps,
+                                           dep_list, task);
+  npredecessors += __kmp_process_deps<false>(
+      gtid, node, hash, dep_barrier, ndeps_noalias, noalias_dep_list, task);
+
+  node->dn.task = task;
+  KMP_MB();
+
+  // Account for our initial fake value
+  npredecessors++;
+
+  // Update predecessors and obtain current value to check if there are still
+  // any outstanding dependences (some tasks may have finished while we
+  // processed the dependences)
+  npredecessors =
+      node->dn.npredecessors.fetch_add(npredecessors) + npredecessors;
+
+  KA_TRACE(20, ("__kmp_check_deps: T#%d found %d predecessors for task %p \n",
+                gtid, npredecessors, taskdata));
+
+  // beyond this point the task could be queued (and executed) by a releasing
+  // task...
+  return npredecessors > 0 ? true : false;
+}
+
+/*!
+@ingroup TASKING
+@param loc_ref location of the original task directive
+@param gtid Global Thread ID of encountering thread
+@param new_task task thunk allocated by __kmp_omp_task_alloc() for the ''new
+task''
+@param ndeps Number of depend items with possible aliasing
+@param dep_list List of depend items with possible aliasing
+@param ndeps_noalias Number of depend items with no aliasing
+@param noalias_dep_list List of depend items with no aliasing
+
+@return Returns either TASK_CURRENT_NOT_QUEUED if the current task was not
+suspended and queued, or TASK_CURRENT_QUEUED if it was suspended and queued
+
+Schedule a non-thread-switchable task with dependences for execution
+*/
+kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
+                                    kmp_task_t *new_task, kmp_int32 ndeps,
+                                    kmp_depend_info_t *dep_list,
+                                    kmp_int32 ndeps_noalias,
+                                    kmp_depend_info_t *noalias_dep_list) {
+
+  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
+  KA_TRACE(10, ("__kmpc_omp_task_with_deps(enter): T#%d loc=%p task=%p\n", gtid,
+                loc_ref, new_taskdata));
+
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_taskdata_t *current_task = thread->th.th_current_task;
+
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+    if (!current_task->ompt_task_info.frame.enter_frame.ptr)
+      current_task->ompt_task_info.frame.enter_frame.ptr =
+          OMPT_GET_FRAME_ADDRESS(0);
+    if (ompt_enabled.ompt_callback_task_create) {
+      ompt_data_t task_data = ompt_data_none;
+      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
+          current_task ? &(current_task->ompt_task_info.task_data) : &task_data,
+          current_task ? &(current_task->ompt_task_info.frame) : NULL,
+          &(new_taskdata->ompt_task_info.task_data),
+          ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 1,
+          OMPT_LOAD_RETURN_ADDRESS(gtid));
+    }
+
+    new_taskdata->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+  }
+
+#if OMPT_OPTIONAL
+  /* OMPT grab all dependences if requested by the tool */
+  if (ndeps + ndeps_noalias > 0 &&
+      ompt_enabled.ompt_callback_dependences) {
+    kmp_int32 i;
+
+    new_taskdata->ompt_task_info.ndeps = ndeps + ndeps_noalias;
+    new_taskdata->ompt_task_info.deps =
+        (ompt_dependence_t *)KMP_OMPT_DEPS_ALLOC(
+            thread, (ndeps + ndeps_noalias) * sizeof(ompt_dependence_t));
+
+    KMP_ASSERT(new_taskdata->ompt_task_info.deps != NULL);
+    // NOTE(review): deps with neither in nor out set get no dependence_type.
+    for (i = 0; i < ndeps; i++) {
+      new_taskdata->ompt_task_info.deps[i].variable.ptr =
+          (void *)dep_list[i].base_addr;
+      if (dep_list[i].flags.in && dep_list[i].flags.out)
+        new_taskdata->ompt_task_info.deps[i].dependence_type =
+            ompt_dependence_type_inout;
+      else if (dep_list[i].flags.out)
+        new_taskdata->ompt_task_info.deps[i].dependence_type =
+            ompt_dependence_type_out;
+      else if (dep_list[i].flags.in)
+        new_taskdata->ompt_task_info.deps[i].dependence_type =
+            ompt_dependence_type_in;
+    }
+    for (i = 0; i < ndeps_noalias; i++) {
+      new_taskdata->ompt_task_info.deps[ndeps + i].variable.ptr =
+          (void *)noalias_dep_list[i].base_addr;
+      if (noalias_dep_list[i].flags.in && noalias_dep_list[i].flags.out)
+        new_taskdata->ompt_task_info.deps[ndeps + i].dependence_type =
+            ompt_dependence_type_inout;
+      else if (noalias_dep_list[i].flags.out)
+        new_taskdata->ompt_task_info.deps[ndeps + i].dependence_type =
+            ompt_dependence_type_out;
+      else if (noalias_dep_list[i].flags.in)
+        new_taskdata->ompt_task_info.deps[ndeps + i].dependence_type =
+            ompt_dependence_type_in;
+    }
+    ompt_callbacks.ompt_callback(ompt_callback_dependences)(
+        &(new_taskdata->ompt_task_info.task_data),
+        new_taskdata->ompt_task_info.deps, new_taskdata->ompt_task_info.ndeps);
+    /* We can now free the allocated memory for the dependencies */
+    /* For OMPD we might want to delay the free until task_end */
+    KMP_OMPT_DEPS_FREE(thread, new_taskdata->ompt_task_info.deps);
+    new_taskdata->ompt_task_info.deps = NULL;
+    new_taskdata->ompt_task_info.ndeps = 0;
+  }
+#endif /* OMPT_OPTIONAL */
+#endif /* OMPT_SUPPORT */
+  // Deps are ignored for serialized/final parents unless proxy tasks exist.
+  bool serial = current_task->td_flags.team_serial ||
+                current_task->td_flags.tasking_ser ||
+                current_task->td_flags.final;
+  kmp_task_team_t *task_team = thread->th.th_task_team;
+  serial = serial && !(task_team && task_team->tt.tt_found_proxy_tasks);
+
+  if (!serial && (ndeps > 0 || ndeps_noalias > 0)) {
+    /* if no dependencies have been tracked yet, create the dependence hash */
+    if (current_task->td_dephash == NULL)
+      current_task->td_dephash = __kmp_dephash_create(thread, current_task);
+
+#if USE_FAST_MEMORY
+    kmp_depnode_t *node =
+        (kmp_depnode_t *)__kmp_fast_allocate(thread, sizeof(kmp_depnode_t));
+#else
+    kmp_depnode_t *node =
+        (kmp_depnode_t *)__kmp_thread_malloc(thread, sizeof(kmp_depnode_t));
+#endif
+
+    __kmp_init_node(node);
+    new_taskdata->td_depnode = node;
+    // Pending predecessors: return without handing the task to __kmp_omp_task
+    if (__kmp_check_deps(gtid, node, new_task, current_task->td_dephash,
+                         NO_DEP_BARRIER, ndeps, dep_list, ndeps_noalias,
+                         noalias_dep_list)) {
+      KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d task had blocking "
+                    "dependencies: "
+                    "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
+                    gtid, loc_ref, new_taskdata));
+#if OMPT_SUPPORT
+      if (ompt_enabled.enabled) {
+        current_task->ompt_task_info.frame.enter_frame = ompt_data_none;
+      }
+#endif
+      return TASK_CURRENT_NOT_QUEUED;
+    }
+  } else {
+    KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d ignored dependencies "
+                  "for task (serialized)"
+                  "loc=%p task=%p\n",
+                  gtid, loc_ref, new_taskdata));
+  }
+
+  KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d task had no blocking "
+                "dependencies : "
+                "loc=%p task=%p, transferring to __kmp_omp_task\n",
+                gtid, loc_ref, new_taskdata));
+
+  kmp_int32 ret = __kmp_omp_task(gtid, new_task, true);
+#if OMPT_SUPPORT
+  if (ompt_enabled.enabled) {
+    current_task->ompt_task_info.frame.enter_frame = ompt_data_none;
+  }
+#endif
+  return ret;
+}
+
+/*!
+@ingroup TASKING
+@param loc_ref location of the original task directive
+@param gtid Global Thread ID of encountering thread
+@param ndeps Number of depend items with possible aliasing
+@param dep_list List of depend items with possible aliasing
+@param ndeps_noalias Number of depend items with no aliasing
+@param noalias_dep_list List of depend items with no aliasing
+
+Blocks the current task until all specified dependencies have been fulfilled.
+*/
+void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps,
+                          kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias,
+                          kmp_depend_info_t *noalias_dep_list) {
+  KA_TRACE(10, ("__kmpc_omp_wait_deps(enter): T#%d loc=%p\n", gtid, loc_ref));
+
+  if (!ndeps && !ndeps_noalias) {
+    KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no dependencies to "
+                  "wait upon : loc=%p\n",
+                  gtid, loc_ref));
+    return;
+  }
+
+  kmp_info_t *this_thr = __kmp_threads[gtid];
+  kmp_taskdata_t *waiter = this_thr->th.th_current_task;
+  kmp_task_team_t *task_team = this_thr->th.th_task_team;
+
+  // The wait is skipped when either
+  // - dependences are not computed: serial team (except with proxy tasks), or
+  // - the dephash does not exist yet, i.e. there is nothing to wait for
+  const bool serialized = waiter->td_flags.team_serial ||
+                          waiter->td_flags.tasking_ser ||
+                          waiter->td_flags.final;
+  const bool no_proxy_tasks =
+      task_team != NULL && task_team->tt.tt_found_proxy_tasks == FALSE;
+  if ((serialized && no_proxy_tasks) || waiter->td_dephash == NULL) {
+    KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no blocking "
+                  "dependencies : loc=%p\n",
+                  gtid, loc_ref));
+    return;
+  }
+
+  kmp_depnode_t node = {0};
+  __kmp_init_node(&node);
+  const bool blocked = __kmp_check_deps(
+      gtid, &node, NULL, waiter->td_dephash, DEP_BARRIER, ndeps, dep_list,
+      ndeps_noalias, noalias_dep_list);
+  if (!blocked) {
+    KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no blocking "
+                  "dependencies : loc=%p\n",
+                  gtid, loc_ref));
+    return;
+  }
+
+  // keep executing tasks until the local node has no predecessors left
+  int thread_finished = FALSE;
+  kmp_flag_32 flag((std::atomic<kmp_uint32> *)&node.dn.npredecessors, 0U);
+  while (node.dn.npredecessors > 0) {
+    flag.execute_tasks(this_thr, gtid, FALSE,
+                       &thread_finished USE_ITT_BUILD_ARG(NULL),
+                       __kmp_task_stealing_constraint);
+  }
+
+  KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d finished waiting : loc=%p\n",
+                gtid, loc_ref));
+}
diff --git a/final/runtime/src/kmp_taskdeps.h b/final/runtime/src/kmp_taskdeps.h
new file mode 100644
index 0000000..2a712b3
--- /dev/null
+++ b/final/runtime/src/kmp_taskdeps.h
@@ -0,0 +1,145 @@
+/*
+ * kmp_taskdeps.h
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef KMP_TASKDEPS_H
+#define KMP_TASKDEPS_H
+
+#include "kmp.h"
+// Acquire / release the lock embedded in depnode n on behalf of thread gtid.
+#define KMP_ACQUIRE_DEPNODE(gtid, n) __kmp_acquire_lock(&(n)->dn.lock, (gtid))
+#define KMP_RELEASE_DEPNODE(gtid, n) __kmp_release_lock(&(n)->dn.lock, (gtid))
+
+static inline void __kmp_node_deref(kmp_info_t *thread, kmp_depnode_t *node) {
+  // Drop one reference on node (NULL is ignored); free it when none remain.
+  if (node == NULL)
+    return;
+  const kmp_int32 remaining = KMP_ATOMIC_DEC(&node->dn.nrefs) - 1;
+  if (remaining != 0)
+    return; // this was not the last reference
+  KMP_ASSERT(node->dn.nrefs == 0);
+#if USE_FAST_MEMORY
+  __kmp_fast_free(thread, node);
+#else
+  __kmp_thread_free(thread, node);
+#endif
+}
+
+static inline void __kmp_depnode_list_free(kmp_info_t *thread,
+                                           kmp_depnode_list *list) {
+  // Release every element of the list: drop the element's reference on its
+  // depnode, then free the list cell itself.
+  while (list != NULL) {
+    kmp_depnode_list *cell = list;
+    list = cell->next; // read the link before the cell is freed
+    __kmp_node_deref(thread, cell->node);
+#if USE_FAST_MEMORY
+    __kmp_fast_free(thread, cell);
+#else
+    __kmp_thread_free(thread, cell);
+#endif
+  }
+}
+
+static inline void __kmp_dephash_free_entries(kmp_info_t *thread,
+                                              kmp_dephash_t *h) {
+  for (size_t slot = 0; slot < h->size; ++slot) { // visit every bucket of h
+    kmp_dephash_entry_t *entry = h->buckets[slot];
+    if (entry == NULL)
+      continue; // nothing chained in this bucket
+    for (kmp_dephash_entry_t *rest; entry != NULL; entry = rest) {
+      rest = entry->next_in_bucket; // saved before entry is freed
+      __kmp_depnode_list_free(thread, entry->last_ins);
+      __kmp_depnode_list_free(thread, entry->last_mtxs);
+      __kmp_node_deref(thread, entry->last_out);
+      if (entry->mtx_lock) {
+        __kmp_destroy_lock(entry->mtx_lock);
+        __kmp_free(entry->mtx_lock);
+      }
+#if USE_FAST_MEMORY
+      __kmp_fast_free(thread, entry);
+#else
+      __kmp_thread_free(thread, entry);
+#endif
+    }
+    h->buckets[slot] = 0; // the bucket no longer holds any entry
+  }
+}
+
+static inline void __kmp_dephash_free(kmp_info_t *thread, kmp_dephash_t *h) {
+  __kmp_dephash_free_entries(thread, h); // release every entry first
+#if USE_FAST_MEMORY
+  __kmp_fast_free(thread, h); // then the hash structure itself
+#else
+  __kmp_thread_free(thread, h); // then the hash structure itself
+#endif
+}
+
+static inline void __kmp_release_deps(kmp_int32 gtid, kmp_taskdata_t *task) {
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_depnode_t *node = task->td_depnode;
+  // Free the task's own dephash (if any), then notify the node's successors.
+  if (task->td_dephash) {
+    KA_TRACE(
+        40, ("__kmp_release_deps: T#%d freeing dependencies hash of task %p.\n",
+             gtid, task));
+    __kmp_dephash_free(thread, task->td_dephash);
+    task->td_dephash = NULL;
+  }
+
+  if (!node)
+    return;
+
+  KA_TRACE(20, ("__kmp_release_deps: T#%d notifying successors of task %p.\n",
+                gtid, task));
+
+  KMP_ACQUIRE_DEPNODE(gtid, node);
+  node->dn.task =
+      NULL; // mark this task as finished, so no new dependencies are generated
+  KMP_RELEASE_DEPNODE(gtid, node);
+  // Walk the successor list, taking one predecessor off each successor node.
+  kmp_depnode_list_t *next;
+  for (kmp_depnode_list_t *p = node->dn.successors; p; p = next) {
+    kmp_depnode_t *successor = p->node;
+    kmp_int32 npredecessors = KMP_ATOMIC_DEC(&successor->dn.npredecessors) - 1;
+
+    // successor task can be NULL for wait_depends or because deps are still
+    // being processed
+    if (npredecessors == 0) {
+      KMP_MB();
+      if (successor->dn.task) {
+        KA_TRACE(20, ("__kmp_release_deps: T#%d successor %p of %p scheduled "
+                      "for execution.\n",
+                      gtid, successor->dn.task, task));
+        __kmp_omp_task(gtid, successor->dn.task, false);
+      }
+    }
+    // Read the link first: the list element p is freed just below.
+    next = p->next;
+    __kmp_node_deref(thread, p->node);
+#if USE_FAST_MEMORY
+    __kmp_fast_free(thread, p);
+#else
+    __kmp_thread_free(thread, p);
+#endif
+  }
+  // Finally drop one reference on this task's own depnode.
+  __kmp_node_deref(thread, node);
+
+  KA_TRACE(
+      20,
+      ("__kmp_release_deps: T#%d all successors of %p notified of completion\n",
+       gtid, task));
+}
+
+#endif // KMP_TASKDEPS_H
diff --git a/final/runtime/src/kmp_tasking.cpp b/final/runtime/src/kmp_tasking.cpp
new file mode 100644
index 0000000..d037299
--- /dev/null
+++ b/final/runtime/src/kmp_tasking.cpp
@@ -0,0 +1,4532 @@
+/*
+ * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+#include "kmp_i18n.h"
+#include "kmp_itt.h"
+#include "kmp_stats.h"
+#include "kmp_wait_release.h"
+#include "kmp_taskdeps.h"
+
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
+#include "tsan_annotations.h"
+
+/* forward declaration */
+static void __kmp_enable_tasking(kmp_task_team_t *task_team,
+                                 kmp_info_t *this_thr);
+static void __kmp_alloc_task_deque(kmp_info_t *thread,
+                                   kmp_thread_data_t *thread_data);
+static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
+                                           kmp_task_team_t *task_team);
+static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
+
+#ifdef BUILD_TIED_TASK_STACK
+
+//  __kmp_trace_task_stack: print the tied tasks from the task stack in order
+//  from top to bottom (the stack itself is left unmodified)
+//
+//  gtid: global thread identifier for thread containing stack
+//  thread_data: thread data for task team thread containing stack
+//  threshold: value above which the trace statement triggers
+//  location: string identifying call site of this function (for trace)
+static void __kmp_trace_task_stack(kmp_int32 gtid,
+                                   kmp_thread_data_t *thread_data,
+                                   int threshold, char *location) {
+  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
+  kmp_taskdata_t **stack_top = task_stack->ts_top;
+  kmp_int32 entries = task_stack->ts_entries;
+  kmp_taskdata_t *tied_task;
+
+  KA_TRACE(
+      threshold,
+      ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
+       "first_block = %p, stack_top = %p \n",
+       location, gtid, entries, task_stack->ts_first_block, stack_top));
+
+  KMP_DEBUG_ASSERT(stack_top != NULL);
+  KMP_DEBUG_ASSERT(entries > 0);
+
+  while (entries != 0) {
+    KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
+    // at a block boundary: continue from the end of the previous block
+    if ((entries & TASK_STACK_INDEX_MASK) == 0) {
+      kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
+
+      stack_block = stack_block->sb_prev;
+      stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
+    }
+
+    // finish bookkeeping
+    stack_top--;
+    entries--;
+
+    tied_task = *stack_top;
+
+    KMP_DEBUG_ASSERT(tied_task != NULL);
+    KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
+
+    KA_TRACE(threshold,
+             ("__kmp_trace_task_stack(%s):             gtid=%d, entry=%d, "
+              "stack_top=%p, tied_task=%p\n",
+              location, gtid, entries, stack_top, tied_task));
+  }
+  KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
+
+  KA_TRACE(threshold,
+           ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
+            location, gtid));
+}
+
+//  __kmp_init_task_stack: one-time setup of the suspended-tied-task stack of
+//  a newly created thread_data structure.
+//  It should not be necessary to do this again (assuming the stack works).
+//
+//  gtid: global thread identifier of calling thread (not used here)
+//  thread_data: thread data for task team thread containing stack
+static void __kmp_init_task_stack(kmp_int32 gtid,
+                                  kmp_thread_data_t *thread_data) {
+  kmp_task_stack_t *stack = &thread_data->td.td_susp_tied_tasks;
+  kmp_stack_block_t *head_block = &stack->ts_first_block;
+
+  // zero the task slots of the first block
+  memset((void *)head_block, '\0',
+         TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
+  // the first block has no neighbours yet
+  head_block->sb_prev = NULL;
+  head_block->sb_next = NULL;
+
+  // empty stack: the top sits at the very start of the first block
+  stack->ts_top = (kmp_taskdata_t **)head_block;
+  stack->ts_entries = TASK_STACK_EMPTY;
+}
+
+//  __kmp_free_task_stack: free the task stack when thread_data is destroyed.
+//  The first block is embedded in the stack and is therefore never freed.
+//  gtid: global thread identifier for calling thread
+//  thread_data: thread info for thread containing stack
+static void __kmp_free_task_stack(kmp_int32 gtid,
+                                  kmp_thread_data_t *thread_data) {
+  kmp_info_t *thread = __kmp_threads[gtid]; // needed by __kmp_thread_free
+  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
+  kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
+
+  KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
+  // unlink every block; free all of them except the embedded first one
+  while (stack_block != NULL) {
+    kmp_stack_block_t *next_block = stack_block->sb_next;
+
+    stack_block->sb_next = NULL;
+    stack_block->sb_prev = NULL;
+    if (stack_block != &task_stack->ts_first_block) {
+      __kmp_thread_free(thread, stack_block);
+    }
+    stack_block = next_block;
+  }
+  // leave the stack empty and without a top pointer
+  task_stack->ts_entries = 0;
+  task_stack->ts_top = NULL;
+}
+
+//  __kmp_push_task_stack: Push the tied task onto the task stack.
+//     Grow the stack if necessary by allocating another block.
+//     Nothing is pushed when the team or its tasks are serialized.
+//  gtid: global thread identifier for calling thread
+//  thread: thread info for thread containing stack
+//  tied_task: the task to push on the stack
+static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
+                                  kmp_taskdata_t *tied_task) {
+  // GEH - need to consider what to do if tt_threads_data not allocated yet
+  kmp_thread_data_t *thread_data =
+      &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
+  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
+
+  if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
+    return; // Don't push anything on stack if team or team tasks are serialized
+  }
+
+  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
+  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
+
+  KA_TRACE(20,
+           ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
+            gtid, thread, tied_task));
+  // Store entry
+  *(task_stack->ts_top) = tied_task;
+
+  // Do bookkeeping for next push
+  task_stack->ts_top++;
+  task_stack->ts_entries++;
+
+  // The current block is full once the entry count reaches a block multiple
+  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
+    // Find beginning of this task block
+    kmp_stack_block_t *stack_block =
+        (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
+
+    // Check if we already have a block
+    if (stack_block->sb_next !=
+        NULL) { // reset ts_top to beginning of next block
+      task_stack->ts_top = &stack_block->sb_next->sb_block[0];
+    } else { // Alloc new block and link it up
+      kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
+          thread, sizeof(kmp_stack_block_t));
+
+      task_stack->ts_top = &new_block->sb_block[0];
+      stack_block->sb_next = new_block;
+      new_block->sb_prev = stack_block;
+      new_block->sb_next = NULL;
+
+      KA_TRACE(
+          30,
+          ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
+           gtid, tied_task, new_block));
+    }
+  }
+  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
+                tied_task));
+}
+
+//  __kmp_pop_task_stack: Pop the tied task from the task stack.  Don't return
+//  the task, just check to make sure it matches the ending task passed in.
+//  Nothing is popped when the team or its tasks are serialized.
+//
+//  gtid: global thread identifier for the calling thread
+//  thread: thread info structure containing stack
+//  ending_task: the task that is ending (should match popped task)
+static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
+                                 kmp_taskdata_t *ending_task) {
+  // GEH - need to consider what to do if tt_threads_data not allocated yet
+  kmp_thread_data_t *thread_data =
+      &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
+  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
+  kmp_taskdata_t *tied_task;
+
+  if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
+    // Don't pop anything from stack if team or team tasks are serialized
+    return;
+  }
+
+  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
+  KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
+
+  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
+                thread));
+
+  // fix up ts_top if we need to pop from previous block
+  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
+    kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
+
+    stack_block = stack_block->sb_prev;
+    task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
+  }
+
+  // finish bookkeeping
+  task_stack->ts_top--;
+  task_stack->ts_entries--;
+
+  tied_task = *(task_stack->ts_top);
+
+  KMP_DEBUG_ASSERT(tied_task != NULL);
+  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
+  KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
+
+  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
+                tied_task));
+  return;
+}
+#endif /* BUILD_TIED_TASK_STACK */
+
+// Returns true if tasknew is allowed to execute now, false otherwise.
+// Checks the Task Scheduling Constraint (if requested) and tries to acquire
+// the locks of the task's mutexinoutset dependencies, if any.
+static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
+                                  const kmp_taskdata_t *tasknew,
+                                  const kmp_taskdata_t *taskcurr) {
+  if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
+    // Task Scheduling Constraint (TSC): the candidate must descend from every
+    // deferred tied task; testing the last one suffices because it is itself
+    // a descendant of all the others
+    kmp_taskdata_t *last_tied = taskcurr->td_last_tied;
+    KMP_DEBUG_ASSERT(last_tied != NULL);
+    // skip the test if that task is suspended on a barrier (value <= 0)
+    if (last_tied->td_flags.tasktype == TASK_EXPLICIT ||
+        last_tied->td_taskwait_thread > 0) {
+      const kmp_int32 tied_level = last_tied->td_level;
+      kmp_taskdata_t *ancestor = tasknew->td_parent;
+      // climb the ancestry no higher than the level of last_tied
+      while (ancestor != last_tied && ancestor->td_level > tied_level) {
+        ancestor = ancestor->td_parent;
+        KMP_DEBUG_ASSERT(ancestor != NULL);
+      }
+      if (ancestor != last_tied)
+        return false;
+    }
+  }
+  // Check mutexinoutset dependencies, acquire locks
+  kmp_depnode_t *node = tasknew->td_depnode;
+  if (node == NULL || node->dn.mtx_num_locks <= 0)
+    return true;
+  for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
+    KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
+    if (!__kmp_test_lock(node->dn.mtx_locks[i], gtid)) {
+      // lock i is busy: give back locks i-1 .. 0 and refuse the task
+      while (i-- > 0)
+        __kmp_release_lock(node->dn.mtx_locks[i], gtid);
+      return false;
+    }
+  }
+  // negative num_locks means all locks acquired successfully
+  node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
+  return true;
+}
+
+// __kmp_realloc_task_deque:
+// Doubles the capacity of a thread's task deque: the old slots are copied,
+// starting from the head, to the front of a new array and the bookkeeping
+// fields are updated. This operation must be done with the deque_lock held
+static void __kmp_realloc_task_deque(kmp_info_t *thread,
+                                     kmp_thread_data_t *thread_data) {
+  const kmp_int32 cap = TASK_DEQUE_SIZE(thread_data->td);
+  const kmp_int32 new_cap = 2 * cap;
+
+  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
+                "%d] for thread_data %p\n",
+                __kmp_gtid_from_thread(thread), cap, new_cap, thread_data));
+
+  kmp_taskdata_t **grown =
+      (kmp_taskdata_t **)__kmp_allocate(new_cap * sizeof(kmp_taskdata_t *));
+  // unroll the circular buffer: slot k of the new array gets the k-th entry
+  int src = thread_data->td.td_deque_head;
+  for (int dst = 0; dst < cap; ++dst) {
+    grown[dst] = thread_data->td.td_deque[src];
+    src = (src + 1) & TASK_DEQUE_MASK(thread_data->td);
+  }
+  __kmp_free(thread_data->td.td_deque);
+
+  thread_data->td.td_deque = grown;
+  thread_data->td.td_deque_size = new_cap;
+  thread_data->td.td_deque_head = 0;
+  thread_data->td.td_deque_tail = cap;
+}
+
+//  __kmp_push_task: Add a task to the deque of the thread identified by gtid
+static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+  kmp_task_team_t *task_team = thread->th.th_task_team;
+  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
+  kmp_thread_data_t *thread_data;
+  // Returns TASK_SUCCESSFULLY_PUSHED when queued, TASK_NOT_PUSHED otherwise.
+  KA_TRACE(20,
+           ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
+
+  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
+    // untied task needs to increment counter so that the task structure is not
+    // freed prematurely
+    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
+    KMP_DEBUG_USE_VAR(counter);
+    KA_TRACE(
+        20,
+        ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
+         gtid, counter, taskdata));
+  }
+
+  // The first check avoids building task_team thread data if serialized
+  if (taskdata->td_flags.task_serial) {
+    KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
+                  "TASK_NOT_PUSHED for task %p\n",
+                  gtid, taskdata));
+    return TASK_NOT_PUSHED;
+  }
+
+  // Now that serialized tasks have returned, we can assume that we are not in
+  // immediate exec mode
+  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
+  if (!KMP_TASKING_ENABLED(task_team)) {
+    __kmp_enable_tasking(task_team, thread);
+  }
+  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
+  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
+
+  // Find tasking deque specific to encountering thread
+  thread_data = &task_team->tt.tt_threads_data[tid];
+
+  // No lock needed since only owner can allocate
+  if (thread_data->td.td_deque == NULL) {
+    __kmp_alloc_task_deque(thread, thread_data);
+  }
+  // locked records whether td_deque_lock was already taken by the full check.
+  int locked = 0;
+  // Check if deque is full
+  if (TCR_4(thread_data->td.td_deque_ntasks) >=
+      TASK_DEQUE_SIZE(thread_data->td)) {
+    if (__kmp_enable_task_throttling &&
+        __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
+                              thread->th.th_current_task)) {
+      KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
+                    "TASK_NOT_PUSHED for task %p\n",
+                    gtid, taskdata));
+      return TASK_NOT_PUSHED;
+    } else {
+      __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
+      locked = 1;
+      // expand deque to push the task which is not allowed to execute
+      __kmp_realloc_task_deque(thread, thread_data);
+    }
+  }
+  // Lock the deque for the task push operation
+  if (!locked) {
+    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
+    // Need to recheck as we can get a proxy task from thread outside of OpenMP
+    if (TCR_4(thread_data->td.td_deque_ntasks) >=
+        TASK_DEQUE_SIZE(thread_data->td)) {
+      if (__kmp_enable_task_throttling &&
+          __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
+                                thread->th.th_current_task)) {
+        __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
+        KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
+                      "returning TASK_NOT_PUSHED for task %p\n",
+                      gtid, taskdata));
+        return TASK_NOT_PUSHED;
+      } else {
+        // expand deque to push the task which is not allowed to execute
+        __kmp_realloc_task_deque(thread, thread_data);
+      }
+    }
+  }
+  // Must have room since no thread can add tasks but calling thread
+  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
+                   TASK_DEQUE_SIZE(thread_data->td));
+
+  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
+      taskdata; // Push taskdata
+  // Wrap index.
+  thread_data->td.td_deque_tail =
+      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
+  TCW_4(thread_data->td.td_deque_ntasks,
+        TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
+
+  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
+                "task=%p ntasks=%d head=%u tail=%u\n",
+                gtid, taskdata, thread_data->td.td_deque_ntasks,
+                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
+  // The deque lock is held on every path that reaches this point.
+  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
+
+  return TASK_SUCCESSFULLY_PUSHED;
+}
+
+// __kmp_pop_current_task_from_thread: when a team ends, make the parent of the
+// thread's current task the current task again
+//
+// this_thr: thread structure whose current task is replaced by its parent.
+void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
+  kmp_taskdata_t *popped = this_thr->th.th_current_task;
+  kmp_taskdata_t *restored = popped->td_parent;
+
+  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
+                "this_thread=%p, curtask=%p, "
+                "curtask_parent=%p\n",
+                0, this_thr, popped, restored));
+
+  // Step one level up the task ancestry.
+  this_thr->th.th_current_task = restored;
+
+  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
+                "this_thread=%p, curtask=%p, "
+                "curtask_parent=%p\n",
+                0, this_thr, restored, restored->td_parent));
+}
+
+// __kmp_push_current_task_to_thread: set up current task in called thread for a
+// new team
+//
+// this_thr: thread structure to set up (must not be NULL)
+// team: team for implicit task data
+// tid: thread within team to set up
+//
+// For tid 0 the thread's previous current task is recorded as the parent of
+// implicit task 0 (skipped when implicit task 0 is already current). Every
+// other tid copies the parent stored in implicit task 0. Afterwards the
+// thread's current task is the team's implicit task for tid.
+void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
+                                       int tid) {
+  // Check the thread pointer before anything below dereferences it.
+  KMP_DEBUG_ASSERT(this_thr != NULL);
+
+  // current task of the thread is a parent of the new just created implicit
+  // tasks of new team
+  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
+                "curtask=%p "
+                "parent_task=%p\n",
+                tid, this_thr, this_thr->th.th_current_task,
+                team->t.t_implicit_task_taskdata[tid].td_parent));
+
+  if (tid == 0) {
+    // Only re-parent when implicit task 0 is not already the current task;
+    // otherwise it would become its own parent.
+    if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
+      team->t.t_implicit_task_taskdata[0].td_parent =
+          this_thr->th.th_current_task;
+      this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
+    }
+  } else {
+    // Other threads share the parent recorded in implicit task 0.
+    team->t.t_implicit_task_taskdata[tid].td_parent =
+        team->t.t_implicit_task_taskdata[0].td_parent;
+    this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
+  }
+
+  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
+                "curtask=%p "
+                "parent_task=%p\n",
+                tid, this_thr, this_thr->th.th_current_task,
+                team->t.t_implicit_task_taskdata[tid].td_parent));
+}
+
+// __kmp_task_start: bookkeeping performed when an explicit task begins to run
+//
+// gtid: global thread id of calling thread
+// task: task starting execution
+// current_task: task being suspended in favour of 'task'
+static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
+                             kmp_taskdata_t *current_task) {
+  kmp_info_t *thr = __kmp_threads[gtid];
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+
+  KA_TRACE(10,
+           ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
+            gtid, taskdata, current_task));
+
+  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
+
+  // The task that was running until now is suspended.
+  // TODO: GEH - make sure root team implicit task is initialized properly.
+  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
+  current_task->td_flags.executing = 0;
+
+#ifdef BUILD_TIED_TASK_STACK
+  // Tied tasks are additionally recorded on the task stack.
+  if (taskdata->td_flags.tiedness == TASK_TIED)
+    __kmp_push_task_stack(gtid, thr, taskdata);
+#endif /* BUILD_TIED_TASK_STACK */
+
+  // From here on the starting task is the thread's current task.
+  thr->th.th_current_task = taskdata;
+
+  // Only an untied task may already be marked started or executing.
+  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
+                   taskdata->td_flags.tiedness == TASK_UNTIED);
+  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
+                   taskdata->td_flags.tiedness == TASK_UNTIED);
+  taskdata->td_flags.started = 1;
+  taskdata->td_flags.executing = 1;
+  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
+  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
+
+  // GEH TODO: shouldn't we pass some sort of location identifier here?
+  // APT: yes, we will pass location here.
+  // need to store current thread state (in a thread or taskdata structure)
+  // before setting work_state, otherwise wrong state is set after end of task
+
+  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
+}
+
+#if OMPT_SUPPORT
+//------------------------------------------------------------------------------
+// __ompt_task_init:
+//   Reset the OMPT fields carried by a task. Only called after
+//   ompt_start_tool, so whether OMPT is enabled is already known.
+
+static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
+  // Callers already check the ompt_enabled condition before calling.
+  ompt_task_info_t &info = task->ompt_task_info;
+  const int runtime_frame_flags = ompt_frame_runtime | ompt_frame_framepointer;
+
+  info.task_data.value = 0;
+  info.frame.exit_frame = ompt_data_none;
+  info.frame.enter_frame = ompt_data_none;
+  info.frame.exit_frame_flags = runtime_frame_flags;
+  info.frame.enter_frame_flags = runtime_frame_flags;
+  info.ndeps = 0;
+  info.deps = NULL;
+}
+
+// __ompt_task_start:
+//   Report the switch from current_task to the starting task through the
+//   task-schedule callback and record current_task as its scheduling parent
+static inline void __ompt_task_start(kmp_task_t *task,
+                                     kmp_taskdata_t *current_task,
+                                     kmp_int32 gtid) {
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+  kmp_info_t *thr = __kmp_threads[gtid];
+
+  // A pending yield is reported once; the per-thread flag is cleared here.
+  ompt_task_status_t status = ompt_task_switch;
+  if (thr->th.ompt_thread_info.ompt_task_yielded) {
+    thr->th.ompt_thread_info.ompt_task_yielded = 0;
+    status = ompt_task_yield;
+  }
+
+  /* let OMPT know that we're about to run this task */
+  if (ompt_enabled.ompt_callback_task_schedule) {
+    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
+        &(current_task->ompt_task_info.task_data), status,
+        &(taskdata->ompt_task_info.task_data));
+  }
+  taskdata->ompt_task_info.scheduling_parent = current_task;
+}
+
+// __ompt_task_finish:
+//   Build and trigger the final task-schedule event of a task
+static inline void
+__ompt_task_finish(kmp_task_t *task, kmp_taskdata_t *resumed_task,
+                   ompt_task_status_t status = ompt_task_complete) {
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+
+  // A cancelled taskgroup overrides the status requested by the caller.
+  if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
+      taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
+    status = ompt_task_cancel;
+  }
+
+  if (!ompt_enabled.ompt_callback_task_schedule)
+    return;
+
+  // Task reported as the one being returned to: the explicit resumed task if
+  // given, else the recorded scheduling parent, else the parent task.
+  kmp_taskdata_t *next_task = resumed_task;
+  if (next_task == NULL)
+    next_task = taskdata->ompt_task_info.scheduling_parent;
+  if (next_task == NULL)
+    next_task = taskdata->td_parent;
+
+  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
+      &(taskdata->ompt_task_info.task_data), status,
+      &(next_task->ompt_task_info.task_data));
+}
+#endif
+
+// Shared implementation of __kmpc_omp_task_begin_if0. The 'ompt' template
+// argument selects whether the OMPT frame setup and callbacks are performed;
+// frame_address and return_address are only used on that path.
+template <bool ompt>
+static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
+                                               kmp_task_t *task,
+                                               void *frame_address,
+                                               void *return_address) {
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
+
+  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
+                "current_task=%p\n",
+                gtid, loc_ref, taskdata, current_task));
+
+  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
+    // Bump the untied counter so that the task structure is not freed
+    // prematurely. The "1 +" turns the value returned by KMP_ATOMIC_INC into
+    // the incremented count reported in the trace below.
+    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
+    KMP_DEBUG_USE_VAR(counter);
+    KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
+                  "incremented for task %p\n",
+                  gtid, counter, taskdata));
+  }
+
+  // Execute this task immediately, not deferred.
+  taskdata->td_flags.task_serial = 1;
+  __kmp_task_start(gtid, task, current_task);
+
+#if OMPT_SUPPORT
+  if (ompt) {
+    ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
+    ompt_task_info_t *task_info = &(taskdata->ompt_task_info);
+
+    // Link the frames only if the encountering task has no enter frame yet.
+    if (parent_info->frame.enter_frame.ptr == NULL) {
+      const int app_frame_flags =
+          ompt_frame_application | ompt_frame_framepointer;
+      task_info->frame.exit_frame.ptr = frame_address;
+      parent_info->frame.enter_frame.ptr = frame_address;
+      task_info->frame.exit_frame_flags = app_frame_flags;
+      parent_info->frame.enter_frame_flags = app_frame_flags;
+    }
+    if (ompt_enabled.ompt_callback_task_create) {
+      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
+          &(parent_info->task_data), &(parent_info->frame),
+          &(task_info->task_data),
+          ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
+          return_address);
+    }
+    __ompt_task_start(task, current_task, gtid);
+  }
+#endif // OMPT_SUPPORT
+
+  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
+                loc_ref, taskdata));
+}
+
+#if OMPT_SUPPORT
+OMPT_NOINLINE
+static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
+                                           kmp_task_t *task,
+                                           void *frame_address,
+                                           void *return_address) {
+  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
+                                           return_address);
+}
+#endif // OMPT_SUPPORT
+
+// __kmpc_omp_task_begin_if0: report that a given serialized task has started
+// execution; when ompt_enabled.enabled is set, the OMPT variant above is used
+// with values from OMPT_GET_FRAME_ADDRESS(1) and OMPT_LOAD_RETURN_ADDRESS(gtid)
+// loc_ref: source location information; points to beginning of task block.
+// gtid: global thread number.
+// task: task thunk for the started task.
+void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
+                               kmp_task_t *task) {
+#if OMPT_SUPPORT
+  if (UNLIKELY(ompt_enabled.enabled)) {
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+    __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
+                                   OMPT_GET_FRAME_ADDRESS(1),
+                                   OMPT_LOAD_RETURN_ADDRESS(gtid));
+    return;
+  }
+#endif // OMPT_SUPPORT
+  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
+}
+
+#ifdef TASK_UNUSED
+// __kmpc_omp_task_begin: report that a given task has started execution; the
+// thread's current task is the one that gets suspended.
+// NEVER GENERATED BY COMPILER, DEPRECATED!!!
+void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
+  kmp_taskdata_t *suspended = __kmp_threads[gtid]->th.th_current_task;
+
+  KA_TRACE(10, ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p "
+                "current_task=%p\n",
+                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), suspended));
+
+  __kmp_task_start(gtid, task, suspended);
+
+  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
+                loc_ref, KMP_TASK_TO_TASKDATA(task)));
+}
+#endif // TASK_UNUSED
+
+// __kmp_free_task: mark a finished explicit task as freed and release its
+// memory block (task data together with the space for shareds)
+//
+// gtid: Global thread ID of calling thread
+// taskdata: task to free
+// thread: thread data structure of caller
+static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
+                            kmp_info_t *thread) {
+  KA_TRACE(30,
+           ("__kmp_free_task: T#%d freeing data from task %p\n", gtid, taskdata));
+
+  // Sanity checks: the task must be explicit, finished, not running, not yet
+  // freed, and must have no incomplete children.
+  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
+  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
+  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
+  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
+  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
+                   taskdata->td_flags.task_serial == 1);
+  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
+
+  taskdata->td_flags.freed = 1;
+  ANNOTATE_HAPPENS_BEFORE(taskdata);
+
+  // Hand the block back to the allocator selected at build time.
+#if USE_FAST_MEMORY
+  __kmp_fast_free(thread, taskdata);
+#else /* ! USE_FAST_MEMORY */
+  __kmp_thread_free(thread, taskdata);
+#endif
+
+  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
+}
+
+// __kmp_free_task_and_ancestors: free the current task, then walk up the
+// parent chain freeing each explicit ancestor left with no allocated children
+//
+// gtid: Global thread ID of calling thread
+// taskdata: task to free
+// thread: thread data structure of caller
+static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
+                                          kmp_taskdata_t *taskdata,
+                                          kmp_info_t *thread) {
+  // Proxy tasks must always be allowed to free their parents
+  // because they can be run in background even in serial mode.
+  kmp_int32 team_serial =
+      (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
+      !taskdata->td_flags.proxy;
+  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
+  // Predecrement simulated by "- 1" calculation
+  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
+  KMP_DEBUG_ASSERT(children >= 0);
+
+  // Now, go up the ancestor tree to see if any ancestors can now be freed.
+  while (children == 0) {
+    kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
+
+    KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
+                  "and freeing itself\n",
+                  gtid, taskdata));
+
+    // --- Deallocate this task (an ancestor on later iterations) ---
+    __kmp_free_task(gtid, taskdata, thread);
+    // The parent was saved above because taskdata has just been freed.
+    taskdata = parent_taskdata;
+
+    if (team_serial)
+      return; // serialized and not a proxy: stop after freeing this one task
+    // Stop checking ancestors at implicit task instead of walking up ancestor
+    // tree to avoid premature deallocation of ancestors.
+    if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
+      if (taskdata->td_dephash) { // do we need to cleanup dephash?
+        int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
+        kmp_tasking_flags_t flags_old = taskdata->td_flags;
+        if (children == 0 && flags_old.complete == 1) {
+          kmp_tasking_flags_t flags_new = flags_old;
+          flags_new.complete = 0; // clearing 'complete' via CAS claims cleanup
+          if (KMP_COMPARE_AND_STORE_ACQ32(
+                  RCAST(kmp_int32 *, &taskdata->td_flags),
+                  *RCAST(kmp_int32 *, &flags_old),
+                  *RCAST(kmp_int32 *, &flags_new))) {
+            KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
+                           "dephash of implicit task %p\n",
+                           gtid, taskdata));
+            // cleanup dephash of finished implicit task
+            __kmp_dephash_free_entries(thread, taskdata->td_dephash);
+          }
+        }
+      }
+      return;
+    }
+    // Predecrement simulated by "- 1" calculation
+    children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
+    KMP_DEBUG_ASSERT(children >= 0);
+  }
+
+  KA_TRACE(
+      20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
+           "not freeing it yet\n",
+           gtid, taskdata, children));
+}
+
+// __kmp_task_finish: bookkeeping to do when an explicit task finishes execution
+// (when 'ompt' is true and OMPT_SUPPORT is set, __ompt_task_finish is called)
+// gtid: global thread ID for calling thread
+// task: task to be finished
+// resumed_task: task to be resumed.  (may be NULL if task is serialized)
+template <bool ompt>
+static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
+                              kmp_taskdata_t *resumed_task) {
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_task_team_t *task_team =
+      thread->th.th_task_team; // might be NULL for serial teams...
+  kmp_int32 children = 0;
+
+  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
+                "task %p\n",
+                gtid, taskdata, resumed_task));
+
+  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
+
+// Pop task from stack if tied
+#ifdef BUILD_TIED_TASK_STACK
+  if (taskdata->td_flags.tiedness == TASK_TIED) {
+    __kmp_pop_task_stack(gtid, thread, taskdata);
+  }
+#endif /* BUILD_TIED_TASK_STACK */
+
+  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
+    // untied task needs to check the counter so that the task structure is not
+    // freed prematurely
+    kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
+    KA_TRACE(
+        20,
+        ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
+         gtid, counter, taskdata));
+    if (counter > 0) {
+      // untied task is not done, to be continued possibly by other thread, do
+      // not free it now
+      if (resumed_task == NULL) {
+        KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
+        resumed_task = taskdata->td_parent; // In a serialized task, the resumed
+        // task is the parent
+      }
+      thread->th.th_current_task = resumed_task; // restore current_task
+      resumed_task->td_flags.executing = 1; // resume previous task
+      KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
+                    "resuming task %p\n",
+                    gtid, taskdata, resumed_task));
+      return;
+    }
+  }
+#if OMPT_SUPPORT
+  if (ompt)
+    __ompt_task_finish(task, resumed_task);
+#endif
+
+  // Check mutexinoutset dependencies, release locks
+  kmp_depnode_t *node = taskdata->td_depnode;
+  if (node && (node->dn.mtx_num_locks < 0)) {
+    // negative num_locks means all locks were acquired
+    node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
+    for (int i = node->dn.mtx_num_locks - 1; i >= 0; --i) {
+      KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
+      __kmp_release_lock(node->dn.mtx_locks[i], gtid);
+    }
+  }
+
+  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
+  bool detach = false;
+  if (taskdata->td_flags.detachable == TASK_DETACHABLE) {
+    if (taskdata->td_allow_completion_event.type ==
+        KMP_EVENT_ALLOW_COMPLETION) {
+      // event hasn't been fulfilled yet. Try to detach task.
+      __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
+      if (taskdata->td_allow_completion_event.type ==
+          KMP_EVENT_ALLOW_COMPLETION) { // re-checked while holding the lock
+        taskdata->td_flags.proxy = TASK_PROXY; // proxify!
+        detach = true;
+      }
+      __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
+    }
+  }
+  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
+  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
+
+  if (!detach) {
+    taskdata->td_flags.complete = 1; // mark the task as completed
+
+    // Only need to keep track of count if team parallel and tasking not
+    // serialized
+    if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
+      // Predecrement simulated by "- 1" calculation
+      children =
+          KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
+      KMP_DEBUG_ASSERT(children >= 0);
+      if (taskdata->td_taskgroup)
+        KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
+      __kmp_release_deps(gtid, taskdata);
+    } else if (task_team && task_team->tt.tt_found_proxy_tasks) {
+      // if we found proxy tasks there could exist a dependency chain
+      // with the proxy task as origin
+      __kmp_release_deps(gtid, taskdata);
+    }
+  }
+
+  // td_flags.executing must be marked as 0 after __kmp_release_deps has been
+  // called. Otherwise, if a task is executed immediately from the release_deps
+  // code, the flag will be reset to 1 again by this same function
+  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
+  taskdata->td_flags.executing = 0; // suspend the finishing task
+
+  KA_TRACE(
+      20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
+           gtid, taskdata, children));
+
+  /* If the tasks' destructor thunk flag has been set, we need to invoke the
+     destructor thunk that has been generated by the compiler. The code is
+     placed here, since at this point other tasks might have been released
+     hence overlapping the destructor invocations with some other work in the
+     released tasks.  The OpenMP spec is not specific on when the destructors
+     are invoked, so we should be free to choose. */
+  if (taskdata->td_flags.destructors_thunk) {
+    kmp_routine_entry_t destr_thunk = task->data1.destructors;
+    KMP_ASSERT(destr_thunk);
+    destr_thunk(gtid, task);
+  }
+
+  // bookkeeping for resuming task:
+  // GEH - note tasking_ser => task_serial
+  KMP_DEBUG_ASSERT(
+      (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
+      taskdata->td_flags.task_serial);
+  if (taskdata->td_flags.task_serial) {
+    if (resumed_task == NULL) {
+      resumed_task = taskdata->td_parent; // In a serialized task, the resumed
+      // task is the parent
+    }
+  } else {
+    KMP_DEBUG_ASSERT(resumed_task !=
+                     NULL); // verify that resumed task is passed as argument
+  }
+
+  // Free this task and then ancestor tasks if they have no children.
+  // Restore th_current_task first as suggested by John:
+  // johnmc: if an asynchronous inquiry peers into the runtime system
+  // it doesn't see the freed task as the current task.
+  thread->th.th_current_task = resumed_task;
+  if (!detach) // a detached task is not freed here
+    __kmp_free_task_and_ancestors(gtid, taskdata, thread);
+
+  // TODO: GEH - make sure root team implicit task is initialized properly.
+  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
+  resumed_task->td_flags.executing = 1; // resume previous task
+
+  KA_TRACE(
+      10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
+           gtid, taskdata, resumed_task));
+
+  return;
+}
+
+// Shared implementation of __kmpc_omp_task_complete_if0. The 'ompt' template
+// argument selects whether the OMPT enter frame is reset after the task has
+// been finished.
+template <bool ompt>
+static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
+                                                  kmp_int32 gtid,
+                                                  kmp_task_t *task) {
+  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
+                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
+
+  // Passing NULL lets __kmp_task_finish pick the task to resume.
+  __kmp_task_finish<ompt>(gtid, task, NULL);
+
+  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
+                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
+
+#if OMPT_SUPPORT
+  if (ompt) {
+    // Reset the enter frame reported by __ompt_get_task_info_internal.
+    ompt_frame_t *frame;
+    __ompt_get_task_info_internal(0, NULL, NULL, &frame, NULL, NULL);
+    frame->enter_frame = ompt_data_none;
+    frame->enter_frame_flags = ompt_frame_runtime | ompt_frame_framepointer;
+  }
+#endif
+}
+
+#if OMPT_SUPPORT
+OMPT_NOINLINE
+void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
+                                       kmp_task_t *task) {
+  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
+}
+#endif // OMPT_SUPPORT
+
+// __kmpc_omp_task_complete_if0: report that a task has completed execution
+// (dispatches to the OMPT variant above when ompt_enabled.enabled is set)
+// loc_ref: source location information; points to end of task block.
+// gtid: global thread number.
+// task: task thunk for the completed task.
+void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
+                                  kmp_task_t *task) {
+#if OMPT_SUPPORT
+  if (UNLIKELY(ompt_enabled.enabled)) {
+    __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
+    return;
+  }
+#endif // OMPT_SUPPORT
+  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
+}
+
+#ifdef TASK_UNUSED
+// __kmpc_omp_task_complete: report that a task has completed execution; runs
+// the non-OMPT variant of __kmp_task_finish.
+// NEVER GENERATED BY COMPILER, DEPRECATED!!!
+void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
+                              kmp_task_t *task) {
+  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
+                loc_ref, KMP_TASK_TO_TASKDATA(task)));
+
+  // Not sure how to find task to resume, so NULL is passed.
+  __kmp_task_finish<false>(gtid, task, NULL);
+
+  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
+                loc_ref, KMP_TASK_TO_TASKDATA(task)));
+}
+#endif // TASK_UNUSED
+
+// __kmp_init_implicit_task: (re)initialize the fields of the implicit task
+// that belongs to a given thread of a team
+//
+// loc_ref:  reference to source location of parallel region
+// this_thr:  thread data structure corresponding to implicit task
+// team: team for this_thr
+// tid: thread id of given thread within team
+// set_curr_task: TRUE if need to push current task to thread
+// NOTE: Routine does not set up the implicit task ICVS.  This is assumed to
+// have already been done elsewhere.
+// TODO: Get better loc_ref.  Value passed in may be NULL
+void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
+                              kmp_team_t *team, int tid, int set_curr_task) {
+  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
+
+  KF_TRACE(10, ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, "
+                "reinit=%s\n",
+                tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
+
+  // Identity and taskwait bookkeeping.
+  task->td_task_id = KMP_GEN_TASK_ID();
+  task->td_team = team;
+  //    task->td_parent   = NULL;  // fix for CQ230101 (broken parent task info
+  //    in debugger)
+  task->td_ident = loc_ref;
+  task->td_taskwait_ident = NULL;
+  task->td_taskwait_counter = 0;
+  task->td_taskwait_thread = 0;
+
+  // Kind of task: tied, implicit, not a proxy.
+  task->td_flags.tiedness = TASK_TIED;
+  task->td_flags.tasktype = TASK_IMPLICIT;
+  task->td_flags.proxy = TASK_FULL;
+
+  // Serialization flags. All implicit tasks are executed immediately, not
+  // deferred.
+  task->td_flags.task_serial = 1;
+  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
+  task->td_flags.team_serial = team->t.t_serialized ? 1 : 0;
+
+  // Execution state: started and running, neither complete nor freed.
+  task->td_flags.started = 1;
+  task->td_flags.executing = 1;
+  task->td_flags.complete = 0;
+  task->td_flags.freed = 0;
+
+  task->td_depnode = NULL;
+  task->td_last_tied = task;
+  task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
+
+  if (!set_curr_task) {
+    // Re-initialization: the child counters must already be zero.
+    KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
+    KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
+  } else {
+    // only do this init first time thread is created
+    KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
+    // Not used: don't need to deallocate implicit task
+    KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
+    task->td_taskgroup = NULL; // An implicit task does not have taskgroup
+    task->td_dephash = NULL;
+    __kmp_push_current_task_to_thread(this_thr, team, tid);
+  }
+
+#if OMPT_SUPPORT
+  if (UNLIKELY(ompt_enabled.enabled))
+    __ompt_task_init(task, tid);
+#endif
+
+  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
+                team, task));
+}
+
+// __kmp_finish_implicit_task: Release resources associated to implicit tasks
+// at the end of parallel regions. Some resources are kept for reuse in the next
+// parallel region: only the entries of the dependence hash are freed here.
+//
+// thread:  thread data structure corresponding to implicit task
+void __kmp_finish_implicit_task(kmp_info_t *thread) {
+  kmp_taskdata_t *task = thread->th.th_current_task;
+  if (!task->td_dephash)
+    return; // no dependence hash, nothing to clean up
+
+  task->td_flags.complete = 1;
+  int children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
+  kmp_tasking_flags_t observed = task->td_flags;
+  if (children != 0 || observed.complete != 1)
+    return;
+
+  // Clear the 'complete' bit with a compare-and-store; the dephash entries are
+  // freed only if that store succeeds.
+  kmp_tasking_flags_t desired = observed;
+  desired.complete = 0;
+  if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
+                                  *RCAST(kmp_int32 *, &observed),
+                                  *RCAST(kmp_int32 *, &desired))) {
+    KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
+                   "dephash of implicit task %p\n",
+                   thread->th.th_info.ds.ds_gtid, task));
+    __kmp_dephash_free_entries(thread, task->td_dephash);
+  }
+}
+
+// __kmp_free_implicit_task: Release resources associated to implicit tasks
+// when these tasks are destroyed (frees the dependence hash, if any)
+//
+// thread:  thread data structure corresponding to implicit task
+void __kmp_free_implicit_task(kmp_info_t *thread) {
+  kmp_taskdata_t *task = thread->th.th_current_task;
+  if (!task || !task->td_dephash)
+    return; // no current task or no dependence hash to release
+
+  __kmp_dephash_free(thread, task->td_dephash);
+  task->td_dephash = NULL;
+}
+
+// Round up a size to a multiple of val, where val is a power of two. Used to
+// insert padding between structures co-allocated using a single malloc() call.
+// If adding val would overflow, the size rounded down to a multiple of val is
+// returned instead.
+static size_t __kmp_round_up_to_val(size_t size, size_t val) {
+  const size_t low_bits = val - 1;
+  if ((size & low_bits) == 0)
+    return size; // already aligned
+
+  size_t rounded = size & ~low_bits; // drop the bits below the alignment
+  if (rounded <= KMP_SIZE_T_MAX - val)
+    rounded += val; // Round up if there is no overflow.
+  return rounded;
+} // __kmp_round_up_to_val
+
+// __kmp_task_alloc: Allocate the taskdata and task data structures for a task
+//
+// loc_ref: source location information
+// gtid: global thread number.
+// flags: include tiedness & task type (explicit vs. implicit) of the ''new''
+// task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
+// sizeof_kmp_task_t:  Size in bytes of kmp_task_t data structure including
+// private vars accessed in task.
+// sizeof_shareds:  Size in bytes of array of pointers to shared vars accessed
+// in task.
+// task_entry: Pointer to task code entry point generated by compiler.
+// returns: a pointer to the allocated kmp_task_t structure (task).
+kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
+                             kmp_tasking_flags_t *flags,
+                             size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+                             kmp_routine_entry_t task_entry) {
+  kmp_task_t *task;
+  kmp_taskdata_t *taskdata;
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_team_t *team = thread->th.th_team;
+  kmp_taskdata_t *parent_task = thread->th.th_current_task;
+  size_t shareds_offset;
+
+  // Run middle initialization if it has not been done yet.
+  if (!TCR_4(__kmp_init_middle))
+    __kmp_middle_initialize();
+
+  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
+                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
+                gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
+                sizeof_shareds, task_entry));
+
+  // A task created by a final task is itself final. Note that this writes
+  // through the caller-supplied flags pointer. The merged_if0 test below has an
+  // empty body and therefore has no effect.
+  if (parent_task->td_flags.final) {
+    if (flags->merged_if0) {
+    }
+    flags->final = 1;
+  }
+  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
+    // Untied task encountered causes the TSC algorithm to check entire deque of
+    // the victim thread. If no untied task encountered, then checking the head
+    // of the deque should be enough.
+    KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
+  }
+
+  // Detachable tasks are not proxy tasks yet but could be in the future. Doing
+  // the tasking setup when that happens is too late.
+  if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) {
+    // Proxy tasks are forced to be untied and merged_if0.
+    if (flags->proxy == TASK_PROXY) {
+      flags->tiedness = TASK_UNTIED;
+      flags->merged_if0 = 1;
+    }
+    /* are we running in a sequential parallel or tskm_immediate_exec... we need
+       tasking support enabled */
+    if ((thread->th.th_task_team) == NULL) {
+      /* This should only happen if the team is serialized
+          setup a task team and propagate it to the thread */
+      KMP_DEBUG_ASSERT(team->t.t_serialized);
+      KA_TRACE(30,
+               ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
+                gtid));
+      __kmp_task_team_setup(
+          thread, team,
+          1); // 1 indicates setup the current team regardless of nthreads
+      thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
+    }
+    kmp_task_team_t *task_team = thread->th.th_task_team;
+
+    /* tasking must be enabled now as the task might not be pushed */
+    if (!KMP_TASKING_ENABLED(task_team)) {
+      KA_TRACE(
+          30,
+          ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
+      __kmp_enable_tasking(task_team, thread);
+      kmp_int32 tid = thread->th.th_info.ds.ds_tid;
+      kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
+      // No lock needed since only owner can allocate
+      if (thread_data->td.td_deque == NULL) {
+        __kmp_alloc_task_deque(thread, thread_data);
+      }
+    }
+
+    // Record on the task team that a proxy/detachable task has been seen; the
+    // flag is only written when it is not already set.
+    if (task_team->tt.tt_found_proxy_tasks == FALSE)
+      TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
+  }
+
+  // Calculate shared structure offset including padding after kmp_task_t struct
+  // to align pointers in shared struct
+  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
+  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));
+
+  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
+  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
+                shareds_offset));
+  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
+                sizeof_shareds));
+
+// Avoid double allocation here by combining shareds with taskdata
+// NOTE(review): the allocation result is used without a NULL check; presumably
+// the allocators do not return on failure -- confirm.
+#if USE_FAST_MEMORY
+  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
+                                                               sizeof_shareds);
+#else /* ! USE_FAST_MEMORY */
+  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
+                                                               sizeof_shareds);
+#endif /* USE_FAST_MEMORY */
+  ANNOTATE_HAPPENS_AFTER(taskdata);
+
+  task = KMP_TASKDATA_TO_TASK(taskdata);
+
+// Make sure task & taskdata are aligned appropriately
+#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
+  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
+  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
+#else
+  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
+  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
+#endif
+  if (sizeof_shareds > 0) {
+    // Avoid double allocation here by combining shareds with taskdata
+    task->shareds = &((char *)taskdata)[shareds_offset];
+    // Make sure shareds struct is aligned to pointer size
+    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
+                     0);
+  } else {
+    task->shareds = NULL;
+  }
+  task->routine = task_entry;
+  task->part_id = 0; // AC: Always start with 0 part id
+
+  // Identity and ancestry of the new task.
+  taskdata->td_task_id = KMP_GEN_TASK_ID();
+  taskdata->td_team = team;
+  taskdata->td_alloc_thread = thread;
+  taskdata->td_parent = parent_task;
+  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
+  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
+  taskdata->td_ident = loc_ref;
+  taskdata->td_taskwait_ident = NULL;
+  taskdata->td_taskwait_counter = 0;
+  taskdata->td_taskwait_thread = 0;
+  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
+  // avoid copying icvs for proxy tasks
+  if (flags->proxy == TASK_FULL)
+    copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
+
+  // Copy the (possibly adjusted) input flags into the task descriptor.
+  taskdata->td_flags.tiedness = flags->tiedness;
+  taskdata->td_flags.final = flags->final;
+  taskdata->td_flags.merged_if0 = flags->merged_if0;
+  taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
+  taskdata->td_flags.proxy = flags->proxy;
+  taskdata->td_flags.detachable = flags->detachable;
+  taskdata->td_task_team = thread->th.th_task_team;
+  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
+  taskdata->td_flags.tasktype = TASK_EXPLICIT;
+
+  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
+  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
+
+  // GEH - TODO: fix this to copy parent task's value of team_serial flag
+  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
+
+  // GEH - Note we serialize the task if the team is serialized to make sure
+  // implicit parallel region tasks are not left until program termination to
+  // execute. Also, it helps locality to execute immediately.
+
+  taskdata->td_flags.task_serial =
+      (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
+       taskdata->td_flags.tasking_ser);
+
+  // Lifecycle flags all start cleared.
+  taskdata->td_flags.started = 0;
+  taskdata->td_flags.executing = 0;
+  taskdata->td_flags.complete = 0;
+  taskdata->td_flags.freed = 0;
+
+  taskdata->td_flags.native = flags->native;
+
+  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
+  // start at one because counts current task and children
+  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
+  taskdata->td_taskgroup =
+      parent_task->td_taskgroup; // task inherits taskgroup from the parent task
+  taskdata->td_dephash = NULL;
+  taskdata->td_depnode = NULL;
+  if (flags->tiedness == TASK_UNTIED)
+    taskdata->td_last_tied = NULL; // will be set when the task is scheduled
+  else
+    taskdata->td_last_tied = taskdata;
+  taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
+#if OMPT_SUPPORT
+  if (UNLIKELY(ompt_enabled.enabled))
+    __ompt_task_init(taskdata, gtid);
+#endif
+// Only need to keep track of child task counts if team parallel and tasking not
+// serialized or if it is a proxy or detachable task
+  if (flags->proxy == TASK_PROXY ||
+      flags->detachable == TASK_DETACHABLE ||
+      !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
+  {
+    KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
+    if (parent_task->td_taskgroup)
+      KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
+    // Only need to keep track of allocated child tasks for explicit tasks since
+    // implicit not deallocated
+    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
+      KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
+    }
+  }
+
+  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
+                gtid, taskdata, taskdata->td_parent));
+  ANNOTATE_HAPPENS_BEFORE(task);
+
+  return task;
+}
+
+// __kmpc_omp_task_alloc: entry point that allocates a task thunk.
+//
+// loc_ref: source location, forwarded to __kmp_task_alloc()
+// gtid: Global Thread ID of the encountering thread
+// flags: tasking flags packed into a kmp_int32
+// sizeof_kmp_task_t: size in bytes of the kmp_task_t part of the allocation
+// sizeof_shareds: size in bytes of the shareds area
+// task_entry: task routine stored in the new task
+// Returns: the kmp_task_t pointer produced by __kmp_task_alloc().
+kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
+                                  kmp_int32 flags, size_t sizeof_kmp_task_t,
+                                  size_t sizeof_shareds,
+                                  kmp_routine_entry_t task_entry) {
+  // View the by-value flags word as the tasking bit-field structure so that
+  // individual flags can be read and adjusted in place.
+  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
+
+  // Tasks allocated through this entry point are never marked native; all
+  // other runtime flags are set up by __kmp_task_alloc().
+  input_flags->native = FALSE;
+
+  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
+                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
+                gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
+                input_flags->proxy ? "proxy" : "",
+                input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
+                sizeof_shareds, task_entry));
+
+  kmp_task_t *retval =
+      __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
+                       sizeof_shareds, task_entry);
+
+  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
+
+  return retval;
+}
+
+// __kmpc_omp_target_task_alloc: allocate a task thunk for a target task.
+//
+// Forwards every argument except device_id to __kmpc_omp_task_alloc() and
+// returns its result. device_id is accepted but not used by this function.
+kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
+                                         kmp_int32 flags,
+                                         size_t sizeof_kmp_task_t,
+                                         size_t sizeof_shareds,
+                                         kmp_routine_entry_t task_entry,
+                                         kmp_int64 device_id) {
+  return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
+                               sizeof_shareds, task_entry);
+}
+
+/*!
+@ingroup TASKING
+@param loc_ref location of the original task directive
+@param gtid Global Thread ID of encountering thread
+@param new_task task thunk allocated by __kmpc_omp_task_alloc() for the
+''new task''
+@param naffins Number of affinity items
+@param affin_list List of affinity items
+@return Returns non-zero if registering affinity information was not
+successful. Returns 0 if registration was successful.
+
+This entry registers the affinity information attached to a task with the task
+thunk structure kmp_taskdata_t.
+
+Note: the body below is a stub. It does not read any of its arguments and
+always returns 0.
+*/
+kmp_int32
+__kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
+                                  kmp_task_t *new_task, kmp_int32 naffins,
+                                  kmp_task_affinity_info_t *affin_list) {
+  return 0;
+}
+
+//  __kmp_invoke_task: invoke the specified task
+//
+// gtid: global thread ID of caller
+// task: the task to invoke
+// current_task: the task to resume after task invocation
+//
+// A proxy task that is already marked complete only runs its bottom-half
+// finish. Otherwise the task routine is called unless cancellation of its
+// taskgroup or of the team's parallel region has been requested, in which
+// case the task is discarded. Non-proxy tasks are bracketed by
+// __kmp_task_start() and __kmp_task_finish().
+static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
+                              kmp_taskdata_t *current_task) {
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+  // NOTE(review): thread is assigned only inside conditional branches below
+  // (OMPT enabled, cancellation enabled, ITT task timing) and is later
+  // dereferenced under matching conditions; keep those conditions in sync.
+  kmp_info_t *thread;
+  int discard = 0 /* false */;
+  KA_TRACE(
+      30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
+           gtid, taskdata, current_task));
+  KMP_DEBUG_ASSERT(task);
+  if (taskdata->td_flags.proxy == TASK_PROXY &&
+      taskdata->td_flags.complete == 1) {
+    // This is a proxy task that was already completed but it needs to run
+    // its bottom-half finish
+    KA_TRACE(
+        30,
+        ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
+         gtid, taskdata));
+
+    __kmp_bottom_half_finish_proxy(gtid, task);
+
+    KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
+                  "proxy task %p, resuming task %p\n",
+                  gtid, taskdata, current_task));
+
+    return;
+  }
+
+#if OMPT_SUPPORT
+  // For untied tasks, the first task executed only calls __kmpc_omp_task and
+  // does not execute code.
+  ompt_thread_info_t oldInfo;
+  if (UNLIKELY(ompt_enabled.enabled)) {
+    // Store the threads states and restore them after the task
+    thread = __kmp_threads[gtid];
+    oldInfo = thread->th.ompt_thread_info;
+    thread->th.ompt_thread_info.wait_id = 0;
+    thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
+                                            ? ompt_state_work_serial
+                                            : ompt_state_work_parallel;
+    taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+  }
+#endif
+
+  // Proxy tasks are not handled by the runtime
+  if (taskdata->td_flags.proxy != TASK_PROXY) {
+    ANNOTATE_HAPPENS_AFTER(task);
+    __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
+  }
+
+  // TODO: cancel tasks if the parallel region has also been cancelled
+  // TODO: check if this sequence can be hoisted above __kmp_task_start
+  // if cancellation has been enabled for this run ...
+  if (__kmp_omp_cancellation) {
+    thread = __kmp_threads[gtid];
+    kmp_team_t *this_team = thread->th.th_team;
+    kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
+    if ((taskgroup && taskgroup->cancel_request) ||
+        (this_team->t.t_cancel_request == cancel_parallel)) {
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+      ompt_data_t *task_data;
+      if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
+        __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
+        ompt_callbacks.ompt_callback(ompt_callback_cancel)(
+            task_data,
+            ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
+                                                      : ompt_cancel_parallel) |
+                ompt_cancel_discarded_task,
+            NULL);
+      }
+#endif
+      KMP_COUNT_BLOCK(TASK_cancelled);
+      // this task belongs to a task group and we need to cancel it
+      discard = 1 /* true */;
+    }
+  }
+
+  // Invoke the task routine and pass in relevant data.
+  // Thunks generated by gcc take a different argument list.
+  if (!discard) {
+    // An untied task takes over the last-tied task recorded on the task it
+    // is being invoked from.
+    if (taskdata->td_flags.tiedness == TASK_UNTIED) {
+      taskdata->td_last_tied = current_task->td_last_tied;
+      KMP_DEBUG_ASSERT(taskdata->td_last_tied);
+    }
+#if KMP_STATS_ENABLED
+    KMP_COUNT_BLOCK(TASK_executed);
+    // Select the partitioned timer according to the current thread state.
+    switch (KMP_GET_THREAD_STATE()) {
+    case FORK_JOIN_BARRIER:
+      KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
+      break;
+    case PLAIN_BARRIER:
+      KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
+      break;
+    case TASKYIELD:
+      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
+      break;
+    case TASKWAIT:
+      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
+      break;
+    case TASKGROUP:
+      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
+      break;
+    default:
+      KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
+      break;
+    }
+#endif // KMP_STATS_ENABLED
+
+// OMPT task begin
+#if OMPT_SUPPORT
+    if (UNLIKELY(ompt_enabled.enabled))
+      __ompt_task_start(task, current_task, gtid);
+#endif
+
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+    // cur_time is only written and read when kmp_itt_count_task is non-zero.
+    kmp_uint64 cur_time;
+    kmp_int32 kmp_itt_count_task =
+        __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
+        current_task->td_flags.tasktype == TASK_IMPLICIT;
+    if (kmp_itt_count_task) {
+      thread = __kmp_threads[gtid];
+      // Time outer level explicit task on barrier for adjusting imbalance time
+      if (thread->th.th_bar_arrive_time)
+        cur_time = __itt_get_timestamp();
+      else
+        kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
+    }
+#endif
+
+#ifdef KMP_GOMP_COMPAT
+    // Native thunks are called with the shareds pointer as their only argument.
+    if (taskdata->td_flags.native) {
+      ((void (*)(void *))(*(task->routine)))(task->shareds);
+    } else
+#endif /* KMP_GOMP_COMPAT */
+    {
+      (*(task->routine))(gtid, task);
+    }
+    KMP_POP_PARTITIONED_TIMER();
+
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+    if (kmp_itt_count_task) {
+      // Barrier imbalance - adjust arrive time with the task duration
+      thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
+    }
+#endif
+
+  }
+
+
+  // Proxy tasks are not handled by the runtime
+  if (taskdata->td_flags.proxy != TASK_PROXY) {
+    ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
+#if OMPT_SUPPORT
+    if (UNLIKELY(ompt_enabled.enabled)) {
+      // Restore the OMPT thread info saved before the task was started.
+      thread->th.ompt_thread_info = oldInfo;
+      if (taskdata->td_flags.tiedness == TASK_TIED) {
+        taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
+      }
+      __kmp_task_finish<true>(gtid, task, current_task);
+    } else
+#endif
+      __kmp_task_finish<false>(gtid, task, current_task);
+  }
+
+  KA_TRACE(
+      30,
+      ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
+       gtid, taskdata, current_task));
+  return;
+}
+
+// __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
+//
+// loc_ref: location of original task pragma (only used in trace output)
+// gtid: Global Thread ID of encountering thread
+// new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
+// Returns:
+//    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
+//    be resumed later.
+//    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
+//    resumed later.
+//    This implementation always returns TASK_CURRENT_NOT_QUEUED.
+kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
+                                kmp_task_t *new_task) {
+  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
+
+  KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
+                loc_ref, new_taskdata));
+
+#if OMPT_SUPPORT
+  // Start from NULL so that the frame reset at the end of this function is
+  // skipped whenever no parent was recorded here, matching __kmpc_omp_task().
+  kmp_taskdata_t *parent = NULL;
+  if (UNLIKELY(ompt_enabled.enabled)) {
+    parent = new_taskdata->td_parent;
+    if (ompt_enabled.ompt_callback_task_create) {
+      // Placeholder task data handed to the callback when there is no parent.
+      ompt_data_t task_data = ompt_data_none;
+      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
+          parent ? &(parent->ompt_task_info.task_data) : &task_data,
+          parent ? &(parent->ompt_task_info.frame) : NULL,
+          &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
+          OMPT_GET_RETURN_ADDRESS(0));
+    }
+  }
+#endif
+
+  /* Should we execute the new task or queue it? For now, let's just always try
+     to queue it.  If the queue fills up, then we'll execute it.  */
+
+  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
+  { // Execute this task immediately
+    kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
+    new_taskdata->td_flags.task_serial = 1;
+    __kmp_invoke_task(gtid, new_task, current_task);
+  }
+
+  KA_TRACE(
+      10,
+      ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
+       "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
+       gtid, loc_ref, new_taskdata));
+
+  ANNOTATE_HAPPENS_BEFORE(new_task);
+#if OMPT_SUPPORT
+  // Only touch the parent's frame when a non-NULL parent was captured above;
+  // the callback code already treats a NULL parent as possible.
+  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
+    parent->ompt_task_info.frame.enter_frame = ompt_data_none;
+  }
+#endif
+  return TASK_CURRENT_NOT_QUEUED;
+}
+
+// __kmp_omp_task: Schedule a non-thread-switchable task for execution
+//
+// gtid: Global Thread ID of encountering thread
+// new_task: the task thunk to schedule
+// serialize_immediate: when TRUE, a task that ends up being executed
+// immediately is first marked as serialized (task_serial = 1)
+//
+// Proxy tasks are never pushed; they are invoked right away. Any other task
+// is pushed first and only invoked here if the push reports TASK_NOT_PUSHED.
+//
+// Returns: always TASK_CURRENT_NOT_QUEUED (the current task is not suspended
+// and queued by this function).
+kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
+                         bool serialize_immediate) {
+  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(new_task);
+
+  // Decide whether the task has to run now. The push is attempted only for
+  // non-proxy tasks, exactly as a short-circuit || would do.
+  bool run_now = (td->td_flags.proxy == TASK_PROXY);
+  if (!run_now)
+    run_now = (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED);
+
+  if (run_now) {
+    // Could not (or must not) defer: execute on the encountering thread.
+    kmp_taskdata_t *resume_task = __kmp_threads[gtid]->th.th_current_task;
+    if (serialize_immediate) {
+      td->td_flags.task_serial = 1;
+    }
+    __kmp_invoke_task(gtid, new_task, resume_task);
+  }
+
+  ANNOTATE_HAPPENS_BEFORE(new_task);
+  return TASK_CURRENT_NOT_QUEUED;
+}
+
+// __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
+// non-thread-switchable task from the parent thread only!
+//
+// loc_ref: location of original task pragma (only used in trace output)
+// gtid: Global Thread ID of encountering thread
+// new_task: non-thread-switchable task thunk allocated by
+// __kmp_omp_task_alloc()
+// Returns: the value returned by __kmp_omp_task(), documented as
+//    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
+//    be resumed later.
+//    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
+//    resumed later.
+kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
+                          kmp_task_t *new_task) {
+  kmp_int32 res;
+  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
+
+// new_taskdata is needed only by the trace output and the OMPT code.
+#if KMP_DEBUG || OMPT_SUPPORT
+  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
+#endif
+  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
+                new_taskdata));
+
+#if OMPT_SUPPORT
+  // parent stays NULL unless OMPT is enabled and the task has not started yet;
+  // the cleanup at the end of the function relies on that.
+  kmp_taskdata_t *parent = NULL;
+  if (UNLIKELY(ompt_enabled.enabled)) {
+    if (!new_taskdata->td_flags.started) {
+      OMPT_STORE_RETURN_ADDRESS(gtid);
+      parent = new_taskdata->td_parent;
+      // Record the enter frame on the parent only if none is set already.
+      if (!parent->ompt_task_info.frame.enter_frame.ptr) {
+        parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+      }
+      if (ompt_enabled.ompt_callback_task_create) {
+        ompt_data_t task_data = ompt_data_none;
+        ompt_callbacks.ompt_callback(ompt_callback_task_create)(
+            parent ? &(parent->ompt_task_info.task_data) : &task_data,
+            parent ? &(parent->ompt_task_info.frame) : NULL,
+            &(new_taskdata->ompt_task_info.task_data),
+            ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
+            OMPT_LOAD_RETURN_ADDRESS(gtid));
+      }
+    } else {
+      // We are scheduling the continuation of an UNTIED task.
+      // Scheduling back to the parent task.
+      __ompt_task_finish(new_task,
+                         new_taskdata->ompt_task_info.scheduling_parent,
+                         ompt_task_switch);
+      new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
+    }
+  }
+#endif
+
+  // Queue the task, or run it immediately (serialized) if it cannot be queued.
+  res = __kmp_omp_task(gtid, new_task, true);
+
+  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
+                "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
+                gtid, loc_ref, new_taskdata));
+#if OMPT_SUPPORT
+  // Clear the parent's enter frame if a parent was captured above.
+  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
+    parent->ompt_task_info.frame.enter_frame = ompt_data_none;
+  }
+#endif
+  return res;
+}
+
+// __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
+// a taskloop task with the correct OMPT return address
+//
+// loc_ref: location of original task pragma (only used in trace output)
+// gtid: Global Thread ID of encountering thread
+// new_task: non-thread-switchable task thunk allocated by
+// __kmp_omp_task_alloc()
+// codeptr_ra: return address for OMPT callback
+// Returns: the value returned by __kmp_omp_task(), documented as
+//    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
+//    be resumed later.
+//    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
+//    resumed later.
+kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
+                                  kmp_task_t *new_task, void *codeptr_ra) {
+  kmp_int32 res;
+  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
+
+// new_taskdata is needed only by the trace output and the OMPT code.
+#if KMP_DEBUG || OMPT_SUPPORT
+  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
+#endif
+  // The trace labels name this function (they previously said
+  // __kmpc_omp_task, which made the two entry points indistinguishable).
+  KA_TRACE(10,
+           ("__kmp_omp_taskloop_task(enter): T#%d loc=%p task=%p\n", gtid,
+            loc_ref, new_taskdata));
+
+#if OMPT_SUPPORT
+  // parent stays NULL unless OMPT is enabled and the task has not started yet;
+  // the cleanup at the end of the function relies on that.
+  kmp_taskdata_t *parent = NULL;
+  if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
+    parent = new_taskdata->td_parent;
+    // Record the enter frame on the parent only if none is set already.
+    if (!parent->ompt_task_info.frame.enter_frame.ptr)
+      parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
+    if (ompt_enabled.ompt_callback_task_create) {
+      ompt_data_t task_data = ompt_data_none;
+      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
+          parent ? &(parent->ompt_task_info.task_data) : &task_data,
+          parent ? &(parent->ompt_task_info.frame) : NULL,
+          &(new_taskdata->ompt_task_info.task_data),
+          ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
+          codeptr_ra);
+    }
+  }
+#endif
+
+  // Queue the task, or run it immediately (serialized) if it cannot be queued.
+  res = __kmp_omp_task(gtid, new_task, true);
+
+  KA_TRACE(10, ("__kmp_omp_taskloop_task(exit): T#%d returning "
+                "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
+                gtid, loc_ref, new_taskdata));
+#if OMPT_SUPPORT
+  // Clear the parent's enter frame if a parent was captured above.
+  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
+    parent->ompt_task_info.frame.enter_frame = ompt_data_none;
+  }
+#endif
+  return res;
+}
+
+// __kmpc_omp_taskwait_template: common implementation of taskwait.
+//
+// ompt: when true (and OMPT_SUPPORT && OMPT_OPTIONAL), the sync_region and
+//       sync_region_wait callbacks are dispatched around the wait
+// loc_ref: source location, recorded in td_taskwait_ident
+// gtid: Global Thread ID of the encountering thread
+// frame_address: stored as the current task's OMPT enter frame (ompt only)
+// return_address: code pointer passed to the OMPT callbacks (ompt only)
+// Returns: always TASK_CURRENT_NOT_QUEUED.
+//
+// Nothing is waited for when __kmp_tasking_mode is tskm_immediate_exec.
+template <bool ompt>
+static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
+                                              void *frame_address,
+                                              void *return_address) {
+  // Initialized because the exit trace prints taskdata even when the block
+  // below is skipped (tskm_immediate_exec).
+  kmp_taskdata_t *taskdata = NULL;
+  kmp_info_t *thread;
+  int thread_finished = FALSE;
+  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
+
+  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
+
+  if (__kmp_tasking_mode != tskm_immediate_exec) {
+    thread = __kmp_threads[gtid];
+    taskdata = thread->th.th_current_task;
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    ompt_data_t *my_task_data;
+    ompt_data_t *my_parallel_data;
+
+    if (ompt) {
+      my_task_data = &(taskdata->ompt_task_info.task_data);
+      my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
+
+      taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
+
+      if (ompt_enabled.ompt_callback_sync_region) {
+        ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
+            ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
+            my_task_data, return_address);
+      }
+
+      if (ompt_enabled.ompt_callback_sync_region_wait) {
+        ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
+            ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
+            my_task_data, return_address);
+      }
+    }
+#endif // OMPT_SUPPORT && OMPT_OPTIONAL
+
+// Debugger: The taskwait is active. Store location and thread encountered the
+// taskwait.
+#if USE_ITT_BUILD
+// Note: These values are used by ITT events as well.
+#endif /* USE_ITT_BUILD */
+    taskdata->td_taskwait_counter += 1;
+    taskdata->td_taskwait_ident = loc_ref;
+    taskdata->td_taskwait_thread = gtid + 1;
+
+#if USE_ITT_BUILD
+    void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
+    if (itt_sync_obj != NULL)
+      __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
+#endif /* USE_ITT_BUILD */
+
+    // Waiting is needed when the task is neither team-serialized nor final ...
+    bool must_wait =
+        !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
+
+    // ... or when the thread's task team has seen proxy tasks.
+    must_wait = must_wait || (thread->th.th_task_team != NULL &&
+                              thread->th.th_task_team->tt.tt_found_proxy_tasks);
+    if (must_wait) {
+      kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
+                             &(taskdata->td_incomplete_child_tasks)),
+                       0U);
+      // Execute other tasks until no incomplete child tasks remain.
+      while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
+        flag.execute_tasks(thread, gtid, FALSE,
+                           &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
+                           __kmp_task_stealing_constraint);
+      }
+    }
+#if USE_ITT_BUILD
+    if (itt_sync_obj != NULL)
+      __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
+#endif /* USE_ITT_BUILD */
+
+    // Debugger:  The taskwait is completed. Location remains, but thread is
+    // negated.
+    taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt) {
+      if (ompt_enabled.ompt_callback_sync_region_wait) {
+        ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
+            ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
+            my_task_data, return_address);
+      }
+      if (ompt_enabled.ompt_callback_sync_region) {
+        ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
+            ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
+            my_task_data, return_address);
+      }
+      taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
+    }
+#endif // OMPT_SUPPORT && OMPT_OPTIONAL
+
+    ANNOTATE_HAPPENS_AFTER(taskdata);
+  }
+
+  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
+                "returning TASK_CURRENT_NOT_QUEUED\n",
+                gtid, taskdata));
+
+  return TASK_CURRENT_NOT_QUEUED;
+}
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+// OMPT-enabled taskwait: forwards to the <true> instantiation of
+// __kmpc_omp_taskwait_template() with the caller-provided frame and return
+// addresses, and returns its result.
+OMPT_NOINLINE
+static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
+                                          void *frame_address,
+                                          void *return_address) {
+  return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
+                                            return_address);
+}
+#endif // OMPT_SUPPORT && OMPT_OPTIONAL
+
+// __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
+// complete
+//
+// loc_ref: source location of the taskwait
+// gtid: Global Thread ID of the encountering thread
+// Returns: the result of the taskwait template (TASK_CURRENT_NOT_QUEUED).
+kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // With OMPT enabled, take the instrumented path and hand it this frame's
+  // address and the stored return address.
+  if (UNLIKELY(ompt_enabled.enabled)) {
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+    return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
+                                    OMPT_LOAD_RETURN_ADDRESS(gtid));
+  }
+#endif
+  // Non-instrumented path: no frame or return address is needed.
+  return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
+}
+
+// __kmpc_omp_taskyield: switch to a different task
+//
+// loc_ref: source location, recorded in td_taskwait_ident
+// gtid: Global Thread ID of the encountering thread
+// end_part: only used in trace output here
+// Returns: always TASK_CURRENT_NOT_QUEUED.
+//
+// Nothing is done when __kmp_tasking_mode is tskm_immediate_exec or when
+// __kmp_init_parallel is not set.
+kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
+  // Initialized because the exit trace prints taskdata even when the block
+  // below is skipped.
+  kmp_taskdata_t *taskdata = NULL;
+  kmp_info_t *thread;
+  int thread_finished = FALSE;
+
+  KMP_COUNT_BLOCK(OMP_TASKYIELD);
+  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
+
+  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
+                gtid, loc_ref, end_part));
+
+  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
+    thread = __kmp_threads[gtid];
+    taskdata = thread->th.th_current_task;
+// Should we model this as a task wait or not?
+// Debugger: The taskwait is active. Store location and thread encountered the
+// taskwait.
+#if USE_ITT_BUILD
+// Note: These values are used by ITT events as well.
+#endif /* USE_ITT_BUILD */
+    taskdata->td_taskwait_counter += 1;
+    taskdata->td_taskwait_ident = loc_ref;
+    taskdata->td_taskwait_thread = gtid + 1;
+
+#if USE_ITT_BUILD
+    void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
+    if (itt_sync_obj != NULL)
+      __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
+#endif /* USE_ITT_BUILD */
+    // Other tasks are executed only if the current task is not
+    // team-serialized and the thread has a task team with tasking enabled.
+    if (!taskdata->td_flags.team_serial) {
+      kmp_task_team_t *task_team = thread->th.th_task_team;
+      if (task_team != NULL) {
+        if (KMP_TASKING_ENABLED(task_team)) {
+#if OMPT_SUPPORT
+          // Flag the yield for OMPT while other tasks are being executed.
+          if (UNLIKELY(ompt_enabled.enabled))
+            thread->th.ompt_thread_info.ompt_task_yielded = 1;
+#endif
+          __kmp_execute_tasks_32(
+              thread, gtid, NULL, FALSE,
+              &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
+              __kmp_task_stealing_constraint);
+#if OMPT_SUPPORT
+          if (UNLIKELY(ompt_enabled.enabled))
+            thread->th.ompt_thread_info.ompt_task_yielded = 0;
+#endif
+        }
+      }
+    }
+#if USE_ITT_BUILD
+    if (itt_sync_obj != NULL)
+      __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
+#endif /* USE_ITT_BUILD */
+
+    // Debugger:  The taskwait is completed. Location remains, but thread is
+    // negated.
+    taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
+  }
+
+  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
+                "returning TASK_CURRENT_NOT_QUEUED\n",
+                gtid, taskdata));
+
+  return TASK_CURRENT_NOT_QUEUED;
+}
+
+// Task Reduction implementation
+//
+// Note: the initial implementation did not take into account the possibility
+// of specifying omp_orig in the initializer of a UDR (user-defined reduction).
+// The corrected implementation takes the omp_orig object into account.
+// The compiler is free to use the old implementation if omp_orig is not
+// specified.
+
+/*!
+@ingroup BASIC_TYPES
+@{
+*/
+
+/*!
+Flags for special info per task reduction item.
+
+The two bit-fields together occupy 32 bits.
+*/
+typedef struct kmp_taskred_flags {
+  /*! 1 - use lazy alloc/init (e.g. big objects, #tasks < #threads) */
+  unsigned lazy_priv : 1;
+  /*! remaining 31 bits; no meaning is assigned to them in this definition */
+  unsigned reserved31 : 31;
+} kmp_taskred_flags_t;
+
+/*!
+Internal struct for reduction data item related info set up by compiler.
+
+Unlike kmp_taskred_input, this layout has no reduce_orig field.
+*/
+typedef struct kmp_task_red_input {
+  void *reduce_shar; /**< shared between tasks item to reduce into */
+  size_t reduce_size; /**< size of data item in bytes */
+  // three compiler-generated routines (init, fini are optional):
+  void *reduce_init; /**< data initialization routine (single parameter) */
+  void *reduce_fini; /**< data finalization routine */
+  void *reduce_comb; /**< data combiner routine */
+  kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
+} kmp_task_red_input_t;
+
+/*!
+Internal struct for reduction data item related info saved by the library.
+*/
+typedef struct kmp_taskred_data {
+  // One record per reduction item; an array of these is attached to a
+  // taskgroup through its reduce_data / reduce_num_data fields.
+  void *reduce_shar; /**< shared between tasks item to reduce into */
+  size_t reduce_size; /**< size of data item (stored rounded up to a multiple
+                           of CACHE_LINE) */
+  kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
+  void *reduce_priv; /**< array of thread specific items; with lazy_priv set it
+                          is instead an array of per-thread pointers that start
+                          out NULL */
+  void *reduce_pend; /**< end of private data for faster comparison op (only
+                          assigned when lazy_priv is not set) */
+  // three compiler-generated routines (init, fini are optional):
+  void *reduce_comb; /**< data combiner routine */
+  void *reduce_init; /**< data initialization routine (two parameters) */
+  void *reduce_fini; /**< data finalization routine */
+  void *reduce_orig; /**< original item (can be used in UDR initializer);
+                          NULL when the old single-parameter interface is used */
+} kmp_taskred_data_t;
+
+/*!
+Internal struct for reduction data item related info set up by compiler.
+
+New interface: added reduce_orig field to provide omp_orig for UDR initializer.
+*/
+typedef struct kmp_taskred_input {
+  // Element type of the descriptor array handed to __kmpc_taskred_init() and
+  // __kmpc_taskred_modifier_init().
+  void *reduce_shar; /**< shared between tasks item to reduce into */
+  void *reduce_orig; /**< original reduction item used for initialization; when
+                          NULL the library substitutes reduce_shar */
+  size_t reduce_size; /**< size of data item */
+  // three compiler-generated routines (init, fini are optional):
+  void *reduce_init; /**< data initialization routine (two parameters) */
+  void *reduce_fini; /**< data finalization routine */
+  void *reduce_comb; /**< data combiner routine */
+  kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
+} kmp_taskred_input_t;
+/*!
+@}
+*/
+
+// Store the omp_orig object for one reduction item. The behaviour depends on
+// which compiler interface produced the input descriptor.
+template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
+// Old interface: the descriptor has no omp_orig field, so nothing is recorded.
+template <>
+void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
+                                             kmp_task_red_input_t &src) {
+  item.reduce_orig = NULL;
+}
+// New interface: take the supplied omp_orig, or fall back to the shared item
+// when the compiler passed NULL. A non-NULL stored value is what later marks
+// the item as using the new (two-parameter initializer) interface.
+template <>
+void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
+                                            kmp_taskred_input_t &src) {
+  void *orig = src.reduce_orig;
+  item.reduce_orig = (orig != NULL) ? orig : src.reduce_shar;
+}
+
+// Invoke the compiler-generated initializer on the thread-specific copy that
+// starts `offset` bytes into item.reduce_priv.
+// The offset is a byte count that callers compute as
+// (thread index * padded item size), which is a size_t; it is accepted as
+// size_t so that large private blocks are not narrowed to int on the way in.
+template <typename T>
+void __kmp_call_init(kmp_taskred_data_t &item, size_t offset);
+// Old interface: the initializer receives only the object to initialize.
+template <>
+void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
+                                           size_t offset) {
+  ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
+}
+// New interface: the initializer additionally receives the omp_orig object.
+template <>
+void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
+                                          size_t offset) {
+  ((void (*)(void *, void *))item.reduce_init)(
+      (char *)(item.reduce_priv) + offset, item.reduce_orig);
+}
+
+// Build the per-taskgroup reduction table for the current taskgroup of thread
+// `gtid` from `num` compiler-provided descriptors in `data`.
+// T selects the descriptor layout (old or new interface). With a single-thread
+// team nothing is allocated and the taskgroup is returned untouched.
+// Returns the taskgroup pointer as an opaque handle.
+template <typename T>
+void *__kmp_task_reduction_init(int gtid, int num, T *data) {
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
+  kmp_int32 nth = thread->th.th_team_nproc;
+
+  // check input data just in case
+  KMP_ASSERT(tg != NULL);
+  KMP_ASSERT(data != NULL);
+  KMP_ASSERT(num > 0);
+  if (nth == 1) {
+    KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
+                  gtid, tg));
+    return (void *)tg;
+  }
+  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
+                gtid, tg, num));
+  kmp_taskred_data_t *items = (kmp_taskred_data_t *)__kmp_thread_malloc(
+      thread, num * sizeof(kmp_taskred_data_t));
+  for (int i = 0; i < num; ++i) {
+    kmp_taskred_data_t &out = items[i];
+    KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
+    // pad each thread-specific item up to a whole number of cache lines
+    size_t padded = data[i].reduce_size - 1;
+    padded += CACHE_LINE - padded % CACHE_LINE;
+    out.reduce_shar = data[i].reduce_shar;
+    out.reduce_size = padded;
+    out.flags = data[i].flags;
+    out.reduce_comb = data[i].reduce_comb;
+    out.reduce_init = data[i].reduce_init;
+    out.reduce_fini = data[i].reduce_fini;
+    __kmp_assign_orig<T>(out, data[i]);
+    if (out.flags.lazy_priv) {
+      // lazy mode: reserve one pointer slot per thread only; the objects are
+      // allocated/initialized on first request
+      // (__kmp_allocate zeroes the allocated memory)
+      out.reduce_priv = __kmp_allocate(nth * sizeof(void *));
+      continue;
+    }
+    // eager mode: one zero-filled, cache-line aligned block for all threads
+    out.reduce_priv = __kmp_allocate(nth * padded);
+    out.reduce_pend = (char *)(out.reduce_priv) + nth * padded;
+    if (out.reduce_init == NULL)
+      continue; // no initializer supplied, zero fill is all there is
+    for (int j = 0; j < nth; ++j)
+      __kmp_call_init<T>(out, j * padded);
+  }
+  tg->reduce_data = (void *)items;
+  tg->reduce_num_data = num;
+  return (void *)tg;
+}
+
+/*!
+@ingroup TASKING
+@param gtid      Global thread ID
+@param num       Number of data items to reduce
+@param data      Array of data for reduction
+@return The taskgroup identifier
+
+Initialize task reduction for the taskgroup.
+
+Note: this entry supposes the optional compiler-generated initializer routine
+has single parameter - pointer to object to be initialized. That means
+the reduction either does not use omp_orig object, or the omp_orig is accessible
+without help of the runtime library.
+*/
+void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
+  // The opaque array holds old-interface descriptors (no omp_orig field).
+  kmp_task_red_input_t *items = (kmp_task_red_input_t *)data;
+  return __kmp_task_reduction_init<kmp_task_red_input_t>(gtid, num, items);
+}
+
+/*!
+@ingroup TASKING
+@param gtid      Global thread ID
+@param num       Number of data items to reduce
+@param data      Array of data for reduction
+@return The taskgroup identifier
+
+Initialize task reduction for the taskgroup.
+
+Note: this entry supposes the optional compiler-generated initializer routine
+has two parameters, pointer to object to be initialized and pointer to omp_orig
+*/
+void *__kmpc_taskred_init(int gtid, int num, void *data) {
+  // The opaque array holds new-interface descriptors (with omp_orig field).
+  kmp_taskred_input_t *items = (kmp_taskred_input_t *)data;
+  return __kmp_task_reduction_init<kmp_taskred_input_t>(gtid, num, items);
+}
+
+// Copy task reduction data (except for shared pointers).
+// Give taskgroup `tg` its own reduction table cloned from `reduce_data`.
+// Everything (private copies, thunk routines, sizes, flags, ...) is shared
+// with the source table except the shared-item pointers, which are taken
+// from this thread's own descriptors in `data`.
+template <typename T>
+void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
+                                    kmp_taskgroup_t *tg, void *reduce_data) {
+  KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
+                " from data %p\n",
+                thr, tg, reduce_data));
+  const size_t bytes = num * sizeof(kmp_taskred_data_t);
+  kmp_taskred_data_t *own =
+      (kmp_taskred_data_t *)__kmp_thread_malloc(thr, bytes);
+  KMP_MEMCPY(own, reduce_data, bytes);
+  // overwrite the copied shared pointers with this thread's unique ones
+  kmp_taskred_data_t *dst = own;
+  T *src = data;
+  for (int left = num; left > 0; --left, ++dst, ++src)
+    dst->reduce_shar = src->reduce_shar;
+  tg->reduce_num_data = num;
+  tg->reduce_data = (void *)own;
+}
+
+/*!
+@ingroup TASKING
+@param gtid    Global thread ID
+@param tskgrp  The taskgroup ID (optional)
+@param data    Shared location of the item
+@return The pointer to per-thread data
+
+Get thread-specific location of data item
+*/
+void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
+  // Looks `data` up in the reduction tables of the given taskgroup (or the
+  // current one when tskgrp is NULL) and then of each enclosing taskgroup.
+  // `data` may be either the shared item or any thread's private copy of it.
+  // With a single-thread team the item itself is returned.
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_int32 nth = thread->th.th_team_nproc;
+  if (nth == 1)
+    return data; // nothing to do
+
+  kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
+  if (tg == NULL)
+    tg = thread->th.th_current_task->td_taskgroup;
+  KMP_ASSERT(tg != NULL);
+  KMP_ASSERT(data != NULL);
+  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
+
+  // The table pointer and length are read only after tg has been checked, so
+  // running off the outermost taskgroup ends the search and reaches the
+  // "unknown item" assertion instead of dereferencing a NULL parent.
+  for (; tg != NULL; tg = tg->parent) {
+    kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data);
+    kmp_int32 num = tg->reduce_num_data;
+    for (int i = 0; i < num; ++i) {
+      if (!arr[i].flags.lazy_priv) {
+        // eager mode: private copies form one contiguous block
+        if (data == arr[i].reduce_shar ||
+            (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
+          return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
+        continue; // not this item
+      }
+      // lazy mode: reduce_priv is an array of per-thread pointers
+      void **p_priv = (void **)(arr[i].reduce_priv);
+      // check shared location first, then any thread specific location
+      bool found = (data == arr[i].reduce_shar);
+      for (int j = 0; !found && j < nth; ++j)
+        found = (data == p_priv[j]);
+      if (!found)
+        continue; // not found, continue search
+      if (p_priv[tid] == NULL) {
+        // allocate thread specific object lazily
+        p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
+        if (arr[i].reduce_init != NULL) {
+          if (arr[i].reduce_orig != NULL) { // new interface
+            ((void (*)(void *, void *))arr[i].reduce_init)(
+                p_priv[tid], arr[i].reduce_orig);
+          } else { // old interface (single parameter)
+            ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
+          }
+        }
+      }
+      return p_priv[tid];
+    }
+  }
+  KMP_ASSERT2(0, "Unknown task reduction item");
+  return NULL; // ERROR, this line never executed
+}
+
+// Finalize task reduction.
+// Called from __kmpc_end_taskgroup()
+static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
+  // For every reduction item: fold each existing private copy into the shared
+  // item, finalize the copy if a finalizer was supplied, and release the
+  // private storage. Finally release the table and detach it from tg.
+  typedef void (*fini_fn_t)(void *);
+  typedef void (*comb_fn_t)(void *, void *);
+  kmp_int32 nth = th->th.th_team_nproc;
+  KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
+  kmp_taskred_data_t *items = (kmp_taskred_data_t *)tg->reduce_data;
+  const kmp_int32 count = tg->reduce_num_data;
+  for (int i = 0; i < count; ++i) {
+    kmp_taskred_data_t &item = items[i];
+    void *shared = item.reduce_shar;
+    fini_fn_t finalize = (fini_fn_t)(item.reduce_fini);
+    comb_fn_t combine = (comb_fn_t)(item.reduce_comb);
+    if (item.flags.lazy_priv) {
+      // lazy mode: only threads that requested a copy have a non-NULL slot
+      void **slots = (void **)(item.reduce_priv);
+      for (int t = 0; t < nth; ++t) {
+        if (slots[t] == NULL)
+          continue;
+        combine(shared, slots[t]); // combine results
+        if (finalize)
+          finalize(slots[t]); // finalize if needed
+        __kmp_free(slots[t]);
+      }
+    } else {
+      // eager mode: nth equally sized copies laid out back to back
+      char *base = (char *)item.reduce_priv;
+      const size_t stride = item.reduce_size;
+      for (int t = 0; t < nth; ++t) {
+        void *copy = base + t * stride;
+        combine(shared, copy); // combine results
+        if (finalize)
+          finalize(copy); // finalize if needed
+      }
+    }
+    __kmp_free(item.reduce_priv);
+  }
+  __kmp_thread_free(th, items);
+  tg->reduce_data = NULL;
+  tg->reduce_num_data = 0;
+}
+
+// Cleanup task reduction data for parallel or worksharing,
+// do not touch task private data other threads still working with.
+// Called from __kmpc_end_taskgroup()
+static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
+  // Release only this thread's own table; the private data it points to is
+  // left alone.
+  void *own_table = tg->reduce_data;
+  tg->reduce_num_data = 0;
+  tg->reduce_data = NULL;
+  __kmp_thread_free(th, own_table);
+}
+
+// Common implementation of task reduction initialization for a parallel or
+// worksharing construct.
+// Opens a new taskgroup for the calling thread. In a team of more than one
+// thread, the single thread that wins the compare-and-store builds the
+// reduction data with __kmp_task_reduction_init() and publishes a copy in
+// team->t.t_tg_reduce_data[is_ws]; every other thread waits for that
+// publication and clones it with __kmp_task_reduction_init_copy().
+// Returns the calling thread's new taskgroup.
+template <typename T>
+void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
+                                         int num, T *data) {
+  kmp_info_t *thr = __kmp_threads[gtid];
+  kmp_int32 nth = thr->th.th_team_nproc;
+  __kmpc_taskgroup(loc, gtid); // form new taskgroup first
+  if (nth == 1) {
+    KA_TRACE(10,
+             ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
+              gtid, thr->th.th_current_task->td_taskgroup));
+    return (void *)thr->th.th_current_task->td_taskgroup;
+  }
+  kmp_team_t *team = thr->th.th_team;
+  void *reduce_data;
+  kmp_taskgroup_t *tg;
+  reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
+  // (void *)1 serves as an "initialization in progress" marker: the waiting
+  // loop below spins for as long as the slot holds that value.
+  if (reduce_data == NULL &&
+      __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
+                                 (void *)1)) {
+    // single thread enters this block to initialize common reduction data
+    KMP_DEBUG_ASSERT(reduce_data == NULL);
+    // first initialize own data, then make a copy other threads can use
+    tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
+    reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
+    KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
+    // fini counters should be 0 at this point
+    KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
+    KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
+    // publish the finished copy; this replaces the marker with a real pointer
+    KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
+  } else {
+    while (
+        (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
+        (void *)1) { // wait for task reduction initialization
+      KMP_CPU_PAUSE();
+    }
+    KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
+    // the taskgroup opened by __kmpc_taskgroup() above receives the clone
+    tg = thr->th.th_current_task->td_taskgroup;
+    __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
+  }
+  return tg;
+}
+
+/*!
+@ingroup TASKING
+@param loc       Source location info
+@param gtid      Global thread ID
+@param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
+@param num       Number of data items to reduce
+@param data      Array of data for reduction
+@return The taskgroup identifier
+
+Initialize task reduction for a parallel or worksharing.
+
+Note: this entry supposes the optional compiler-generated initializer routine
+has single parameter - pointer to object to be initialized. That means
+the reduction either does not use omp_orig object, or the omp_orig is accessible
+without help of the runtime library.
+*/
+void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
+                                          int num, void *data) {
+  // The opaque array holds old-interface descriptors (no omp_orig field).
+  kmp_task_red_input_t *items = (kmp_task_red_input_t *)data;
+  return __kmp_task_reduction_modifier_init<kmp_task_red_input_t>(
+      loc, gtid, is_ws, num, items);
+}
+
+/*!
+@ingroup TASKING
+@param loc       Source location info
+@param gtid      Global thread ID
+@param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
+@param num       Number of data items to reduce
+@param data      Array of data for reduction
+@return The taskgroup identifier
+
+Initialize task reduction for a parallel or worksharing.
+
+Note: this entry supposes the optional compiler-generated initializer routine
+has two parameters, pointer to object to be initialized and pointer to omp_orig
+*/
+void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
+                                   void *data) {
+  // The opaque array holds new-interface descriptors (with omp_orig field).
+  kmp_taskred_input_t *items = (kmp_taskred_input_t *)data;
+  return __kmp_task_reduction_modifier_init<kmp_taskred_input_t>(
+      loc, gtid, is_ws, num, items);
+}
+
+/*!
+@ingroup TASKING
+@param loc       Source location info
+@param gtid      Global thread ID
+@param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
+
+Finalize task reduction for a parallel or worksharing.
+*/
+void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
+  // is_ws is not consulted here: __kmpc_end_taskgroup() works out whether the
+  // reduction belongs to the parallel or the worksharing slot by comparing
+  // private-data pointers against the team's published reduction data.
+  __kmpc_end_taskgroup(loc, gtid);
+}
+
+// __kmpc_taskgroup: Start a new taskgroup
+void __kmpc_taskgroup(ident_t *loc, int gtid) {
+  // Allocate a fresh taskgroup, chain it to the one currently active for the
+  // calling thread's task and make it the active one.
+  kmp_info_t *thr = __kmp_threads[gtid];
+  kmp_taskdata_t *cur_task = thr->th.th_current_task;
+  kmp_taskgroup_t *group =
+      (kmp_taskgroup_t *)__kmp_thread_malloc(thr, sizeof(kmp_taskgroup_t));
+  KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, group));
+  // start with no outstanding tasks, no cancellation and no reduction data
+  KMP_ATOMIC_ST_RLX(&group->count, 0);
+  KMP_ATOMIC_ST_RLX(&group->cancel_request, cancel_noreq);
+  group->parent = cur_task->td_taskgroup;
+  group->reduce_data = NULL;
+  group->reduce_num_data = 0;
+  cur_task->td_taskgroup = group;
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
+    // report the beginning of the taskgroup region to the tool
+    void *ret_addr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+    if (!ret_addr)
+      ret_addr = OMPT_GET_RETURN_ADDRESS(0);
+    kmp_team_t *cur_team = thr->th.th_team;
+    // the callback receives addresses of local copies of both data words
+    ompt_data_t task_data_copy = cur_task->ompt_task_info.task_data;
+    // FIXME: I think this is wrong for lwt!
+    ompt_data_t parallel_data_copy = cur_team->t.ompt_team_info.parallel_data;
+
+    ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
+        ompt_sync_region_taskgroup, ompt_scope_begin, &(parallel_data_copy),
+        &(task_data_copy), ret_addr);
+  }
+#endif
+}
+
+// __kmpc_end_taskgroup: Wait until all tasks generated by the current task
+//                       and its descendants are complete
+void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
+  // Phases: (1) execute tasks until the taskgroup's outstanding-task count
+  // reaches zero, (2) finish any task reduction attached to the taskgroup,
+  // (3) restore the parent taskgroup and free this one.
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_taskdata_t *taskdata = thread->th.th_current_task;
+  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
+  int thread_finished = FALSE;
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  // These are only assigned when a tool is enabled; every later use is guarded
+  // by a specific callback flag.
+  kmp_team_t *team;
+  ompt_data_t my_task_data;
+  ompt_data_t my_parallel_data;
+  void *codeptr;
+  if (UNLIKELY(ompt_enabled.enabled)) {
+    team = thread->th.th_team;
+    my_task_data = taskdata->ompt_task_info.task_data;
+    // FIXME: I think this is wrong for lwt!
+    my_parallel_data = team->t.ompt_team_info.parallel_data;
+    codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
+    if (!codeptr)
+      codeptr = OMPT_GET_RETURN_ADDRESS(0);
+  }
+#endif
+
+  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
+  KMP_DEBUG_ASSERT(taskgroup != NULL);
+  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
+
+  if (__kmp_tasking_mode != tskm_immediate_exec) {
+    // mark task as waiting not on a barrier
+    taskdata->td_taskwait_counter += 1;
+    taskdata->td_taskwait_ident = loc;
+    taskdata->td_taskwait_thread = gtid + 1;
+#if USE_ITT_BUILD
+    // For ITT the taskgroup wait is similar to taskwait until we need to
+    // distinguish them
+    void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
+    if (itt_sync_obj != NULL)
+      __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
+#endif /* USE_ITT_BUILD */
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
+          ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
+          &(my_task_data), codeptr);
+    }
+#endif
+
+    // Waiting is skipped for a serialized team unless the task team has seen
+    // proxy tasks.
+    if (!taskdata->td_flags.team_serial ||
+        (thread->th.th_task_team != NULL &&
+         thread->th.th_task_team->tt.tt_found_proxy_tasks)) {
+      kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)),
+                       0U);
+      // keep executing tasks while the taskgroup still has outstanding ones
+      while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
+        flag.execute_tasks(thread, gtid, FALSE,
+                           &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
+                           __kmp_task_stealing_constraint);
+      }
+    }
+    taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
+          ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
+          &(my_task_data), codeptr);
+    }
+#endif
+
+#if USE_ITT_BUILD
+    if (itt_sync_obj != NULL)
+      __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
+#endif /* USE_ITT_BUILD */
+  }
+  KMP_DEBUG_ASSERT(taskgroup->count == 0);
+
+  if (taskgroup->reduce_data != NULL) { // need to reduce?
+    int cnt;
+    void *reduce_data;
+    kmp_team_t *t = thread->th.th_team;
+    kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
+    // check if <priv> data of the first reduction variable shared for the team
+    // (slot [0] is checked first, then slot [1]; a match means this taskgroup
+    // was set up by the reduction-modifier entry points for that slot)
+    void *priv0 = arr[0].reduce_priv;
+    if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
+        ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
+      // finishing task reduction on parallel
+      // NOTE(review): cnt is compared against nproc - 1, so KMP_ATOMIC_INC
+      // presumably yields the pre-increment value -- confirm against its
+      // definition.
+      cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
+      if (cnt == thread->th.th_team_nproc - 1) {
+        // we are the last thread passing __kmpc_reduction_modifier_fini()
+        // finalize task reduction:
+        __kmp_task_reduction_fini(thread, taskgroup);
+        // cleanup fields in the team structure:
+        // TODO: is relaxed store enough here (whole barrier should follow)?
+        __kmp_thread_free(thread, reduce_data);
+        KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
+        KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
+      } else {
+        // we are not the last thread passing __kmpc_reduction_modifier_fini(),
+        // so do not finalize reduction, just clean own copy of the data
+        __kmp_task_reduction_clean(thread, taskgroup);
+      }
+    } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
+                   NULL &&
+               ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
+      // finishing task reduction on worksharing
+      cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
+      if (cnt == thread->th.th_team_nproc - 1) {
+        // we are the last thread passing __kmpc_reduction_modifier_fini()
+        __kmp_task_reduction_fini(thread, taskgroup);
+        // cleanup fields in team structure:
+        // TODO: is relaxed store enough here (whole barrier should follow)?
+        __kmp_thread_free(thread, reduce_data);
+        KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
+        KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
+      } else {
+        // we are not the last thread passing __kmpc_reduction_modifier_fini(),
+        // so do not finalize reduction, just clean own copy of the data
+        __kmp_task_reduction_clean(thread, taskgroup);
+      }
+    } else {
+      // finishing task reduction on taskgroup
+      __kmp_task_reduction_fini(thread, taskgroup);
+    }
+  }
+  // Restore parent taskgroup for the current task
+  taskdata->td_taskgroup = taskgroup->parent;
+  __kmp_thread_free(thread, taskgroup);
+
+  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
+                gtid, taskdata));
+  ANNOTATE_HAPPENS_AFTER(taskdata);
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
+    ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
+        ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
+        &(my_task_data), codeptr);
+  }
+#endif
+}
+
+// __kmp_remove_my_task: remove a task from my own deque
+static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
+                                        kmp_task_team_t *task_team,
+                                        kmp_int32 is_constrained) {
+  // Pops the task at the tail of the calling thread's own deque.
+  // Returns NULL when the deque is empty or when __kmp_task_is_allowed()
+  // rejects the tail task; otherwise returns the removed task.
+  kmp_task_t *task;
+  kmp_taskdata_t *taskdata;
+  kmp_thread_data_t *thread_data;
+  kmp_uint32 tail;
+
+  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
+  KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
+                   NULL); // Caller should check this condition
+
+  thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
+
+  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
+                gtid, thread_data->td.td_deque_ntasks,
+                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
+
+  // First emptiness check is made without taking the deque lock.
+  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
+    KA_TRACE(10,
+             ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
+              "ntasks=%d head=%u tail=%u\n",
+              gtid, thread_data->td.td_deque_ntasks,
+              thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
+    return NULL;
+  }
+
+  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
+
+  // Check again now that the lock is held.
+  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
+    __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
+    KA_TRACE(10,
+             ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
+              "ntasks=%d head=%u tail=%u\n",
+              gtid, thread_data->td.td_deque_ntasks,
+              thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
+    return NULL;
+  }
+
+  // The candidate is the entry just before the tail index.
+  tail = (thread_data->td.td_deque_tail - 1) &
+         TASK_DEQUE_MASK(thread_data->td); // Wrap index.
+  taskdata = thread_data->td.td_deque[tail];
+
+  if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
+                             thread->th.th_current_task)) {
+    // The TSC does not allow taking the tail task; the deque is left unchanged
+    __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
+    KA_TRACE(10,
+             ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
+              "ntasks=%d head=%u tail=%u\n",
+              gtid, thread_data->td.td_deque_ntasks,
+              thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
+    return NULL;
+  }
+
+  // Commit the removal: move the tail back and drop the task count by one.
+  thread_data->td.td_deque_tail = tail;
+  TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
+
+  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
+
+  KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
+                "ntasks=%d head=%u tail=%u\n",
+                gtid, taskdata, thread_data->td.td_deque_ntasks,
+                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
+
+  task = KMP_TASKDATA_TO_TASK(taskdata);
+  return task;
+}
+
+// __kmp_steal_task: remove a task from another thread's deque
+// Assume that calling thread has already checked existence of
+// task_team thread_data before calling this routine.
+static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
+                                    kmp_task_team_t *task_team,
+                                    std::atomic<kmp_int32> *unfinished_threads,
+                                    int *thread_finished,
+                                    kmp_int32 is_constrained) {
+  // Takes one task from the deque of victim_thr on behalf of thread gtid.
+  // The head task is preferred; if it is not allowed and the task team has
+  // encountered untied tasks, the rest of the deque is searched and the gap
+  // left by the chosen task is closed. Returns NULL when nothing is taken.
+  // If *thread_finished was set, it is cleared and *unfinished_threads is
+  // incremented before the victim's lock is released.
+  kmp_task_t *task;
+  kmp_taskdata_t *taskdata;
+  kmp_taskdata_t *current;
+  kmp_thread_data_t *victim_td, *threads_data;
+  kmp_int32 target;
+  kmp_int32 victim_tid;
+
+  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
+
+  threads_data = task_team->tt.tt_threads_data;
+  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
+
+  victim_tid = victim_thr->th.th_info.ds.ds_tid;
+  victim_td = &threads_data[victim_tid];
+
+  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
+                "task_team=%p ntasks=%d head=%u tail=%u\n",
+                gtid, __kmp_gtid_from_thread(victim_thr), task_team,
+                victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
+                victim_td->td.td_deque_tail));
+
+  // First emptiness check is made without taking the victim's deque lock.
+  if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
+    KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
+                  "task_team=%p ntasks=%d head=%u tail=%u\n",
+                  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
+                  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
+                  victim_td->td.td_deque_tail));
+    return NULL;
+  }
+
+  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
+
+  int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
+  // Check again after we acquire the lock
+  if (ntasks == 0) {
+    __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
+    KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
+                  "task_team=%p ntasks=%d head=%u tail=%u\n",
+                  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
+                  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
+    return NULL;
+  }
+
+  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
+  current = __kmp_threads[gtid]->th.th_current_task;
+  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
+  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
+    // Bump head pointer and Wrap.
+    victim_td->td.td_deque_head =
+        (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
+  } else {
+    if (!task_team->tt.tt_untied_task_encountered) {
+      // The TSC does not allow stealing the victim's head task
+      __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
+      KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
+                    "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
+                    gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
+                    victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
+      return NULL;
+    }
+    int i;
+    // walk through victim's deque trying to steal any task
+    // (starts at the entry after head, since head was already rejected)
+    target = victim_td->td.td_deque_head;
+    taskdata = NULL;
+    for (i = 1; i < ntasks; ++i) {
+      target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
+      taskdata = victim_td->td.td_deque[target];
+      if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
+        break; // found victim task
+      } else {
+        taskdata = NULL;
+      }
+    }
+    if (taskdata == NULL) {
+      // No appropriate candidate to steal found
+      __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
+      KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
+                    "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
+                    gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
+                    victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
+      return NULL;
+    }
+    // Close the gap: every entry after the stolen one moves one slot towards
+    // the head, continuing from the index where the search stopped.
+    int prev = target;
+    for (i = i + 1; i < ntasks; ++i) {
+      // shift remaining tasks in the deque left by 1
+      target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
+      victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
+      prev = target;
+    }
+    KMP_DEBUG_ASSERT(
+        victim_td->td.td_deque_tail ==
+        (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
+    victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
+  }
+  if (*thread_finished) {
+    // We need to un-mark this victim as a finished victim.  This must be done
+    // before releasing the lock, or else other threads (starting with the
+    // master victim) might be prematurely released from the barrier!!!
+    kmp_int32 count;
+
+    count = KMP_ATOMIC_INC(unfinished_threads);
+
+    KA_TRACE(
+        20,
+        ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
+         gtid, count + 1, task_team));
+
+    *thread_finished = FALSE;
+  }
+  // One task has left the victim's deque on either path above.
+  TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
+
+  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
+
+  KMP_COUNT_BLOCK(TASK_stolen);
+  KA_TRACE(10,
+           ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
+            "task_team=%p ntasks=%d head=%u tail=%u\n",
+            gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
+            ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
+
+  task = KMP_TASKDATA_TO_TASK(taskdata);
+  return task;
+}
+
+// __kmp_execute_tasks_template: Choose and execute tasks until either the
+// condition is satisfied (return true) or there are none left (return false).
+//
+// final_spin is TRUE if this is the spin at the release barrier.
+// thread_finished indicates whether the thread is finished executing all
+// the tasks it has on its deque, and is at the release barrier.
+// spinner is the location on which to spin.
+// spinner == NULL means only execute a single task and return.
+// checker is the value to check to terminate the spin.
+template <class C>
+static inline int __kmp_execute_tasks_template(
+    kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
+    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
+    kmp_int32 is_constrained) {
+  kmp_task_team_t *task_team = thread->th.th_task_team;
+  kmp_thread_data_t *threads_data;
+  kmp_task_t *task;
+  kmp_info_t *other_thread;
+  kmp_taskdata_t *current_task = thread->th.th_current_task;
+  std::atomic<kmp_int32> *unfinished_threads;
+  kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
+                      tid = thread->th.th_info.ds.ds_tid;
+
+  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
+  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
+
+  if (task_team == NULL || current_task == NULL)
+    return FALSE;
+
+  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
+                "*thread_finished=%d\n",
+                gtid, final_spin, *thread_finished));
+
+  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
+  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
+  KMP_DEBUG_ASSERT(threads_data != NULL);
+
+  nthreads = task_team->tt.tt_nproc;
+  unfinished_threads = &(task_team->tt.tt_unfinished_threads);
+  KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks);
+  KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
+
+  while (1) { // Outer loop keeps trying to find tasks in case of single thread
+    // getting tasks from target constructs
+    while (1) { // Inner loop to find a task and execute it
+      task = NULL;
+      if (use_own_tasks) { // check on own queue first
+        task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
+      }
+      if ((task == NULL) && (nthreads > 1)) { // Steal a task
+        int asleep = 1;
+        use_own_tasks = 0;
+        // Try to steal from the last place I stole from successfully.
+        if (victim_tid == -2) { // haven't stolen anything yet
+          victim_tid = threads_data[tid].td.td_deque_last_stolen;
+          if (victim_tid !=
+              -1) // if we have a last stolen from victim, get the thread
+            other_thread = threads_data[victim_tid].td.td_thr;
+        }
+        if (victim_tid != -1) { // found last victim
+          asleep = 0;
+        } else if (!new_victim) { // no recent steals and we haven't already
+          // used a new victim; select a random thread
+          do { // Find a different thread to steal work from.
+            // Pick a random thread. Initial plan was to cycle through all the
+            // threads, and only return if we tried to steal from every thread,
+            // and failed.  Arch says that's not such a great idea.
+            victim_tid = __kmp_get_random(thread) % (nthreads - 1);
+            if (victim_tid >= tid) {
+              ++victim_tid; // Adjusts random distribution to exclude self
+            }
+            // Found a potential victim
+            other_thread = threads_data[victim_tid].td.td_thr;
+            // There is a slight chance that __kmp_enable_tasking() did not wake
+            // up all threads waiting at the barrier.  If victim is sleeping,
+            // then wake it up. Since we were going to pay the cache miss
+            // penalty for referencing another thread's kmp_info_t struct
+            // anyway,
+            // the check shouldn't cost too much performance at this point. In
+            // extra barrier mode, tasks do not sleep at the separate tasking
+            // barrier, so this isn't a problem.
+            asleep = 0;
+            if ((__kmp_tasking_mode == tskm_task_teams) &&
+                (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
+                (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
+                 NULL)) {
+              asleep = 1;
+              __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread),
+                                        other_thread->th.th_sleep_loc);
+              // A sleeping thread should not have any tasks on it's queue.
+              // There is a slight possibility that it resumes, steals a task
+              // from another thread, which spawns more tasks, all in the time
+              // that it takes this thread to check => don't write an assertion
+              // that the victim's queue is empty.  Try stealing from a
+              // different thread.
+            }
+          } while (asleep);
+        }
+
+        if (!asleep) {
+          // We have a victim to try to steal from
+          task = __kmp_steal_task(other_thread, gtid, task_team,
+                                  unfinished_threads, thread_finished,
+                                  is_constrained);
+        }
+        if (task != NULL) { // set last stolen to victim
+          if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
+            threads_data[tid].td.td_deque_last_stolen = victim_tid;
+            // The pre-refactored code did not try more than 1 successful new
+            // vicitm, unless the last one generated more local tasks;
+            // new_victim keeps track of this
+            new_victim = 1;
+          }
+        } else { // No tasks found; unset last_stolen
+          KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
+          victim_tid = -2; // no successful victim found
+        }
+      }
+
+      if (task == NULL) // break out of tasking loop
+        break;
+
+// Found a task; execute it
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+      if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
+        if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
+          // get the object reliably
+          itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
+        }
+        __kmp_itt_task_starting(itt_sync_obj);
+      }
+#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
+      __kmp_invoke_task(gtid, task, current_task);
+#if USE_ITT_BUILD
+      if (itt_sync_obj != NULL)
+        __kmp_itt_task_finished(itt_sync_obj);
+#endif /* USE_ITT_BUILD */
+      // If this thread is only partway through the barrier and the condition is
+      // met, then return now, so that the barrier gather/release pattern can
+      // proceed. If this thread is in the last spin loop in the barrier,
+      // waiting to be released, we know that the termination condition will not
+      // be satisified, so don't waste any cycles checking it.
+      if (flag == NULL || (!final_spin && flag->done_check())) {
+        KA_TRACE(
+            15,
+            ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
+             gtid));
+        return TRUE;
+      }
+      if (thread->th.th_task_team == NULL) {
+        break;
+      }
+      KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
+      // If execution of a stolen task results in more tasks being placed on our
+      // run queue, reset use_own_tasks
+      if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
+        KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
+                      "other tasks, restart\n",
+                      gtid));
+        use_own_tasks = 1;
+        new_victim = 0;
+      }
+    }
+
+    // The task source has been exhausted. If in final spin loop of barrier,
+    // check if termination condition is satisfied. The work queue may be empty
+    // but there might be proxy tasks still executing.
+    if (final_spin &&
+        KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
+      // First, decrement the #unfinished threads, if that has not already been
+      // done.  This decrement might be to the spin location, and result in the
+      // termination condition being satisfied.
+      if (!*thread_finished) {
+        kmp_int32 count;
+
+        count = KMP_ATOMIC_DEC(unfinished_threads) - 1;
+        KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
+                      "unfinished_threads to %d task_team=%p\n",
+                      gtid, count, task_team));
+        *thread_finished = TRUE;
+      }
+
+      // It is now unsafe to reference thread->th.th_team !!!
+      // Decrementing task_team->tt.tt_unfinished_threads can allow the master
+      // thread to pass through the barrier, where it might reset each thread's
+      // th.th_team field for the next parallel region. If we can steal more
+      // work, we know that this has not happened yet.
+      if (flag != NULL && flag->done_check()) {
+        KA_TRACE(
+            15,
+            ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
+             gtid));
+        return TRUE;
+      }
+    }
+
+    // If this thread's task team is NULL, master has recognized that there are
+    // no more tasks; bail out
+    if (thread->th.th_task_team == NULL) {
+      KA_TRACE(15,
+               ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
+      return FALSE;
+    }
+
+    // We could be getting tasks from target constructs; if this is the only
+    // thread, keep trying to execute tasks from own queue
+    if (nthreads == 1)
+      use_own_tasks = 1;
+    else {
+      KA_TRACE(15,
+               ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
+      return FALSE;
+    }
+  }
+}
+
+int __kmp_execute_tasks_32(
+    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
+    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
+    kmp_int32 is_constrained) { // non-template entry point, C = kmp_flag_32
+  return __kmp_execute_tasks_template<kmp_flag_32>(
+      thread, gtid, flag, final_spin,
+      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
+}
+
+int __kmp_execute_tasks_64(
+    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
+    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
+    kmp_int32 is_constrained) { // non-template entry point, C = kmp_flag_64
+  return __kmp_execute_tasks_template<kmp_flag_64>(
+      thread, gtid, flag, final_spin,
+      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
+}
+
+int __kmp_execute_tasks_oncore(
+    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
+    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
+    kmp_int32 is_constrained) { // non-template entry point, C = kmp_flag_oncore
+  return __kmp_execute_tasks_template<kmp_flag_oncore>(
+      thread, gtid, flag, final_spin,
+      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
+}
+
+// __kmp_enable_tasking: Set up the task team's threads_data array and resume
+// threads sleeping at the barrier so they can help execute enqueued tasks.
+// Only the thread that performs the setup goes on to wake the others.
+static void __kmp_enable_tasking(kmp_task_team_t *task_team,
+                                 kmp_info_t *this_thr) {
+  kmp_thread_data_t *threads_data;
+  int nthreads, i, is_init_thread;
+
+  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
+                __kmp_gtid_from_thread(this_thr)));
+
+  KMP_DEBUG_ASSERT(task_team != NULL);
+  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
+
+  nthreads = task_team->tt.tt_nproc;
+  KMP_DEBUG_ASSERT(nthreads > 0);
+  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
+
+  // Allocate or increase the size of threads_data if necessary
+  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
+
+  if (!is_init_thread) {
+    // Some other thread already set up the array.
+    KA_TRACE(
+        20,
+        ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
+         __kmp_gtid_from_thread(this_thr)));
+    return;
+  }
+  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
+  KMP_DEBUG_ASSERT(threads_data != NULL);
+
+  if (__kmp_tasking_mode == tskm_task_teams &&
+      (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
+    // Release any threads sleeping at the barrier, so that they can steal
+    // tasks and execute them.  In extra barrier mode, tasks do not sleep
+    // at the separate tasking barrier, so this isn't a problem.
+    for (i = 0; i < nthreads; i++) {
+      volatile void *sleep_loc;
+      kmp_info_t *thread = threads_data[i].td.td_thr;
+
+      if (i == this_thr->th.th_info.ds.ds_tid) {
+        continue;
+      }
+      // Since we haven't locked the thread's suspend mutex lock at this
+      // point, there is a small window where a thread might be putting
+      // itself to sleep, but hasn't set the th_sleep_loc field yet.
+      // To work around this, __kmp_execute_tasks_template() periodically
+      // checks to see if other threads are sleeping (using the same random
+      // mechanism that is used for task stealing) and awakens them if so.
+      if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
+          NULL) {
+        KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
+                      __kmp_gtid_from_thread(this_thr),
+                      __kmp_gtid_from_thread(thread)));
+        __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
+      } else {
+        KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
+                      __kmp_gtid_from_thread(this_thr),
+                      __kmp_gtid_from_thread(thread)));
+      }
+    }
+  }
+
+  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
+                __kmp_gtid_from_thread(this_thr)));
+}
+
+/* // TODO: Check the comment consistency
+ * Utility routines for "task teams".  A task team (kmp_task_team_t) is kind
+ * of like a shadow of the kmp_team_t data struct, with a different lifetime.
+ * After a child thread checks into a barrier and calls __kmp_release() from
+ * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
+ * longer assume that the kmp_team_t structure is intact (at any moment, the
+ * master thread may exit the barrier code and free the team data structure,
+ * and return the threads to the thread pool).
+ *
+ * This does not work with the tasking code, as the thread is still
+ * expected to participate in the execution of any tasks that may have been
+ * spawned by a member of the team, and the thread still needs access
+ * to each thread in the team, so that it can steal work from it.
+ *
+ * Enter the existence of the kmp_task_team_t struct.  It employs a reference
+ * counting mechanism, and is allocated by the master thread before calling
+ * __kmp_<barrier_kind>_release, and then is released by the last thread to
+ * exit __kmp_<barrier_kind>_release at the next barrier.  I.e. the lifetimes
+ * of the kmp_task_team_t structs for consecutive barriers can overlap
+ * (and will, unless the master thread is the last thread to exit the barrier
+ * release phase, which is not typical). The existence of such a struct is
+ * useful outside the context of tasking.
+ *
+ * We currently use the existence of the threads array as an indicator that
+ * tasks were spawned since the last barrier.  If the structure is to be
+ * useful outside the context of tasking, then this will have to change, but
+ * not setting the field minimizes the performance impact of tasking on
+ * barriers, when no explicit tasks were spawned (pushed, actually).
+ */
+
+static kmp_task_team_t *__kmp_free_task_teams =
+    NULL; // Head of the free list of recycled task teams (via tt_next)
+// Lock held while the __kmp_free_task_teams list is modified
+kmp_bootstrap_lock_t __kmp_task_team_lock =
+    KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
+
+// __kmp_alloc_task_deque:
+// Sets up the task deque of one thread: initializes the deque lock, marks
+// "no last victim", and allocates INITIAL_TASK_DEQUE_SIZE zeroed slots.  The
+// deque must not have been allocated yet.  No lock is taken here; per the
+// original note each thread allocates its own deque.
+static void __kmp_alloc_task_deque(kmp_info_t *thread,
+                                   kmp_thread_data_t *thread_data) {
+  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
+
+  // The deque must start out unallocated and empty.
+  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
+  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
+  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
+  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
+
+  // Initialize last stolen task field to "none"
+  thread_data->td.td_deque_last_stolen = -1;
+
+  KE_TRACE(
+      10,
+      ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
+       __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
+  // __kmp_allocate() is used instead of __kmp_thread_calloc() because threads
+  // are not around for kmp_reap_task_team( ).
+  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
+      INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
+  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
+}
+
+// __kmp_free_task_deque:
+// Deallocates a task deque for a particular thread (no-op if td_deque is
+// NULL). Happens at library deallocation so not all fields need resetting.
+static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
+  if (thread_data->td.td_deque != NULL) {
+    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
+    TCW_4(thread_data->td.td_deque_ntasks, 0);
+    __kmp_free(thread_data->td.td_deque);
+    thread_data->td.td_deque = NULL;
+    __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
+  }
+
+#ifdef BUILD_TIED_TASK_STACK
+  // GEH: unresolved for td_susp_tied_tasks. NOTE(review): no gtid declared here
+  if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
+    __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
+  }
+#endif // BUILD_TIED_TASK_STACK
+}
+
+// __kmp_realloc_task_threads_data:
+// Allocates a threads_data array for a task team, either by allocating an
+// initial array or enlarging an existing array.  Only the first thread to get
+// the lock allocs or enlarges the array and re-initializes the array elements.
+// That thread returns "TRUE", the rest return "FALSE".
+// Assumes that the new array size is given by task_team -> tt.tt_nproc.
+// The current size is given by task_team -> tt.tt_max_threads.
+static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
+                                           kmp_task_team_t *task_team) {
+  kmp_thread_data_t **threads_data_p;
+  kmp_int32 nthreads, maxthreads;
+  int is_init_thread = FALSE;
+
+  if (TCR_4(task_team->tt.tt_found_tasks)) {
+    // Already reallocated and initialized.
+    return FALSE;
+  }
+
+  threads_data_p = &task_team->tt.tt_threads_data;
+  nthreads = task_team->tt.tt_nproc;
+  maxthreads = task_team->tt.tt_max_threads;
+
+  // All threads must lock when they encounter the first task of the implicit
+  // task region to make sure threads_data fields are (re)initialized before
+  // used.
+  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
+
+  if (!TCR_4(task_team->tt.tt_found_tasks)) {
+    // first thread to enable tasking
+    kmp_team_t *team = thread->th.th_team;
+    int i;
+
+    is_init_thread = TRUE;
+    if (maxthreads < nthreads) {
+
+      if (*threads_data_p != NULL) {
+        kmp_thread_data_t *old_data = *threads_data_p;
+        kmp_thread_data_t *new_data = NULL;
+
+        KE_TRACE(
+            10,
+            ("__kmp_realloc_task_threads_data: T#%d reallocating "
+             "threads data for task_team %p, new_size = %d, old_size = %d\n",
+             __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
+        // Reallocate threads_data to have more elements than current array
+        // Cannot use __kmp_thread_realloc() because threads not around for
+        // kmp_reap_task_team( ).  Note all new array entries are initialized
+        // to zero by __kmp_allocate().
+        new_data = (kmp_thread_data_t *)__kmp_allocate(
+            nthreads * sizeof(kmp_thread_data_t));
+        // copy old data to new data
+        KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
+                     (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
+
+#ifdef BUILD_TIED_TASK_STACK
+        // Index new_data: the old array (still installed) has no entry >= max
+        for (i = maxthreads; i < nthreads; i++) {
+          kmp_thread_data_t *thread_data = &new_data[i];
+          __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
+        }
+#endif // BUILD_TIED_TASK_STACK
+        // Install the new data and free the old data
+        (*threads_data_p) = new_data;
+        __kmp_free(old_data);
+      } else {
+        KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
+                      "threads data for task_team %p, size = %d\n",
+                      __kmp_gtid_from_thread(thread), task_team, nthreads));
+        // Make the initial allocate for threads_data array, and zero entries
+        // Cannot use __kmp_thread_calloc() because threads not around for
+        // kmp_reap_task_team( ).
+        ANNOTATE_IGNORE_WRITES_BEGIN();
+        *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
+            nthreads * sizeof(kmp_thread_data_t));
+        ANNOTATE_IGNORE_WRITES_END();
+#ifdef BUILD_TIED_TASK_STACK
+        // GEH: Figure out if this is the right thing to do
+        for (i = 0; i < nthreads; i++) {
+          kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
+          __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
+        }
+#endif // BUILD_TIED_TASK_STACK
+      }
+      task_team->tt.tt_max_threads = nthreads;
+    } else {
+      // If array has (more than) enough elements, go ahead and use it
+      KMP_DEBUG_ASSERT(*threads_data_p != NULL);
+    }
+
+    // initialize threads_data pointers back to thread_info structures
+    for (i = 0; i < nthreads; i++) {
+      kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
+      thread_data->td.td_thr = team->t.t_threads[i];
+
+      if (thread_data->td.td_deque_last_stolen >= nthreads) {
+        // The last stolen field survives across teams / barrier, and the number
+        // of threads may have changed.  It's possible (likely?) that a new
+        // parallel region will exhibit the same behavior as previous region.
+        thread_data->td.td_deque_last_stolen = -1;
+      }
+    }
+
+    KMP_MB();
+    TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
+  }
+
+  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
+  return is_init_thread;
+}
+
+// __kmp_free_task_threads_data:
+// Under tt_threads_lock, frees every deque attached to the task team's
+// threads_data array and then the array itself.  Library shutdown only.
+static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
+  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
+  kmp_thread_data_t *data = task_team->tt.tt_threads_data;
+  if (data != NULL) {
+    // Release each per-thread deque before the array that holds them.
+    for (int idx = 0; idx < task_team->tt.tt_max_threads; ++idx)
+      __kmp_free_task_deque(&data[idx]);
+    __kmp_free(data);
+    task_team->tt.tt_threads_data = NULL;
+  }
+  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
+}
+
+// __kmp_allocate_task_team:
+// Returns a task team for the given team, recycled from the global free list
+// when one is available and freshly allocated otherwise.  Resets the tasking
+// flags and sizes tt_nproc / tt_unfinished_threads from team->t.t_nproc.
+static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
+                                                 kmp_team_t *team) {
+  kmp_task_team_t *task_team = NULL;
+  int nthreads;
+
+  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
+                (thread ? __kmp_gtid_from_thread(thread) : -1), team));
+
+  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
+    // Take a task team from the task team pool
+    __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
+    if (__kmp_free_task_teams != NULL) {
+      task_team = __kmp_free_task_teams;
+      TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
+      task_team->tt.tt_next = NULL;
+    }
+    __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
+  }
+
+  if (task_team == NULL) {
+    KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
+                  "task team for team %p\n",
+                  (thread ? __kmp_gtid_from_thread(thread) : -1), team));
+    // Allocate a new task team if one is not available.
+    // Cannot use __kmp_thread_malloc() because threads not around for
+    // kmp_reap_task_team( ).
+    task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
+    __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
+    // AC: __kmp_allocate zeroes returned memory
+    // task_team -> tt.tt_threads_data = NULL;
+    // task_team -> tt.tt_max_threads = 0;
+    // task_team -> tt.tt_next = NULL;
+  }
+
+  TCW_4(task_team->tt.tt_found_tasks, FALSE);
+  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
+  task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
+
+  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
+  TCW_4(task_team->tt.tt_active, TRUE);
+
+  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
+                "unfinished_threads init'd to %d\n",
+                (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
+                KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
+  return task_team;
+}
+
+// __kmp_free_task_team:
+// Returns a task team to the global free list for later reuse; nothing is
+// deallocated here.  The task team must not already be linked (tt_next NULL).
+void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
+  KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
+                thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
+
+  // Push the task team onto the head of the free list, under the list lock
+  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
+
+  KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
+  task_team->tt.tt_next = __kmp_free_task_teams;
+  TCW_PTR(__kmp_free_task_teams, task_team);
+
+  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
+}
+
+// __kmp_reap_task_teams:
+// Drains the task team free list, releasing each entry together with its
+// threads_data array (if it has one).
+// Should only be done during library shutdown; nothing here may depend on a
+// thread structure or gtid since they are already gone.
+void __kmp_reap_task_teams(void) {
+  // Unlocked peek: nothing to do when the free list is empty.
+  if (TCR_PTR(__kmp_free_task_teams) == NULL)
+    return;
+
+  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
+  for (kmp_task_team_t *entry = __kmp_free_task_teams; entry != NULL;
+       entry = __kmp_free_task_teams) {
+    // Unlink the head before it is destroyed.
+    __kmp_free_task_teams = entry->tt.tt_next;
+    entry->tt.tt_next = NULL;
+
+    // Free threads_data if necessary
+    if (entry->tt.tt_threads_data != NULL)
+      __kmp_free_task_threads_data(entry);
+    __kmp_free(entry);
+  }
+  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
+}
+
+// __kmp_wait_to_unref_task_teams:
+// Threads may still be in the fork barrier release code, possibly stealing
+// tasks. Spin until each pooled thread's th_task_team is NULL, waking sleepers.
+void __kmp_wait_to_unref_task_teams(void) {
+  kmp_info_t *thread;
+  kmp_uint32 spins;
+  int done;
+
+  KMP_INIT_YIELD(spins);
+
+  for (;;) {
+    done = TRUE;
+
+    // TODO: GEH - this may be wrong because some sync would be necessary
+    // in case threads are added to the pool during the traversal. Need to
+    // verify that lock for thread pool is held when calling this routine.
+    for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
+         thread = thread->th.th_next_pool) {
+#if KMP_OS_WINDOWS
+      DWORD exit_val;
+#endif
+      if (TCR_PTR(thread->th.th_task_team) == NULL) {
+        KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
+                      __kmp_gtid_from_thread(thread)));
+        continue;
+      }
+#if KMP_OS_WINDOWS
+      // TODO: GEH - add this check for Linux* OS / OS X* as well?
+      if (!__kmp_is_thread_alive(thread, &exit_val)) {
+        thread->th.th_task_team = NULL;
+        continue;
+      }
+#endif
+
+      done = FALSE; // Because th_task_team pointer is not NULL for this thread
+
+      KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
+                    "unreference task_team\n",
+                    __kmp_gtid_from_thread(thread)));
+
+      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
+        volatile void *sleep_loc;
+        // If the thread is sleeping, awaken it (only the target is logged).
+        if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
+            NULL) {
+          KA_TRACE(
+              10,
+              ("__kmp_wait_to_unref_task_team: waking up thread T#%d\n",
+               __kmp_gtid_from_thread(thread)));
+          __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
+        }
+      }
+    }
+    if (done) {
+      break;
+    }
+
+    // If oversubscribed or have waited a bit, yield.
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+  }
+}
+
+// __kmp_task_team_setup: Allocate missing entries of team->t.t_task_team[]:
+// the current slot if always or nproc > 1, the other slot only if nproc > 1.
+void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
+  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
+
+  // If this task_team hasn't been created yet, allocate it. It will be used in
+  // the region after the next.
+  // If it exists, it is the current task team and shouldn't be touched yet as
+  // it may still be in use.
+  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
+      (always || team->t.t_nproc > 1)) {
+    team->t.t_task_team[this_thr->th.th_task_state] =
+        __kmp_allocate_task_team(this_thr, team);
+    KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p "
+                  "for team %d at parity=%d\n",
+                  __kmp_gtid_from_thread(this_thr),
+                  team->t.t_task_team[this_thr->th.th_task_state],
+                  ((team != NULL) ? team->t.t_id : -1),
+                  this_thr->th.th_task_state));
+  }
+
+  // After threads exit the release, they will call sync, and then point to this
+  // other task_team; make sure it is allocated and properly initialized. As
+  // threads spin in the barrier release phase, they will continue to use the
+  // previous task_team struct (above), until they receive the signal to stop
+  // checking for tasks (they can't safely reference the kmp_team_t struct,
+  // which could be reallocated by the master thread). No task teams are formed
+  // for serialized teams.
+  if (team->t.t_nproc > 1) {
+    int other_team = 1 - this_thr->th.th_task_state;
+    if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
+      team->t.t_task_team[other_team] =
+          __kmp_allocate_task_team(this_thr, team);
+      KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new "
+                    "task_team %p for team %d at parity=%d\n",
+                    __kmp_gtid_from_thread(this_thr),
+                    team->t.t_task_team[other_team],
+                    ((team != NULL) ? team->t.t_id : -1), other_team));
+    } else { // Leave the old task team struct in place for the upcoming region;
+      // reset its fields only if it is inactive or the team size has changed
+      kmp_task_team_t *task_team = team->t.t_task_team[other_team];
+      if (!task_team->tt.tt_active ||
+          team->t.t_nproc != task_team->tt.tt_nproc) {
+        TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
+        TCW_4(task_team->tt.tt_found_tasks, FALSE);
+        TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
+        KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
+                          team->t.t_nproc);
+        TCW_4(task_team->tt.tt_active, TRUE);
+      }
+      // if team size has changed, the first thread to enable tasking will
+      // realloc threads_data if necessary
+      KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team "
+                    "%p for team %d at parity=%d\n",
+                    __kmp_gtid_from_thread(this_thr),
+                    team->t.t_task_team[other_team],
+                    ((team != NULL) ? team->t.t_id : -1), other_team));
+    }
+  }
+}
+
+// __kmp_task_team_sync: Flips this thread's th_task_state (0 <-> 1) and copies
+// team->t.t_task_team[new state] into th_task_team.  Happens just after the
+// release phase of a team barrier; only for teams with # threads > 1.
+void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
+  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
+
+  // Toggle the th_task_state field, to switch which task_team this thread
+  // refers to; the toggled value indexes t_task_team[] below
+  this_thr->th.th_task_state = 1 - this_thr->th.th_task_state;
+  // It is now safe to propagate the task team pointer from the team struct to
+  // the current thread.
+  TCW_PTR(this_thr->th.th_task_team,
+          team->t.t_task_team[this_thr->th.th_task_state]);
+  KA_TRACE(20,
+           ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
+            "%p from Team #%d (parity=%d)\n",
+            __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
+            ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
+}
+
+// __kmp_task_team_wait: Master thread waits for outstanding tasks after the
+// barrier gather phase. Only called by master thread if #threads in team > 1 or
+// if proxy tasks were created.
+//
+// wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
+// by passing in 0 optionally as the last argument. When wait is zero, master
+// thread does not wait for unfinished_threads to reach 0.
+//
+// this_thr:     calling thread; its th_task_state selects the task team slot
+// team:         team whose t_task_team[] entry is waited on and deactivated
+// itt_sync_obj: (USE_ITT_BUILD only) forwarded unchanged to the flag wait
+// wait:         nonzero to block until tt_unfinished_threads reaches 0
+void __kmp_task_team_wait(
+    kmp_info_t *this_thr,
+    kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
+  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
+
+  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
+  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
+
+  // Nothing to do when there is no task team or tasking was never enabled on it
+  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
+    if (wait) {
+      KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks "
+                    "(for unfinished_threads to reach 0) on task_team = %p\n",
+                    __kmp_gtid_from_thread(this_thr), task_team));
+      // Worker threads may have dropped through to release phase, but could
+      // still be executing tasks. Wait here for tasks to complete. To avoid
+      // memory contention, only master thread checks termination condition.
+      kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
+                             &task_team->tt.tt_unfinished_threads),
+                       0U);
+      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
+    }
+    // Deactivate the old task team, so that the worker threads will stop
+    // referencing it while spinning.
+    // NOTE(review): the trace text below also mentions the team's pointer, but
+    // only the thread-local th_task_team is cleared in this function.
+    KA_TRACE(
+        20,
+        ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
+         "setting active to false, setting local and team's pointer to NULL\n",
+         __kmp_gtid_from_thread(this_thr), task_team));
+    KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
+                     task_team->tt.tt_found_proxy_tasks == TRUE);
+    // Reset the per-region flags before marking the task team inactive
+    TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
+    KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
+    TCW_SYNC_4(task_team->tt.tt_active, FALSE);
+    KMP_MB();
+
+    // Drop this thread's reference to the deactivated task team
+    TCW_PTR(this_thr->th.th_task_team, NULL);
+  }
+}
+
+// __kmp_tasking_barrier:
+// This routine may only be called when __kmp_tasking_mode ==
+// tskm_extra_barrier.
+// Internal function to execute all tasks prior to a regular barrier or a join
+// barrier. It is a full barrier itself, which unfortunately turns regular
+// barriers into double barriers and join barriers into 1 1/2 barriers.
+//
+// team:   team whose current task team's tt_unfinished_threads is spun on
+// thread: calling thread; its th_task_state selects the task team slot
+// gtid:   global thread id forwarded to execute_tasks
+void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
+  std::atomic<kmp_uint32> *spin = RCAST(
+      std::atomic<kmp_uint32> *,
+      &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
+  int flag = FALSE;
+  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
+
+#if USE_ITT_BUILD
+  KMP_FSYNC_SPIN_INIT(spin, NULL);
+#endif /* USE_ITT_BUILD */
+  kmp_flag_32 spin_flag(spin, 0U);
+  // Keep executing tasks until execute_tasks reports a nonzero result
+  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
+                                  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
+#if USE_ITT_BUILD
+    // TODO: What about itt_sync_obj??
+    KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
+#endif /* USE_ITT_BUILD */
+
+    // Leave the loop when the global done flag is set; abort this thread
+    // first if the global abort flag is set as well
+    if (TCR_4(__kmp_global.g.g_done)) {
+      if (__kmp_global.g.g_abort)
+        __kmp_abort_thread();
+      break;
+    }
+    KMP_YIELD(TRUE);
+  }
+#if USE_ITT_BUILD
+  KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
+#endif /* USE_ITT_BUILD */
+}
+
+// __kmp_give_task puts a task into a given thread queue if:
+//  - the queue for that thread was created
+//  - there's space in that queue
+// Because of this, __kmp_push_task needs to check if there's space after
+// getting the lock
+//
+// thread: thread handle forwarded to __kmp_realloc_task_deque
+// tid:    index into the task team's tt_threads_data of the target deque
+// task:   task to enqueue; its taskdata supplies the task team
+// pass:   growth budget; a full deque is only enlarged while
+//         TASK_DEQUE_SIZE / INITIAL_TASK_DEQUE_SIZE is below this value
+// returns: true if the task was appended to the deque, false otherwise
+static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
+                            kmp_int32 pass) {
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+  kmp_task_team_t *task_team = taskdata->td_task_team;
+
+  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
+                taskdata, tid));
+
+  // If task_team is NULL something went really bad...
+  KMP_DEBUG_ASSERT(task_team != NULL);
+
+  bool result = false;
+  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
+
+  if (thread_data->td.td_deque == NULL) {
+    // There's no queue in this thread, go find another one
+    // We're guaranteed that at least one thread has a queue
+    KA_TRACE(30,
+             ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
+              tid, taskdata));
+    return result;
+  }
+
+  if (TCR_4(thread_data->td.td_deque_ntasks) >=
+      TASK_DEQUE_SIZE(thread_data->td)) {
+    KA_TRACE(
+        30,
+        ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
+         taskdata, tid));
+
+    // if this deque is bigger than the pass ratio give a chance to another
+    // thread
+    if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
+      return result;
+
+    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
+    // The fullness test above ran without the lock, so the deque may have
+    // been drained or already enlarged in the meantime. Re-check under the
+    // lock and only grow a deque that is still full, mirroring the locked
+    // branch below.
+    if (TCR_4(thread_data->td.td_deque_ntasks) >=
+        TASK_DEQUE_SIZE(thread_data->td)) {
+      __kmp_realloc_task_deque(thread, thread_data);
+    }
+
+  } else {
+
+    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
+
+    // The deque may have filled up between the unlocked test and taking the
+    // lock, so test again now that the lock is held
+    if (TCR_4(thread_data->td.td_deque_ntasks) >=
+        TASK_DEQUE_SIZE(thread_data->td)) {
+      KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
+                    "thread %d.\n",
+                    taskdata, tid));
+
+      // if this deque is bigger than the pass ratio give a chance to another
+      // thread
+      if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
+        goto release_and_exit;
+
+      __kmp_realloc_task_deque(thread, thread_data);
+    }
+  }
+
+  // lock is held here, and there is space in the deque
+
+  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
+  // Wrap index.
+  thread_data->td.td_deque_tail =
+      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
+  TCW_4(thread_data->td.td_deque_ntasks,
+        TCR_4(thread_data->td.td_deque_ntasks) + 1);
+
+  result = true;
+  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
+                taskdata, tid));
+
+release_and_exit:
+  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
+
+  return result;
+}
+
+/* The finish of the proxy tasks is divided in two pieces:
+    - the top half is the one that can be done from a thread outside the team
+    - the bottom half must be run from a thread within the team
+
+   In order to run the bottom half the task gets queued back into one of the
+   threads of the team. Once the td_incomplete_child_tasks counter of the
+   parent is decremented the threads can leave the barriers. So, the bottom
+   half needs to be queued before the counter is decremented. The top half is
+   therefore divided in two parts:
+    - things that can be run before queuing the bottom half
+    - things that must be run after queuing the bottom half
+
+   This creates a second race as the bottom half can free the task before the
+   second top half is executed. To avoid this we use the
+   td_incomplete_child_tasks of the proxy task to synchronize the top and
+   bottom half. */
+
+// First part of the top half: marks the proxy task complete, decrements its
+// taskgroup count (if it has a taskgroup) and adds one placeholder child to
+// the task's own td_incomplete_child_tasks counter.
+//
+// taskdata: proxy task being finished
+static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
+  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
+  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
+  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
+  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
+
+  taskdata->td_flags.complete = 1; // mark the task as completed
+
+  if (taskdata->td_taskgroup)
+    KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
+
+  // Create an imaginary child for this task so the bottom half cannot
+  // release the task before we have completed the second top half
+  KMP_ATOMIC_INC(&taskdata->td_incomplete_child_tasks);
+}
+
+// Second part of the top half: decrements the parent's
+// td_incomplete_child_tasks counter, then removes the placeholder child that
+// __kmp_first_top_half_finish_proxy added to the proxy task itself.
+//
+// taskdata: proxy task being finished
+static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
+  kmp_int32 children = 0;
+
+  // Predecrement simulated by "- 1" calculation
+  children =
+      KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
+  // children is only consumed by this debug assertion
+  KMP_DEBUG_ASSERT(children >= 0);
+
+  // Remove the imaginary child; this is what lets the spin loop in
+  // __kmp_bottom_half_finish_proxy proceed
+  KMP_ATOMIC_DEC(&taskdata->td_incomplete_child_tasks);
+}
+
+// Bottom half of finishing a proxy task: waits until the task's
+// td_incomplete_child_tasks counter drops to zero, then releases the task's
+// dependences and frees the task and its ancestors.
+//
+// gtid:  global thread id of the calling thread (indexes __kmp_threads)
+// ptask: proxy task whose completion is being finalized
+static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
+  kmp_info_t *thread = __kmp_threads[gtid];
+
+  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
+  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
+                   1); // top half must run before bottom half
+
+  // We need to wait to make sure the top half is finished
+  // Spinning here should be ok as this should happen quickly
+  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) > 0)
+    ;
+
+  __kmp_release_deps(gtid, taskdata);
+  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
+}
+
+/*!
+@ingroup TASKING
+@param gtid Global Thread ID of encountering thread
+@param ptask Task whose execution is completed
+
+Execute the completion of a proxy task from a thread that is part of the
+team. Run both top halves and the bottom half directly, in that order.
+*/
+void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
+  KMP_DEBUG_ASSERT(ptask != NULL);
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
+  KA_TRACE(
+      10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
+           gtid, taskdata));
+
+  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
+
+  // No re-queueing is needed here, so the three stages run back to back
+  __kmp_first_top_half_finish_proxy(taskdata);
+  __kmp_second_top_half_finish_proxy(taskdata);
+  __kmp_bottom_half_finish_proxy(gtid, ptask);
+
+  KA_TRACE(10,
+           ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
+            gtid, taskdata));
+}
+
+/*!
+@ingroup TASKING
+@param ptask Task whose execution is completed
+
+Execute the completion of a proxy task from a thread that may not belong to
+the team. The first top half runs here, the task is then handed to a team
+thread's deque, and finally the second top half runs here.
+*/
+void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
+  KMP_DEBUG_ASSERT(ptask != NULL);
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
+
+  KA_TRACE(
+      10,
+      ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
+       taskdata));
+
+  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
+
+  __kmp_first_top_half_finish_proxy(taskdata);
+
+  // Enqueue task to complete bottom half completion from a thread within the
+  // corresponding team
+  kmp_team_t *team = taskdata->td_team;
+  kmp_int32 nthreads = team->t.t_nproc;
+  kmp_info_t *thread;
+
+  // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
+  // but we cannot use __kmp_get_random here
+  kmp_int32 start_k = 0;
+  kmp_int32 pass = 1;
+  kmp_int32 k = start_k;
+
+  // Loop until some thread's deque accepts the task; pass doubles after each
+  // full sweep, which raises the deque-growth limit __kmp_give_task applies
+  do {
+    // For now we're just linearly trying to find a thread
+    thread = team->t.t_threads[k];
+    k = (k + 1) % nthreads;
+    // NOTE(review): k has already been advanced here, so the tid handed to
+    // __kmp_give_task below is the successor of the index `thread` was read
+    // from -- confirm this pairing is intended.
+
+    // we did a full pass through all the threads
+    if (k == start_k)
+      pass = pass << 1;
+
+  } while (!__kmp_give_task(thread, k, ptask, pass));
+
+  // Must run after the task has been queued (see the comment block above
+  // __kmp_first_top_half_finish_proxy)
+  __kmp_second_top_half_finish_proxy(taskdata);
+
+  KA_TRACE(
+      10,
+      ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
+       taskdata));
+}
+
+// Return the allow-completion event embedded in the given task, initializing
+// it on first use.
+//
+// loc_ref: source location information (not used here)
+// gtid:    global thread id (not used here)
+// task:    task whose td_allow_completion_event is returned
+// returns: pointer to the task's event object
+kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
+                                                kmp_task_t *task) {
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+  kmp_event_t *event = &taskdata->td_allow_completion_event;
+
+  // Already set up by an earlier call: hand it back untouched
+  if (event->type != KMP_EVENT_UNINITIALIZED)
+    return event;
+
+  // First request: bind the event to its task and prepare its lock
+  event->type = KMP_EVENT_ALLOW_COMPLETION;
+  event->ed.task = task;
+  __kmp_init_tas_lock(&event->lock);
+  return event;
+}
+
+// Fulfill an allow-completion event. The event is reset to
+// KMP_EVENT_UNINITIALIZED; if the associated task has already been turned
+// into a proxy task (i.e. it detached), its completion is carried out here.
+// Events of any other type are ignored.
+//
+// event: event to fulfill
+void __kmp_fulfill_event(kmp_event_t *event) {
+  if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
+    kmp_task_t *ptask = event->ed.task;
+    kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
+    bool detached = false;
+    int gtid = __kmp_get_gtid();
+
+    if (taskdata->td_flags.proxy == TASK_PROXY) {
+      // The associated task code completed before this call and detached.
+      detached = true;
+      event->type = KMP_EVENT_UNINITIALIZED;
+    } else {
+      // The associated task has not completed but could be completing at this
+      // point.
+      // We need to take the lock to avoid races
+      // NOTE(review): gtid is only checked for >= 0 further down; confirm the
+      // TAS lock routines accept the value __kmp_get_gtid() yields on a
+      // thread that is not registered with the runtime.
+      __kmp_acquire_tas_lock(&event->lock, gtid);
+      // Re-read the flag under the lock
+      if (taskdata->td_flags.proxy == TASK_PROXY)
+        detached = true;
+      event->type = KMP_EVENT_UNINITIALIZED;
+      __kmp_release_tas_lock(&event->lock, gtid);
+    }
+
+    if (detached) {
+      // If the task detached complete the proxy task
+      if (gtid >= 0) {
+        kmp_team_t *team = taskdata->td_team;
+        kmp_info_t *thread = __kmp_get_thread();
+        // Caller belongs to the task's team: complete everything in place
+        if (thread->th.th_team == team) {
+          __kmpc_proxy_task_completed(gtid, ptask);
+          return;
+        }
+      }
+
+      // fallback
+      __kmpc_proxy_task_completed_ooo(ptask);
+    }
+  }
+}
+
+// __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
+// for taskloop
+//
+// The copy is a byte-wise duplicate of the source allocation (td_size_alloc
+// bytes); afterwards the task id, shareds pointer, allocating thread, parent
+// and taskgroup are re-initialized and the parent's child counters are bumped.
+//
+// thread:   allocating thread
+// task_src: pointer to source task to be duplicated
+// returns:  a pointer to the allocated kmp_task_t structure (task).
+kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
+  kmp_task_t *task;
+  kmp_taskdata_t *taskdata;
+  kmp_taskdata_t *taskdata_src;
+  kmp_taskdata_t *parent_task = thread->th.th_current_task;
+  size_t shareds_offset;
+  size_t task_size;
+
+  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
+                task_src));
+  taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
+  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
+                   TASK_FULL); // it should not be proxy task
+  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
+  task_size = taskdata_src->td_size_alloc;
+
+  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
+  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
+                task_size));
+#if USE_FAST_MEMORY
+  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
+#else
+  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
+#endif /* USE_FAST_MEMORY */
+  KMP_MEMCPY(taskdata, taskdata_src, task_size);
+
+  task = KMP_TASKDATA_TO_TASK(taskdata);
+
+  // Initialize new task (only specific fields not affected by memcpy)
+  taskdata->td_task_id = KMP_GEN_TASK_ID();
+  if (task->shareds != NULL) { // need setup shareds pointer
+    // The memcpy copied the source's shareds pointer verbatim; rebase it so it
+    // sits at the same byte offset inside the new allocation
+    shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
+    task->shareds = &((char *)taskdata)[shareds_offset];
+    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
+                     0);
+  }
+  taskdata->td_alloc_thread = thread;
+  taskdata->td_parent = parent_task;
+  taskdata->td_taskgroup =
+      parent_task
+          ->td_taskgroup; // task inherits the taskgroup from the parent task
+
+  // Only need to keep track of child task counts if team parallel and tasking
+  // not serialized
+  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
+    KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
+    if (parent_task->td_taskgroup)
+      KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
+    // Only need to keep track of allocated child tasks for explicit tasks since
+    // implicit not deallocated
+    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
+      KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
+  }
+
+  KA_TRACE(20,
+           ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
+            thread, taskdata, taskdata->td_parent));
+#if OMPT_SUPPORT
+  if (UNLIKELY(ompt_enabled.enabled))
+    __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
+#endif
+  return task;
+}
+
+// Routine optionally generated by the compiler for setting the lastprivate flag
+// and calling needed constructors for private/firstprivate objects
+// (used to form taskloop tasks from pattern task)
+// Parameters: dest task, src task, lastprivate flag.
+typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
+
+// Compile-time check that long is either 4 or 8 bytes wide.
+KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
+
+// Accessor for the loop bounds stored inside a taskloop task.
+// It hides the difference between the Intel layout (bounds live at fixed byte
+// offsets inside the task structure) and the GOMP layout (bounds are the first
+// two elements of task->shareds, each td_size_loop_bounds bytes wide).
+class kmp_taskloop_bounds_t {
+  kmp_task_t *task;
+  const kmp_taskdata_t *taskdata;
+  size_t lower_offset; // byte offset of the lower bound from the task pointer
+  size_t upper_offset; // byte offset of the upper bound from the task pointer
+
+public:
+  // Build from a task plus pointers to its lower/upper bound fields; only the
+  // byte offsets of those fields relative to the task are remembered.
+  kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
+      : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
+        lower_offset((char *)lb - (char *)_task),
+        upper_offset((char *)ub - (char *)_task) {
+    KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
+    KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
+  }
+
+  // Build for a different task, reusing the offsets of an existing accessor.
+  kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
+      : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
+        lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
+
+  size_t get_lower_offset() const { return lower_offset; }
+  size_t get_upper_offset() const { return upper_offset; }
+
+  // Read the lower bound (read as a signed value, returned as kmp_uint64).
+  kmp_uint64 get_lb() const {
+#if defined(KMP_GOMP_COMPAT)
+    if (taskdata->td_flags.native) {
+      // GOMP task has to take into account the sizeof(long)
+      if (taskdata->td_size_loop_bounds == 4)
+        return (kmp_int64)(RCAST(kmp_int32 *, task->shareds)[0]);
+      return RCAST(kmp_int64 *, task->shareds)[0];
+    }
+#endif // defined(KMP_GOMP_COMPAT)
+    // Intel task just returns the lower bound normally
+    return *(kmp_int64 *)((char *)task + lower_offset);
+  }
+
+  // Read the upper bound (read as a signed value, returned as kmp_uint64).
+  kmp_uint64 get_ub() const {
+#if defined(KMP_GOMP_COMPAT)
+    if (taskdata->td_flags.native) {
+      // GOMP task has to take into account the sizeof(long)
+      if (taskdata->td_size_loop_bounds == 4)
+        return (kmp_int64)(RCAST(kmp_int32 *, task->shareds)[1]);
+      return RCAST(kmp_int64 *, task->shareds)[1];
+    }
+#endif // defined(KMP_GOMP_COMPAT)
+    // Intel task just returns the upper bound normally
+    return *(kmp_int64 *)((char *)task + upper_offset);
+  }
+
+  // Store the lower bound.
+  void set_lb(kmp_uint64 lb) {
+#if defined(KMP_GOMP_COMPAT)
+    if (taskdata->td_flags.native) {
+      // GOMP task has to take into account the sizeof(long)
+      if (taskdata->td_size_loop_bounds == 4)
+        RCAST(kmp_uint32 *, task->shareds)[0] = (kmp_uint32)lb;
+      else
+        RCAST(kmp_uint64 *, task->shareds)[0] = lb;
+      return;
+    }
+#endif // defined(KMP_GOMP_COMPAT)
+    // Intel task just sets the lower bound normally
+    *(kmp_uint64 *)((char *)task + lower_offset) = lb;
+  }
+
+  // Store the upper bound.
+  void set_ub(kmp_uint64 ub) {
+#if defined(KMP_GOMP_COMPAT)
+    if (taskdata->td_flags.native) {
+      // GOMP task has to take into account the sizeof(long)
+      if (taskdata->td_size_loop_bounds == 4)
+        RCAST(kmp_uint32 *, task->shareds)[1] = (kmp_uint32)ub;
+      else
+        RCAST(kmp_uint64 *, task->shareds)[1] = ub;
+      return;
+    }
+#endif // defined(KMP_GOMP_COMPAT)
+    // Intel task just sets the upper bound normally
+    *(kmp_uint64 *)((char *)task + upper_offset) = ub;
+  }
+};
+
+// __kmp_taskloop_linear: Start tasks of the taskloop linearly
+//
+// Duplicates the pattern task num_tasks times, gives every copy its own
+// [lower, upper] sub-range and schedules it; the pattern task itself is only
+// started and finished for bookkeeping, never executed.
+//
+// loc        Source location information
+// gtid       Global thread ID
+// task       Pattern task, exposes the loop iteration range
+// lb         Pointer to loop lower bound in task structure
+// ub         Pointer to loop upper bound in task structure
+// st         Loop stride
+// ub_glob    Global upper bound (used for lastprivate check)
+// num_tasks  Number of tasks to execute
+// grainsize  Number of loop iterations per task
+// extras     Number of chunks with grainsize+1 iterations
+// tc         Iterations count
+// task_dup   Tasks duplication routine
+// codeptr_ra Return address for OMPT events
+void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
+                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+                           kmp_uint64 ub_glob, kmp_uint64 num_tasks,
+                           kmp_uint64 grainsize, kmp_uint64 extras,
+                           kmp_uint64 tc,
+#if OMPT_SUPPORT
+                           void *codeptr_ra,
+#endif
+                           void *task_dup) {
+  KMP_COUNT_BLOCK(OMP_TASKLOOP);
+  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
+  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
+  // compiler provides global bounds here
+  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
+  kmp_uint64 lower = task_bounds.get_lb();
+  kmp_uint64 upper = task_bounds.get_ub();
+  kmp_uint64 i;
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_taskdata_t *current_task = thread->th.th_current_task;
+  kmp_task_t *next_task;
+  kmp_int32 lastpriv = 0;
+
+  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
+  KMP_DEBUG_ASSERT(num_tasks > extras);
+  KMP_DEBUG_ASSERT(num_tasks > 0);
+  // ub_glob is a 64-bit value, so it is printed with %lld like the other
+  // bounds
+  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
+                "extras %lld, i=%lld,%lld(%lld)%lld, dup %p\n",
+                gtid, num_tasks, grainsize, extras, lower, upper, ub_glob, st,
+                task_dup));
+
+  // Launch num_tasks tasks, assign grainsize iterations each task
+  for (i = 0; i < num_tasks; ++i) {
+    kmp_uint64 chunk_minus_1;
+    if (extras == 0) {
+      chunk_minus_1 = grainsize - 1;
+    } else {
+      chunk_minus_1 = grainsize;
+      --extras; // first extras iterations get bigger chunk (grainsize+1)
+    }
+    upper = lower + st * chunk_minus_1;
+    if (i == num_tasks - 1) {
+      // schedule the last task, set lastprivate flag if needed
+      if (st == 1) { // most common case
+        KMP_DEBUG_ASSERT(upper == *ub);
+        if (upper == ub_glob)
+          lastpriv = 1;
+      } else if (st > 0) { // positive loop stride
+        KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
+        if ((kmp_uint64)st > ub_glob - upper)
+          lastpriv = 1;
+      } else { // negative loop stride
+        KMP_DEBUG_ASSERT(upper + st < *ub);
+        if (upper - ub_glob < (kmp_uint64)(-st))
+          lastpriv = 1;
+      }
+    }
+    next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
+    kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
+    kmp_taskloop_bounds_t next_task_bounds =
+        kmp_taskloop_bounds_t(next_task, task_bounds);
+
+    // adjust task-specific bounds
+    next_task_bounds.set_lb(lower);
+    if (next_taskdata->td_flags.native) {
+      // native (GOMP) tasks get an upper bound one step past the last
+      // iteration in the direction of the stride
+      next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
+    } else {
+      next_task_bounds.set_ub(upper);
+    }
+    if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates, etc.
+      ptask_dup(next_task, task, lastpriv);
+    // NOTE(review): the two offsets are size_t values printed with %p; left
+    // as-is, confirm this is acceptable on all supported targets.
+    KA_TRACE(40,
+             ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
+              "upper %lld stride %lld, (offsets %p %p)\n",
+              gtid, i, next_task, lower, upper, st,
+              next_task_bounds.get_lower_offset(),
+              next_task_bounds.get_upper_offset()));
+#if OMPT_SUPPORT
+    __kmp_omp_taskloop_task(NULL, gtid, next_task,
+                           codeptr_ra); // schedule new task
+#else
+    __kmp_omp_task(gtid, next_task, true); // schedule new task
+#endif
+    lower = upper + st; // adjust lower bound for the next iteration
+  }
+  // free the pattern task and exit
+  __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
+  // do not execute the pattern task, just do internal bookkeeping
+  __kmp_task_finish<false>(gtid, task, current_task);
+}
+
+// Structure to keep taskloop parameters for auxiliary task
+// kept in the shareds of the task structure.
+// __kmp_taskloop_recur fills it in and __kmp_taskloop_task reads it back; the
+// fields mirror the parameters of __kmp_taskloop_recur.
+typedef struct __taskloop_params {
+  kmp_task_t *task; // pattern task for the sub-range
+  kmp_uint64 *lb; // pointer to the lower bound inside that pattern task
+  kmp_uint64 *ub; // pointer to the upper bound inside that pattern task
+  void *task_dup; // tasks duplication routine
+  kmp_int64 st; // loop stride
+  kmp_uint64 ub_glob; // global upper bound (used for lastprivate check)
+  kmp_uint64 num_tasks; // number of tasks to execute for the sub-range
+  kmp_uint64 grainsize; // number of loop iterations per task
+  kmp_uint64 extras; // number of chunks with grainsize+1 iterations
+  kmp_uint64 tc; // iterations count of the sub-range
+  kmp_uint64 num_t_min; // threshold to launch tasks recursively
+#if OMPT_SUPPORT
+  void *codeptr_ra; // return address for OMPT events
+#endif
+} __taskloop_params_t;
+
+// Forward declaration: __kmp_taskloop_task calls __kmp_taskloop_recur, which
+// is defined after it.
+void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
+                          kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
+                          kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64,
+#if OMPT_SUPPORT
+                          void *,
+#endif
+                          void *);
+
+// Execute part of the taskloop submitted as a task.
+//
+// Entry routine of the auxiliary task created by __kmp_taskloop_recur. It
+// unpacks the __taskloop_params_t stored in the task's shareds and continues
+// with the recursive split while num_tasks exceeds num_t_min, otherwise with
+// the linear launch.
+//
+// gtid:    global thread ID of the executing thread
+// ptask:   the auxiliary kmp_task_t whose shareds hold the parameters
+// returns: always 0
+int __kmp_taskloop_task(int gtid, void *ptask) {
+  __taskloop_params_t *p =
+      (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
+  kmp_task_t *task = p->task;
+  kmp_uint64 *lb = p->lb;
+  kmp_uint64 *ub = p->ub;
+  void *task_dup = p->task_dup;
+  kmp_int64 st = p->st;
+  kmp_uint64 ub_glob = p->ub_glob;
+  kmp_uint64 num_tasks = p->num_tasks;
+  kmp_uint64 grainsize = p->grainsize;
+  kmp_uint64 extras = p->extras;
+  kmp_uint64 tc = p->tc;
+  kmp_uint64 num_t_min = p->num_t_min;
+#if OMPT_SUPPORT
+  void *codeptr_ra = p->codeptr_ra;
+#endif
+#if KMP_DEBUG
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+  KMP_DEBUG_ASSERT(task != NULL);
+  // st is a 64-bit value, so it is printed with %lld like the bounds
+  KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
+                " %lld, extras %lld, i=%lld,%lld(%lld), dup %p\n",
+                gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
+                task_dup));
+#endif
+  KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
+  if (num_tasks > num_t_min)
+    __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
+                         grainsize, extras, tc, num_t_min,
+#if OMPT_SUPPORT
+                         codeptr_ra,
+#endif
+                         task_dup);
+  else
+    __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
+                          grainsize, extras, tc,
+#if OMPT_SUPPORT
+                          codeptr_ra,
+#endif
+                          task_dup);
+
+  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
+  return 0;
+}
+
+// Schedule part of the taskloop as a task,
+// execute the rest of the taskloop.
+//
+// The range is split in two: the second half (n_tsk1 tasks) is packaged into
+// an auxiliary task running __kmp_taskloop_task, while the first half (n_tsk0
+// tasks) is processed by this call, recursively or linearly.
+//
+// loc        Source location information
+// gtid       Global thread ID
+// task       Pattern task, exposes the loop iteration range
+// lb         Pointer to loop lower bound in task structure
+// ub         Pointer to loop upper bound in task structure
+// st         Loop stride
+// ub_glob    Global upper bound (used for lastprivate check)
+// num_tasks  Number of tasks to execute
+// grainsize  Number of loop iterations per task
+// extras     Number of chunks with grainsize+1 iterations
+// tc         Iterations count
+// num_t_min  Threshold to launch tasks recursively
+// task_dup   Tasks duplication routine
+// codeptr_ra Return address for OMPT events
+void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
+                          kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+                          kmp_uint64 ub_glob, kmp_uint64 num_tasks,
+                          kmp_uint64 grainsize, kmp_uint64 extras,
+                          kmp_uint64 tc, kmp_uint64 num_t_min,
+#if OMPT_SUPPORT
+                          void *codeptr_ra,
+#endif
+                          void *task_dup) {
+#if KMP_DEBUG
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+  KMP_DEBUG_ASSERT(task != NULL);
+  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
+  // st is a 64-bit value, so it is printed with %lld like the bounds
+  KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
+                " %lld, extras %lld, i=%lld,%lld(%lld), dup %p\n",
+                gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
+                task_dup));
+#endif
+  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
+  kmp_uint64 lower = *lb;
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_task_t *next_task;
+  size_t lower_offset =
+      (char *)lb - (char *)task; // remember offset of lb in the task structure
+  size_t upper_offset =
+      (char *)ub - (char *)task; // remember offset of ub in the task structure
+
+  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
+  KMP_DEBUG_ASSERT(num_tasks > extras);
+  KMP_DEBUG_ASSERT(num_tasks > 0);
+
+  // split the loop in two halves
+  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
+  kmp_uint64 gr_size0 = grainsize;
+  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
+  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
+  if (n_tsk0 <= extras) {
+    gr_size0++; // integrate extras into grainsize
+    ext0 = 0; // no extra iters in 1st half
+    ext1 = extras - n_tsk0; // remaining extras
+    tc0 = gr_size0 * n_tsk0;
+    tc1 = tc - tc0;
+  } else { // n_tsk0 > extras
+    ext1 = 0; // no extra iters in 2nd half
+    ext0 = extras;
+    tc1 = grainsize * n_tsk1;
+    tc0 = tc - tc1;
+  }
+  ub0 = lower + st * (tc0 - 1); // last iteration of the 1st half
+  lb1 = ub0 + st; // first iteration of the 2nd half
+
+  // create pattern task for 2nd half of the loop
+  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
+  // adjust lower bound (upper bound is not changed) for the 2nd half
+  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
+  if (ptask_dup != NULL) // construct firstprivates, etc.
+    ptask_dup(next_task, task, 0);
+  *ub = ub0; // adjust upper bound for the 1st half
+
+  // create auxiliary task for 2nd half of the loop
+  kmp_task_t *new_task =
+      __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
+                            sizeof(__taskloop_params_t), &__kmp_taskloop_task);
+  __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
+  p->task = next_task;
+  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
+  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
+  p->task_dup = task_dup;
+  p->st = st;
+  p->ub_glob = ub_glob;
+  p->num_tasks = n_tsk1;
+  p->grainsize = grainsize;
+  p->extras = ext1;
+  p->tc = tc1;
+  p->num_t_min = num_t_min;
+#if OMPT_SUPPORT
+  p->codeptr_ra = codeptr_ra;
+#endif
+
+#if OMPT_SUPPORT
+  // schedule new task with correct return address for OMPT events
+  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
+#else
+  __kmp_omp_task(gtid, new_task, true); // schedule new task
+#endif
+
+  // execute the 1st half of current subrange
+  if (n_tsk0 > num_t_min)
+    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
+                         ext0, tc0, num_t_min,
+#if OMPT_SUPPORT
+                         codeptr_ra,
+#endif
+                         task_dup);
+  else
+    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
+                          gr_size0, ext0, tc0,
+#if OMPT_SUPPORT
+                          codeptr_ra,
+#endif
+                          task_dup);
+
+  KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
+}
+
+/*!
+@ingroup TASKING
+@param loc       Source location information
+@param gtid      Global thread ID
+@param task      Task structure
+@param if_val    Value of the if clause
+@param lb        Pointer to loop lower bound in task structure
+@param ub        Pointer to loop upper bound in task structure
+@param st        Loop stride
+@param nogroup   Flag, 1 if no taskgroup needs to be added, 0 otherwise
+@param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
+@param grainsize Schedule value if specified
+@param task_dup  Tasks duplication routine
+
+Execute the taskloop construct. When nogroup is 0 the construct is wrapped in
+a taskgroup that is closed on every exit path, the zero-trip one included.
+*/
+void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
+                     kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
+                     int sched, kmp_uint64 grainsize, void *task_dup) {
+  KMP_DEBUG_ASSERT(task != NULL); // validate before deriving taskdata from it
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+
+  if (nogroup == 0) {
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+    __kmpc_taskgroup(loc, gtid);
+  }
+
+  // calculate loop parameters; the compiler provides global bounds here
+  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
+  kmp_uint64 lower = task_bounds.get_lb();
+  kmp_uint64 upper = task_bounds.get_ub();
+  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
+  kmp_uint64 tc, num_tasks = 0, extras = 0;
+  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_taskdata_t *current_task = thread->th.th_current_task;
+
+  KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
+                "grain %llu(%d), dup %p\n",
+                gtid, taskdata, lower, upper, st, grainsize, sched, task_dup));
+
+  // compute trip count
+  if (st == 1) { // most common case
+    tc = upper - lower + 1;
+  } else if (st < 0) {
+    tc = (lower - upper) / (-st) + 1;
+  } else { // st > 0
+    tc = (upper - lower) / st + 1;
+  }
+  if (tc == 0) {
+    KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid));
+    // free the pattern task and exit
+    __kmp_task_start(gtid, task, current_task);
+    // do not execute anything for zero-trip loop
+    __kmp_task_finish<false>(gtid, task, current_task);
+    if (nogroup == 0) { // balance the __kmpc_taskgroup() issued on entry
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+      OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+      __kmpc_end_taskgroup(loc, gtid);
+    }
+    return;
+  }
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
+  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
+  if (ompt_enabled.ompt_callback_work) {
+    ompt_callbacks.ompt_callback(ompt_callback_work)(
+        ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
+        &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
+  }
+#endif
+
+  if (num_tasks_min == 0)
+    // TODO: can we choose better default heuristic?
+    num_tasks_min =
+        KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
+
+  // compute num_tasks/grainsize based on the input provided
+  switch (sched) {
+  case 0: // no schedule clause specified, we can choose the default
+    // let's try to schedule (team_size*10) tasks
+    grainsize = thread->th.th_team_nproc * 10;
+    KMP_FALLTHROUGH();
+  case 2: // num_tasks provided
+    if (grainsize > tc) {
+      num_tasks = tc; // too big num_tasks requested, adjust values
+      grainsize = 1;
+      extras = 0;
+    } else {
+      num_tasks = grainsize;
+      grainsize = tc / num_tasks;
+      extras = tc % num_tasks;
+    }
+    break;
+  case 1: // grainsize provided
+    if (grainsize > tc) {
+      num_tasks = 1; // too big grainsize requested, adjust values
+      grainsize = tc;
+      extras = 0;
+    } else {
+      num_tasks = tc / grainsize;
+      // adjust grainsize for balanced distribution of iterations
+      grainsize = tc / num_tasks;
+      extras = tc % num_tasks;
+    }
+    break;
+  default:
+    KMP_ASSERT2(0, "unknown scheduling of taskloop");
+  }
+  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
+  KMP_DEBUG_ASSERT(num_tasks > extras);
+  KMP_DEBUG_ASSERT(num_tasks > 0);
+
+  // check if clause value first; GOMP_taskloop (td_flags.native) stays linear
+  if (if_val == 0) { // if(0) specified, mark task as serial
+    taskdata->td_flags.task_serial = 1;
+    taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
+    // always start serial tasks linearly
+    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
+                          grainsize, extras, tc,
+#if OMPT_SUPPORT
+                          OMPT_GET_RETURN_ADDRESS(0),
+#endif
+                          task_dup);
+  } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
+    KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
+                  "(%lld), grain %llu, extras %llu\n",
+                  gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
+    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
+                         grainsize, extras, tc, num_tasks_min,
+#if OMPT_SUPPORT
+                         OMPT_GET_RETURN_ADDRESS(0),
+#endif
+                         task_dup);
+  } else {
+    KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
+                  "(%lld), grain %llu, extras %llu\n",
+                  gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
+    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
+                          grainsize, extras, tc,
+#if OMPT_SUPPORT
+                          OMPT_GET_RETURN_ADDRESS(0),
+#endif
+                          task_dup);
+  }
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  if (ompt_enabled.ompt_callback_work) {
+    ompt_callbacks.ompt_callback(ompt_callback_work)(
+        ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
+        &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
+  }
+#endif
+
+  if (nogroup == 0) {
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    OMPT_STORE_RETURN_ADDRESS(gtid);
+#endif
+    __kmpc_end_taskgroup(loc, gtid);
+  }
+  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
+}
diff --git a/final/runtime/src/kmp_taskq.cpp b/final/runtime/src/kmp_taskq.cpp
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/final/runtime/src/kmp_taskq.cpp
diff --git a/final/runtime/src/kmp_threadprivate.cpp b/final/runtime/src/kmp_threadprivate.cpp
new file mode 100644
index 0000000..87bfff3
--- /dev/null
+++ b/final/runtime/src/kmp_threadprivate.cpp
@@ -0,0 +1,799 @@
+/*
+ * kmp_threadprivate.cpp -- OpenMP threadprivate support library
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+#include "kmp_i18n.h"
+#include "kmp_itt.h"
+
+#define USE_CHECKS_COMMON // turns on the #ifdef USE_CHECKS_COMMON sanity checks
+
+#define KMP_INLINE_SUBR 1 // when defined, the lookup helpers are __forceinline
+// Forward declarations of the two insert routines defined later in this file.
+void kmp_threadprivate_insert_private_data(int gtid, void *pc_addr,
+                                           void *data_addr, size_t pc_size);
+struct private_common *kmp_threadprivate_insert(int gtid, void *pc_addr,
+                                                void *data_addr,
+                                                size_t pc_size);
+// Shared table of struct shared_common descriptors, bucketed via KMP_HASH().
+struct shared_table __kmp_threadprivate_d_table;
+
+static
+#ifdef KMP_INLINE_SUBR
+    __forceinline
+#endif
+    struct private_common *
+    __kmp_threadprivate_find_task_common(struct common_table *tbl, int gtid,
+                                         void *pc_addr) {
+  // Look up pc_addr in one thread's private table: hash to a bucket, then walk
+  // the chain comparing gbl_addr. gtid is only used for trace output. Returns
+  // the matching node, or 0 when the address has not been inserted.
+#ifdef KMP_TASK_COMMON_DEBUG
+  KC_TRACE(10, ("__kmp_threadprivate_find_task_common: thread#%d, called with "
+                "address %p\n",
+                gtid, pc_addr));
+  dump_list();
+#endif
+  struct private_common *node = tbl->data[KMP_HASH(pc_addr)];
+  while (node != 0) {
+    if (node->gbl_addr == pc_addr) {
+#ifdef KMP_TASK_COMMON_DEBUG
+      KC_TRACE(10, ("__kmp_threadprivate_find_task_common: thread#%d, found "
+                    "node %p on list\n",
+                    gtid, pc_addr));
+#endif
+      return node;
+    }
+    node = node->next;
+  }
+  return 0;
+}
+
+static
+#ifdef KMP_INLINE_SUBR
+    __forceinline
+#endif
+    struct shared_common *
+    __kmp_find_shared_task_common(struct shared_table *tbl, int gtid,
+                                  void *pc_addr) {
+  // Shared-table lookup: scan the bucket chosen by KMP_HASH(pc_addr) for the
+  // descriptor whose gbl_addr equals pc_addr; 0 means "not registered".
+  struct shared_common *entry = tbl->data[KMP_HASH(pc_addr)];
+  for (; entry != 0; entry = entry->next) {
+    if (entry->gbl_addr != pc_addr)
+      continue;
+#ifdef KMP_TASK_COMMON_DEBUG
+    KC_TRACE(10, ("__kmp_find_shared_task_common: thread#%d, found node %p on "
+                  "list\n",
+                  gtid, pc_addr));
+#endif
+    return entry;
+  }
+  return 0;
+}
+
+// Build the initialization template for a threadprivate variable of pc_size
+// bytes located at pc_addr. The returned descriptor records the size and a
+// repeat count of 1; its data pointer stays NULL (meaning "zero fill") unless
+// the source holds at least one non-zero byte, in which case it owns a copy.
+static struct private_data *__kmp_init_common_data(void *pc_addr,
+                                                   size_t pc_size) {
+  // __kmp_allocate zeroes the memory, so the data and next members need no
+  // explicit reset here.
+  struct private_data *tmpl =
+      (struct private_data *)__kmp_allocate(sizeof(struct private_data));
+  // The template describes a single repetition of pc_size bytes.
+  tmpl->size = pc_size;
+  tmpl->more = 1;
+
+  // Skip over the leading run of zero bytes in the source image.
+  const char *bytes = (const char *)pc_addr;
+  size_t idx = 0;
+  while (idx < pc_size && bytes[idx] == '\0')
+    ++idx;
+
+  // Stopping short of the end means a non-zero byte exists, so the template
+  // must carry a full copy of the original data.
+  if (idx < pc_size) {
+    tmpl->data = __kmp_allocate(pc_size);
+    KMP_MEMCPY(tmpl->data, pc_addr, pc_size);
+  }
+
+  return tmpl;
+}
+
+// Initialize the data area from the template chain (NULL data => zero fill).
+static void __kmp_copy_common_data(void *pc_addr, struct private_data *d) {
+  char *addr = (char *)pc_addr;
+  size_t offset = 0; // size_t, not int: it accumulates size_t-valued d->size
+
+  for (; d != 0; d = d->next) {
+    for (int i = d->more; i > 0; --i) { // replay this node d->more times
+      if (d->data == 0)
+        memset(&addr[offset], '\0', d->size);
+      else
+        KMP_MEMCPY(&addr[offset], d->data, d->size);
+      offset += d->size;
+    }
+  }
+}
+
+/* One-time setup of the threadprivate bookkeeping. Called from
+   __kmp_serial_initialize() with __kmp_initz_lock held; a repeated call is a
+   no-op because __kmp_init_common is already set. */
+void __kmp_common_initialize(void) {
+  if (TCR_4(__kmp_init_common))
+    return; /* already initialized */
+
+  /* reset the list of threadprivate caches */
+  __kmp_threadpriv_cache_list = NULL;
+
+#ifdef KMP_DEBUG
+  /* verify that every existing root has an uber thread whose private table
+     is still completely empty */
+  for (int gtid = 0; gtid < __kmp_threads_capacity; gtid++) {
+    if (!__kmp_root[gtid])
+      continue;
+    KMP_DEBUG_ASSERT(__kmp_root[gtid]->r.r_uber_thread);
+    for (int slot = 0; slot < KMP_HASH_TABLE_SIZE; ++slot)
+      KMP_DEBUG_ASSERT(
+          !__kmp_root[gtid]->r.r_uber_thread->th.th_pri_common->data[slot]);
+  }
+#endif /* KMP_DEBUG */
+
+  /* start with an empty shared descriptor table */
+  for (int slot = 0; slot < KMP_HASH_TABLE_SIZE; ++slot)
+    __kmp_threadprivate_d_table.data[slot] = 0;
+
+  TCW_4(__kmp_init_common, TRUE);
+}
+
+/* Call the registered destructor of every threadprivate variable on each
+   thread's private copy and then on the shared prototype object, emptying the
+   shared descriptor table bucket by bucket. Does nothing unless the
+   threadprivate layer is currently marked initialized.
+   Currently unused! */
+void __kmp_common_destroy(void) {
+  if (!TCR_4(__kmp_init_common))
+    return;
+
+  /* Flag the layer as torn down before any destructor runs. */
+  TCW_4(__kmp_init_common, FALSE);
+
+  for (int bucket = 0; bucket < KMP_HASH_TABLE_SIZE; ++bucket) {
+    struct shared_common *desc;
+
+    /* C++ destructors need to be called once per thread before exiting.
+       Threads that use the original object in place (the initial thread when
+       __kmp_foreign_tp is set, uber threads otherwise) are skipped. */
+    for (desc = __kmp_threadprivate_d_table.data[bucket]; desc;
+         desc = desc->next) {
+      /* Descriptors registered without a destructor need no work at all. */
+      if (desc->is_vec ? (desc->dt.dtorv == 0) : (desc->dt.dtor == 0))
+        continue;
+
+      /* First the per-thread copies; the vector flavour also gets vec_len. */
+      for (int gtid = 0; gtid < __kmp_all_nth; ++gtid) {
+        struct private_common *copy;
+
+        if (!__kmp_threads[gtid])
+          continue; /* unused thread slot */
+
+        if ((__kmp_foreign_tp) ? (KMP_INITIAL_GTID(gtid))
+                               : (KMP_UBER_GTID(gtid)))
+          continue; /* works on the original object, not on a copy */
+
+        copy = __kmp_threadprivate_find_task_common(
+            __kmp_threads[gtid]->th.th_pri_common, gtid, desc->gbl_addr);
+        if (!copy)
+          continue; /* this thread has no node for the variable */
+
+        if (desc->is_vec) {
+          (*desc->dt.dtorv)(copy->par_addr, desc->vec_len);
+        } else {
+          (*desc->dt.dtor)(copy->par_addr);
+        }
+      }
+
+      /* Then the prototype object, if one was ever built (obj_init is only
+         set when a copy constructor was registered). */
+      if (desc->obj_init != 0) {
+        if (desc->is_vec) {
+          (*desc->dt.dtorv)(desc->obj_init, desc->vec_len);
+        } else {
+          (*desc->dt.dtor)(desc->obj_init);
+        }
+      }
+    }
+
+    /* Detach the whole chain from the table; the descriptors themselves are
+       not freed here. */
+    __kmp_threadprivate_d_table.data[bucket] = 0;
+  }
+}
+
+/* Call all destructors for threadprivate data belonging to this thread; a
+   missing descriptor or destructor is skipped rather than dereferenced. */
+void __kmp_common_destroy_gtid(int gtid) {
+  struct private_common *tn;
+  struct shared_common *d_tn;
+
+  if (!TCR_4(__kmp_init_gtid)) {
+    // This is possible when one of multiple roots initiates early library
+    // termination in a sequential region while other teams are active, and its
+    // child threads are about to end.
+    return;
+  }
+
+  KC_TRACE(10, ("__kmp_common_destroy_gtid: T#%d called\n", gtid));
+  if ((__kmp_foreign_tp) ? (!KMP_INITIAL_GTID(gtid)) : (!KMP_UBER_GTID(gtid))) {
+    if (TCR_4(__kmp_init_common)) {
+      /* __kmp_init_common cannot be cleared here since not all threads have
+         destroyed their data yet. */
+      for (tn = __kmp_threads[gtid]->th.th_pri_head; tn; tn = tn->link) {
+        d_tn = __kmp_find_shared_task_common(&__kmp_threadprivate_d_table, gtid,
+                                             tn->gbl_addr);
+        KMP_DEBUG_ASSERT(d_tn);
+        if (d_tn == NULL)
+          continue; // release builds: skip instead of dereferencing NULL
+
+        // The prototype (obj_init) is only destroyed when a destructor was
+        // registered; calling through a null dtor pointer would crash.
+        if (d_tn->is_vec) {
+          if (d_tn->dt.dtorv != 0) {
+            (void)(*d_tn->dt.dtorv)(tn->par_addr, d_tn->vec_len);
+            if (d_tn->obj_init != 0) {
+              (void)(*d_tn->dt.dtorv)(d_tn->obj_init, d_tn->vec_len);
+            }
+          }
+        } else {
+          if (d_tn->dt.dtor != 0) {
+            (void)(*d_tn->dt.dtor)(tn->par_addr);
+            if (d_tn->obj_init != 0) {
+              (void)(*d_tn->dt.dtor)(d_tn->obj_init);
+            }
+          }
+        }
+      }
+      KC_TRACE(30, ("__kmp_common_destroy_gtid: T#%d threadprivate destructors "
+                    "complete\n",
+                    gtid));
+    }
+  }
+}
+
+#ifdef KMP_TASK_COMMON_DEBUG
+// Debug aid: trace, for every live thread, each non-empty bucket of its
+// private table as pairs of original (serial) and per-thread addresses.
+static void dump_list(void) {
+  for (int gtid = 0; gtid < __kmp_all_nth; ++gtid) {
+    if (!__kmp_threads[gtid])
+      continue;
+    struct common_table *table = __kmp_threads[gtid]->th.th_pri_common;
+    for (int slot = 0; slot < KMP_HASH_TABLE_SIZE; ++slot) {
+      struct private_common *node = table->data[slot];
+      if (!node)
+        continue; // empty bucket
+
+      KC_TRACE(10, ("\tdump_list: gtid:%d addresses\n", gtid));
+      while (node) {
+        KC_TRACE(10,
+                 ("\tdump_list: THREADPRIVATE: Serial %p -> Parallel %p\n",
+                  node->gbl_addr, node->par_addr));
+        node = node->next;
+      }
+    }
+  }
+}
+#endif /* KMP_TASK_COMMON_DEBUG */
+
+// Record the POD initialization template for the threadprivate variable at
+// pc_addr (snapshotted from pc_size bytes at data_addr) in the shared
+// descriptor table, unless that address already has a descriptor. gtid is
+// used for the debug check, tracing in the lookup, and the global lock.
+// NOTE: this routine is to be called only from the serial part of the program.
+void kmp_threadprivate_insert_private_data(int gtid, void *pc_addr,
+                                           void *data_addr, size_t pc_size) {
+  // Debug builds verify that the thread exists and that its root's r_active
+  // field is 0.
+  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
+                   __kmp_threads[gtid]->th.th_root->r.r_active == 0);
+
+  if (__kmp_find_shared_task_common(&__kmp_threadprivate_d_table, gtid,
+                                    pc_addr) != 0)
+    return; // already registered, nothing to do
+
+  // __kmp_allocate zeroes the memory, so obj_init, the ctor/cctor/dtor
+  // pointers, is_vec and vec_len need no explicit initialization.
+  struct shared_common *desc =
+      (struct shared_common *)__kmp_allocate(sizeof(struct shared_common));
+
+  desc->gbl_addr = pc_addr;
+  desc->pod_init = __kmp_init_common_data(data_addr, pc_size);
+  desc->cmn_size = pc_size;
+
+  // Push the new descriptor onto the front of its hash chain; only the list
+  // splice itself runs under __kmp_global_lock.
+  __kmp_acquire_lock(&__kmp_global_lock, gtid);
+
+  struct shared_common **bucket =
+      &(__kmp_threadprivate_d_table.data[KMP_HASH(pc_addr)]);
+  desc->next = *bucket;
+  *bucket = desc;
+
+  __kmp_release_lock(&__kmp_global_lock, gtid);
+}
+
+/* Create the private_common node that describes thread gtid's view of the
+   threadprivate variable at pc_addr, creating or completing the shared
+   descriptor on the way. The node is linked into the thread's hash table and
+   its th_pri_head list. Threads that use the original object in place get
+   par_addr == pc_addr; every other thread gets freshly allocated storage that
+   is initialized with the registered ctor, the registered cctor, or a copy of
+   the POD template. Returns the new node. */
+struct private_common *kmp_threadprivate_insert(int gtid, void *pc_addr,
+                                                void *data_addr,
+                                                size_t pc_size) {
+  struct private_common *tn, **tt;
+  struct shared_common *d_tn;
+
+  /* +++++++++ START OF CRITICAL SECTION +++++++++ */
+  __kmp_acquire_lock(&__kmp_global_lock, gtid);
+
+  tn = (struct private_common *)__kmp_allocate(sizeof(struct private_common));
+
+  tn->gbl_addr = pc_addr;
+
+  d_tn = __kmp_find_shared_task_common(
+      &__kmp_threadprivate_d_table, gtid,
+      pc_addr); /* Only the MASTER data table exists. */
+
+  if (d_tn != 0) {
+    /* This threadprivate variable has already been seen. */
+
+    if (d_tn->pod_init == 0 && d_tn->obj_init == 0) {
+      d_tn->cmn_size = pc_size;
+
+      if (d_tn->is_vec) {
+        if (d_tn->ct.ctorv != 0) {
+          /* Construct from scratch so no prototype exists */
+          d_tn->obj_init = 0;
+        } else if (d_tn->cct.cctorv != 0) {
+          /* Now data initialize the prototype since it was previously
+           * registered */
+          d_tn->obj_init = (void *)__kmp_allocate(d_tn->cmn_size);
+          (void)(*d_tn->cct.cctorv)(d_tn->obj_init, pc_addr, d_tn->vec_len);
+        } else {
+          d_tn->pod_init = __kmp_init_common_data(data_addr, d_tn->cmn_size);
+        }
+      } else {
+        if (d_tn->ct.ctor != 0) {
+          /* Construct from scratch so no prototype exists */
+          d_tn->obj_init = 0;
+        } else if (d_tn->cct.cctor != 0) {
+          /* Now data initialize the prototype since it was previously
+             registered */
+          d_tn->obj_init = (void *)__kmp_allocate(d_tn->cmn_size);
+          (void)(*d_tn->cct.cctor)(d_tn->obj_init, pc_addr);
+        } else {
+          d_tn->pod_init = __kmp_init_common_data(data_addr, d_tn->cmn_size);
+        }
+      }
+    }
+  } else {
+    struct shared_common **lnk_tn;
+
+    d_tn = (struct shared_common *)__kmp_allocate(sizeof(struct shared_common));
+    d_tn->gbl_addr = pc_addr;
+    d_tn->cmn_size = pc_size;
+    d_tn->pod_init = __kmp_init_common_data(data_addr, pc_size);
+    /* obj_init, ct, cct, dt, is_vec and vec_len are left alone: __kmp_allocate
+       zeroes the memory. */
+    lnk_tn = &(__kmp_threadprivate_d_table.data[KMP_HASH(pc_addr)]);
+
+    d_tn->next = *lnk_tn;
+    *lnk_tn = d_tn;
+  }
+
+  tn->cmn_size = d_tn->cmn_size;
+
+  if ((__kmp_foreign_tp) ? (KMP_INITIAL_GTID(gtid)) : (KMP_UBER_GTID(gtid))) {
+    tn->par_addr = (void *)pc_addr;
+  } else {
+    tn->par_addr = (void *)__kmp_allocate(tn->cmn_size);
+  }
+
+  __kmp_release_lock(&__kmp_global_lock, gtid);
+/* +++++++++ END OF CRITICAL SECTION +++++++++ */
+
+#ifdef USE_CHECKS_COMMON
+  if (pc_size > d_tn->cmn_size) {
+    KC_TRACE(
+        10, ("__kmp_threadprivate_insert: THREADPRIVATE: %p (%" KMP_UINTPTR_SPEC
+             " ,%" KMP_UINTPTR_SPEC ")\n",
+             pc_addr, pc_size, d_tn->cmn_size));
+    KMP_FATAL(TPCommonBlocksInconsist);
+  }
+#endif /* USE_CHECKS_COMMON */
+
+  tt = &(__kmp_threads[gtid]->th.th_pri_common->data[KMP_HASH(pc_addr)]);
+
+#ifdef KMP_TASK_COMMON_DEBUG
+  if (*tt != 0) {
+    KC_TRACE(
+        10,
+        ("__kmp_threadprivate_insert: WARNING! thread#%d: collision on %p\n",
+         gtid, pc_addr));
+  }
+#endif
+  tn->next = *tt;
+  *tt = tn;
+
+#ifdef KMP_TASK_COMMON_DEBUG
+  KC_TRACE(10,
+           ("__kmp_threadprivate_insert: thread#%d, inserted node %p on list\n",
+            gtid, pc_addr));
+  dump_list();
+#endif
+
+  /* Link the node into a simple list */
+
+  tn->link = __kmp_threads[gtid]->th.th_pri_head;
+  __kmp_threads[gtid]->th.th_pri_head = tn;
+
+  if ((__kmp_foreign_tp) ? (KMP_INITIAL_GTID(gtid)) : (KMP_UBER_GTID(gtid)))
+    return tn;
+
+  /* Initialize the freshly allocated private copy. In the order tested below:
+   * a registered constructor is run on it;
+   * else a registered copy constructor copies from the obj_init prototype;
+   * else the POD template (pod_init) is replayed into it.
+   *
+   * Threads that returned above use the original object and therefore never
+   * get here, so no constructor is run for them. */
+
+  /* C++ object with constructors/destructors; don't call constructors for
+     master thread though */
+  if (d_tn->is_vec) {
+    if (d_tn->ct.ctorv != 0) {
+      (void)(*d_tn->ct.ctorv)(tn->par_addr, d_tn->vec_len);
+    } else if (d_tn->cct.cctorv != 0) {
+      (void)(*d_tn->cct.cctorv)(tn->par_addr, d_tn->obj_init, d_tn->vec_len);
+    } else if (tn->par_addr != tn->gbl_addr) {
+      __kmp_copy_common_data(tn->par_addr, d_tn->pod_init);
+    }
+  } else {
+    if (d_tn->ct.ctor != 0) {
+      (void)(*d_tn->ct.ctor)(tn->par_addr);
+    } else if (d_tn->cct.cctor != 0) {
+      (void)(*d_tn->cct.cctor)(tn->par_addr, d_tn->obj_init);
+    } else if (tn->par_addr != tn->gbl_addr) {
+      __kmp_copy_common_data(tn->par_addr, d_tn->pod_init);
+    }
+  }
+  /* !BUILD_OPENMP_C
+      if (tn->par_addr != tn->gbl_addr)
+          __kmp_copy_common_data( tn->par_addr, d_tn->pod_init ); */
+
+  return tn;
+}
+
+/* ------------------------------------------------------------------------ */
+/* We are currently parallel, and we know the thread id.                    */
+/* ------------------------------------------------------------------------ */
+
+/*!
+ @ingroup THREADPRIVATE
+
+ @param loc source location information (not used by this routine)
+ @param data  pointer to data being privatized
+ @param ctor  pointer to constructor function for data
+ @param cctor  pointer to copy constructor function for data
+ @param dtor  pointer to destructor function for data
+
+ Register constructors and destructors for thread private data.
+ The first registration for a given data address creates its shared
+ descriptor; later registrations for the same address are ignored.
+*/
+void __kmpc_threadprivate_register(ident_t *loc, void *data, kmpc_ctor ctor,
+                                   kmpc_cctor cctor, kmpc_dtor dtor) {
+  KC_TRACE(10, ("__kmpc_threadprivate_register: called\n"));
+
+#ifdef USE_CHECKS_COMMON
+  /* copy constructor must be zero for current code gen (Nov 2002 - jph) */
+  KMP_ASSERT(cctor == 0);
+#endif /* USE_CHECKS_COMMON */
+
+  /* Only the global data table exists. The gtid argument of the lookup is
+     passed as -1 because no thread id is available in this routine. */
+  if (__kmp_find_shared_task_common(&__kmp_threadprivate_d_table, -1, data)) {
+    return; /* already registered */
+  }
+
+  /* __kmp_allocate zeroes the memory, so is_vec, vec_len, obj_init and
+     pod_init start out cleared without explicit stores. */
+  struct shared_common *desc =
+      (struct shared_common *)__kmp_allocate(sizeof(struct shared_common));
+  desc->gbl_addr = data;
+
+  desc->ct.ctor = ctor;
+  desc->cct.cctor = cctor;
+  desc->dt.dtor = dtor;
+
+  /* Link the descriptor in at the head of the hash chain selected by the
+     data address. */
+  struct shared_common **bucket =
+      &(__kmp_threadprivate_d_table.data[KMP_HASH(data)]);
+
+  desc->next = *bucket;
+  *bucket = desc;
+}
+
+// Return the address thread global_tid must use for threadprivate variable
+// data. When its root's r_active flag is clear and __kmp_foreign_tp is off the
+// original address is returned (after recording the POD template); otherwise
+// the thread's private copy is looked up, and created on first use.
+void *__kmpc_threadprivate(ident_t *loc, kmp_int32 global_tid, void *data,
+                           size_t size) {
+  void *result;
+
+  KC_TRACE(10, ("__kmpc_threadprivate: T#%d called\n", global_tid));
+
+#ifdef USE_CHECKS_COMMON
+  if (!__kmp_init_serial)
+    KMP_FATAL(RTLNotInitialized);
+#endif /* USE_CHECKS_COMMON */
+
+  if (__kmp_threads[global_tid]->th.th_root->r.r_active || __kmp_foreign_tp) {
+    KC_TRACE(
+        50,
+        ("__kmpc_threadprivate: T#%d try to find private data at address %p\n",
+         global_tid, data));
+    struct private_common *node = __kmp_threadprivate_find_task_common(
+        __kmp_threads[global_tid]->th.th_pri_common, global_tid, data);
+
+    if (!node) {
+      /* First touch by this thread. The variable's own address doubles as
+       * the data_address (3rd argument) of kmp_threadprivate_insert(). */
+      KC_TRACE(20, ("__kmpc_threadprivate: T#%d inserting data\n", global_tid));
+      node = kmp_threadprivate_insert(global_tid, data, data, size);
+    } else {
+      KC_TRACE(20, ("__kmpc_threadprivate: T#%d found data\n", global_tid));
+#ifdef USE_CHECKS_COMMON
+      if ((size_t)size > node->cmn_size) {
+        KC_TRACE(10, ("THREADPRIVATE: %p (%" KMP_UINTPTR_SPEC
+                      " ,%" KMP_UINTPTR_SPEC ")\n",
+                      data, size, node->cmn_size));
+        KMP_FATAL(TPCommonBlocksInconsist);
+      }
+#endif /* USE_CHECKS_COMMON */
+    }
+
+    result = node->par_addr;
+  } else {
+    /* Root not active: the original object is used directly. The variable's
+     * own address doubles as the data_address (3rd argument) of
+     * kmp_threadprivate_insert_private_data(). */
+    KC_TRACE(20, ("__kmpc_threadprivate: T#%d inserting private data\n",
+                  global_tid));
+    kmp_threadprivate_insert_private_data(global_tid, data, data, size);
+    result = data;
+  }
+
+  KC_TRACE(10, ("__kmpc_threadprivate: T#%d exiting; return value = %p\n",
+                global_tid, result));
+  return result;
+}
+
+static kmp_cached_addr_t *__kmp_find_cache(void *data) {
+  // Scan the global cache list for the node tagged with data (NULL if absent).
+  kmp_cached_addr_t *node = __kmp_threadpriv_cache_list;
+  for (; node != NULL && node->data != data; node = node->next) {}
+  return node;
+}
+
+/*!
+ @ingroup THREADPRIVATE
+ @param loc source location information
+ @param global_tid  global thread number
+ @param data  pointer to data to privatize
+ @param size  size of data to privatize
+ @param cache  address of the compiler's cache pointer (filled in when NULL)
+ @return pointer to private storage
+ Allocate private storage for threadprivate data. *cache is lazily pointed at
+ a per-variable array of pointers that is indexed by global thread number.
+*/
+void *
+__kmpc_threadprivate_cached(ident_t *loc,
+                            kmp_int32 global_tid, // gtid.
+                            void *data, // Pointer to original global variable.
+                            size_t size, // Size of original global variable.
+                            void ***cache) {
+  KC_TRACE(10, ("__kmpc_threadprivate_cached: T#%d called with cache: %p, "
+                "address: %p, size: %" KMP_SIZE_T_SPEC "\n",
+                global_tid, *cache, data, size));
+
+  if (TCR_PTR(*cache) == 0) { // unlocked probe; repeated below under the lock
+    __kmp_acquire_lock(&__kmp_global_lock, global_tid);
+
+    if (TCR_PTR(*cache) == 0) {
+      __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
+      // Compiler often passes in NULL cache, even if it's already been created
+      void **my_cache;
+      kmp_cached_addr_t *tp_cache_addr;
+      // Look for an existing cache
+      tp_cache_addr = __kmp_find_cache(data);
+      if (!tp_cache_addr) { // Cache was never created; do it now
+        __kmp_tp_cached = 1;
+        KMP_ITT_IGNORE(my_cache = (void **)__kmp_allocate(
+                           sizeof(void *) * __kmp_tp_capacity +
+                           sizeof(kmp_cached_addr_t)););
+        // No need to zero the allocated memory; __kmp_allocate does that.
+        KC_TRACE(50, ("__kmpc_threadprivate_cached: T#%d allocated cache at "
+                      "address %p\n",
+                      global_tid, my_cache));
+        /* TODO: free all this memory in __kmp_common_destroy using
+         * __kmp_threadpriv_cache_list */
+        /* Node is carved from the block's tail; link it in for cleanup later */
+        tp_cache_addr = (kmp_cached_addr_t *)&my_cache[__kmp_tp_capacity];
+        tp_cache_addr->addr = my_cache;
+        tp_cache_addr->data = data;
+        tp_cache_addr->compiler_cache = cache;
+        tp_cache_addr->next = __kmp_threadpriv_cache_list;
+        __kmp_threadpriv_cache_list = tp_cache_addr;
+      } else { // A cache was already created; use it
+        my_cache = tp_cache_addr->addr;
+        tp_cache_addr->compiler_cache = cache;
+      }
+      KMP_MB();
+
+      TCW_PTR(*cache, my_cache);
+      __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
+
+      KMP_MB();
+    }
+    __kmp_release_lock(&__kmp_global_lock, global_tid);
+  }
+  // Use this thread's cached pointer if present, else resolve and store it.
+  void *ret;
+  if ((ret = TCR_PTR((*cache)[global_tid])) == 0) {
+    ret = __kmpc_threadprivate(loc, global_tid, data, (size_t)size);
+
+    TCW_PTR((*cache)[global_tid], ret);
+  }
+  KC_TRACE(10,
+           ("__kmpc_threadprivate_cached: T#%d exiting; return value = %p\n",
+            global_tid, ret));
+  return ret;
+}
+
+// Reallocate every active threadprivate cache with room for newCapacity
+// slots. Call only with both __kmp_tp_cached_lock and kmp_forkjoin_lock held.
+void __kmp_threadprivate_resize_cache(int newCapacity) {
+  KC_TRACE(10, ("__kmp_threadprivate_resize_cache: called with size: %d\n",
+                newCapacity));
+
+  kmp_cached_addr_t *ptr = __kmp_threadpriv_cache_list;
+  // Replacement nodes go on the list head, so this walk never reaches them.
+  while (ptr) {
+    if (ptr->data) { // this location has an active cache; resize it
+      void **my_cache;
+      KMP_ITT_IGNORE(my_cache =
+                         (void **)__kmp_allocate(sizeof(void *) * newCapacity +
+                                                 sizeof(kmp_cached_addr_t)););
+      // No need to zero the allocated memory; __kmp_allocate does that.
+      KC_TRACE(50, ("__kmp_threadprivate_resize_cache: allocated cache at %p\n",
+                    my_cache));
+      // Copy the old cache's __kmp_tp_capacity slots into the new cache
+      void **old_cache = ptr->addr;
+      for (int i = 0; i < __kmp_tp_capacity; ++i) {
+        my_cache[i] = old_cache[i];
+      }
+
+      // Add address of new my_cache to linked list for cleanup later
+      kmp_cached_addr_t *tp_cache_addr;
+      tp_cache_addr = (kmp_cached_addr_t *)&my_cache[newCapacity];
+      tp_cache_addr->addr = my_cache;
+      tp_cache_addr->data = ptr->data;
+      tp_cache_addr->compiler_cache = ptr->compiler_cache;
+      tp_cache_addr->next = __kmp_threadpriv_cache_list;
+      __kmp_threadpriv_cache_list = tp_cache_addr;
+
+      // Copy new cache to compiler's location: We can copy directly
+      // to (*compiler_cache) if compiler guarantees it will keep
+      // using the same location for the cache. This is not yet true
+      // for some compilers, in which case we have to check if
+      // compiler_cache is still pointing at old cache, and if so, we
+      // can point it at the new cache with an atomic compare&swap
+      // operation. (Old method will always work, but we should shift
+      // to new method (commented line below) when Intel and Clang
+      // compilers use new method.)
+      (void)KMP_COMPARE_AND_STORE_PTR(tp_cache_addr->compiler_cache, old_cache,
+                                      my_cache);
+      // TCW_PTR(*(tp_cache_addr->compiler_cache), my_cache);
+
+      // If the store doesn't happen here, the compiler's old behavior will
+      // inevitably call __kmpc_threadprivate_cached with a new location for
+      // the cache, and that function will store the resized cache there at
+      // that point.
+
+      // Nullify old cache's data pointer so we skip it next time
+      ptr->data = NULL;
+    }
+    ptr = ptr->next;
+  }
+  // After all caches are resized, update __kmp_tp_capacity to the new size
+  *(volatile int *)&__kmp_tp_capacity = newCapacity;
+}
+
+/*!
+ @ingroup THREADPRIVATE
+ @param loc source location information (not used by this routine)
+ @param data  pointer to data being privatized
+ @param ctor  pointer to constructor function for data
+ @param cctor  pointer to copy constructor function for data
+ @param dtor  pointer to destructor function for data
+ @param vector_length length of the vector (bytes or elements?)
+
+ Register vector constructors and destructors for thread private data.
+ Only the first registration for a given data address has any effect.
+*/
+void __kmpc_threadprivate_register_vec(ident_t *loc, void *data,
+                                       kmpc_ctor_vec ctor, kmpc_cctor_vec cctor,
+                                       kmpc_dtor_vec dtor,
+                                       size_t vector_length) {
+  KC_TRACE(10, ("__kmpc_threadprivate_register_vec: called\n"));
+
+#ifdef USE_CHECKS_COMMON
+  /* copy constructor must be zero for current code gen (Nov 2002 - jph) */
+  KMP_ASSERT(cctor == 0);
+#endif /* USE_CHECKS_COMMON */
+
+  /* Only the global data table exists. */
+  struct shared_common *desc =
+      __kmp_find_shared_task_common(&__kmp_threadprivate_d_table, -1, data);
+  if (desc != 0)
+    return; /* this address already has a descriptor */
+
+  /* obj_init and pod_init stay 0: __kmp_allocate zeroes the memory. */
+  desc = (struct shared_common *)__kmp_allocate(sizeof(struct shared_common));
+  desc->gbl_addr = data;
+  desc->ct.ctorv = ctor;
+  desc->cct.cctorv = cctor;
+  desc->dt.dtorv = dtor;
+  desc->is_vec = TRUE;
+  desc->vec_len = (size_t)vector_length;
+
+  /* Insert at the head of the hash chain selected by the data address. */
+  struct shared_common **bucket =
+      &(__kmp_threadprivate_d_table.data[KMP_HASH(data)]);
+  desc->next = *bucket;
+  *bucket = desc;
+}
+
+// Free every threadprivate cache array and clear the compiler-side pointers.
+void __kmp_cleanup_threadprivate_caches() {
+  for (kmp_cached_addr_t *node = __kmp_threadpriv_cache_list; node != NULL;
+       node = __kmp_threadpriv_cache_list) {
+    void **slots = node->addr;
+    __kmp_threadpriv_cache_list = node->next; // unlink the head node
+    if (*node->compiler_cache)
+      *node->compiler_cache = NULL;
+    node->compiler_cache = NULL;
+    node->data = NULL;
+    node->addr = NULL;
+    node->next = NULL;
+    // The threadprivate objects the slots point at are not released here;
+    // __kmp_common_destroy_gtid handles them at the end of
+    // __kmp_launch_thread.
+    __kmp_free(slots); // node lives inside this allocation, so it goes too
+  }
+}
diff --git a/final/runtime/src/kmp_utility.cpp b/final/runtime/src/kmp_utility.cpp
new file mode 100644
index 0000000..44a99d0
--- /dev/null
+++ b/final/runtime/src/kmp_utility.cpp
@@ -0,0 +1,409 @@
+/*
+ * kmp_utility.cpp -- Utility routines for the OpenMP support library.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+#include "kmp_i18n.h"
+#include "kmp_str.h"
+#include "kmp_wrapper_getpid.h"
+#include <float.h>
+
+static const char unknown[] = "unknown"; // array: sizeof gives byte count
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+/* NOTE: If called before serial_initialize (i.e. from runtime_initialize), then
+   the debugging package has not been initialized yet, and only "0" will print
+   debugging output since the environment variables have not been read. */
+
+#ifdef KMP_DEBUG
+static int trace_level = 5;
+#endif
+
+/* LOG_ID_BITS  = ( 1 + floor( log_2( max( log_per_phy - 1, 1 ))))
+ * APIC_ID      = (PHY_ID << LOG_ID_BITS) | LOG_ID
+ * PHY_ID       = APIC_ID >> LOG_ID_BITS
+ */
+int __kmp_get_physical_id(int log_per_phy, int apic_id) {
+  int index_lsb, index_msb;
+  unsigned temp; // unsigned so the MSB scan may shift a set bit into bit 31
+  if (log_per_phy > 1) {
+    index_lsb = 0; // ends up as the index of the lowest set bit
+    index_msb = 31; // ends up as the index of the highest set bit
+
+    temp = (unsigned)log_per_phy;
+    while ((temp & 1) == 0) {
+      temp >>= 1;
+      index_lsb++;
+    }
+
+    temp = (unsigned)log_per_phy;
+    while ((temp & 0x80000000) == 0) {
+      temp <<= 1;
+      index_msb--;
+    }
+
+    /* If >1 bits were set in log_per_phy, choose next higher power of 2 */
+    if (index_lsb != index_msb)
+      index_msb++;
+
+    return ((int)(apic_id >> index_msb));
+  }
+
+  return apic_id;
+}
+
+/*
+ * LOG_ID_BITS  = ( 1 + floor( log_2( max( log_per_phy - 1, 1 ))))
+ * APIC_ID      = (PHY_ID << LOG_ID_BITS) | LOG_ID
+ * LOG_ID       = APIC_ID & (( 1 << LOG_ID_BITS ) - 1 )
+ */
+int __kmp_get_logical_id(int log_per_phy, int apic_id) {
+  unsigned current_bit;
+  int bits_seen;
+  // With at most one logical CPU per package the logical id is always 0.
+  if (log_per_phy <= 1)
+    return (0);
+
+  bits_seen = 0;
+  // Strip set bits low to high; ends with current_bit just above the top bit.
+  for (current_bit = 1; log_per_phy != 0; current_bit <<= 1) {
+    if (log_per_phy & current_bit) {
+      log_per_phy &= ~current_bit;
+      bits_seen++;
+    }
+  }
+
+  /* If exactly 1 bit was set in log_per_phy, choose next lower power of 2 */
+  if (bits_seen == 1) {
+    current_bit >>= 1;
+  }
+  // current_bit is a power of two here, so current_bit - 1 is the LOG_ID mask.
+  return ((int)((current_bit - 1) & apic_id));
+}
+
+static kmp_uint64 __kmp_parse_frequency( // R: Frequency in Hz.
+    char const *frequency // I: Float number and unit: MHz, GHz, or THz.
+    ) {
+  // Result stays 0 for NULL, a non-positive/non-finite number, or a bad unit.
+  double value = 0.0;
+  char *unit = NULL;
+  kmp_uint64 result = 0; /* Zero is a better unknown value than all ones. */
+
+  if (frequency == NULL) {
+    return result;
+  }
+  value = strtod(frequency, &unit); // unit is set to the text after the number
+  if (0 < value &&
+      value <= DBL_MAX) { // Good value (not overflow, underflow, etc).
+    if (strcmp(unit, "MHz") == 0) {
+      value = value * 1.0E+6;
+    } else if (strcmp(unit, "GHz") == 0) {
+      value = value * 1.0E+9;
+    } else if (strcmp(unit, "THz") == 0) {
+      value = value * 1.0E+12;
+    } else { // Wrong unit.
+      return result;
+    }
+    result = value; // double -> kmp_uint64 conversion drops any fraction
+  }
+  return result;
+
+} // func __kmp_parse_frequency
+
+void __kmp_query_cpuid(kmp_cpuinfo_t *p) {
+  struct kmp_cpuid buf;
+  int max_arg;
+  int log_per_phy;
+#ifdef KMP_DEBUG
+  int cflush_size;
+#endif
+  // Fills *p from CPUID leaf 0, leaf 1, optionally leaf 7, and the brand string.
+  p->initialized = 1;
+
+  p->sse2 = 1; // Assume SSE2 by default.
+
+  __kmp_x86_cpuid(0, 0, &buf);
+
+  KA_TRACE(trace_level,
+           ("INFO: CPUID %d: EAX=0x%08X EBX=0x%08X ECX=0x%08X EDX=0x%08X\n", 0,
+            buf.eax, buf.ebx, buf.ecx, buf.edx));
+
+  max_arg = buf.eax;
+
+  p->apic_id = -1;
+
+  if (max_arg >= 1) {
+    int i;
+    kmp_uint32 t, data[4];
+
+    __kmp_x86_cpuid(1, 0, &buf);
+    KA_TRACE(trace_level,
+             ("INFO: CPUID %d: EAX=0x%08X EBX=0x%08X ECX=0x%08X EDX=0x%08X\n",
+              1, buf.eax, buf.ebx, buf.ecx, buf.edx));
+
+    {
+#define get_value(reg, lo, mask) (((reg) >> (lo)) & (mask))
+
+      p->signature = buf.eax;
+      p->family = get_value(buf.eax, 20, 0xff) + get_value(buf.eax, 8, 0x0f);
+      p->model =
+          (get_value(buf.eax, 16, 0x0f) << 4) + get_value(buf.eax, 4, 0x0f);
+      p->stepping = get_value(buf.eax, 0, 0x0f);
+
+#undef get_value
+
+      KA_TRACE(trace_level, (" family = %d, model = %d, stepping = %d\n",
+                             p->family, p->model, p->stepping));
+    }
+
+    for (t = buf.ebx, i = 0; i < 4; t >>= 8, ++i) {
+      data[i] = (t & 0xff);
+    }
+
+    p->sse2 = (buf.edx >> 26) & 1;
+
+#ifdef KMP_DEBUG
+
+    if ((buf.edx >> 4) & 1) {
+      /* TSC - Timestamp Counter Available */
+      KA_TRACE(trace_level, (" TSC"));
+    }
+    if ((buf.edx >> 8) & 1) {
+      /* CX8 - CMPXCHG8B Instruction Available */
+      KA_TRACE(trace_level, (" CX8"));
+    }
+    if ((buf.edx >> 9) & 1) {
+      /* APIC - Local APIC Present (multi-processor operation support) */
+      KA_TRACE(trace_level, (" APIC"));
+    }
+    if ((buf.edx >> 15) & 1) {
+      /* CMOV - Conditional MOVe Instruction Available */
+      KA_TRACE(trace_level, (" CMOV"));
+    }
+    if ((buf.edx >> 18) & 1) {
+      /* PSN - Processor Serial Number Available */
+      KA_TRACE(trace_level, (" PSN"));
+    }
+    if ((buf.edx >> 19) & 1) {
+      /* CLFLUSH - Cache Flush Instruction Available */
+      cflush_size =
+          data[1] * 8; /* Bits 15-08: CLFLUSH line size = 8 (64 bytes) */
+      KA_TRACE(trace_level, (" CLFLUSH(%db)", cflush_size));
+    }
+    if ((buf.edx >> 21) & 1) {
+      /* DTES - Debug Trace & EMON Store */
+      KA_TRACE(trace_level, (" DTES"));
+    }
+    if ((buf.edx >> 22) & 1) {
+      /* ACPI - ACPI Support Available */
+      KA_TRACE(trace_level, (" ACPI"));
+    }
+    if ((buf.edx >> 23) & 1) {
+      /* MMX - Multimedia Extensions */
+      KA_TRACE(trace_level, (" MMX"));
+    }
+    if ((buf.edx >> 25) & 1) {
+      /* SSE - SSE Instructions */
+      KA_TRACE(trace_level, (" SSE"));
+    }
+    if ((buf.edx >> 26) & 1) {
+      /* SSE2 - SSE2 Instructions */
+      KA_TRACE(trace_level, (" SSE2"));
+    }
+    if ((buf.edx >> 27) & 1) {
+      /* SLFSNP - Self-Snooping Cache */
+      KA_TRACE(trace_level, (" SLFSNP"));
+    }
+#endif /* KMP_DEBUG */
+
+    if ((buf.edx >> 28) & 1) {
+      /* Bits 23-16: Logical Processors per Physical Processor (1 for P4) */
+      log_per_phy = data[2];
+      p->apic_id = data[3]; /* Bits 31-24: Processor Initial APIC ID (X) */
+      KA_TRACE(trace_level, (" HT(%d TPUs)", log_per_phy));
+
+      if (log_per_phy > 1) {
+/* default to 1k FOR JT-enabled processors (4k on OS X*) */
+#if KMP_OS_DARWIN
+        p->cpu_stackoffset = 4 * 1024;
+#else
+        p->cpu_stackoffset = 1 * 1024;
+#endif
+      }
+
+      p->physical_id = __kmp_get_physical_id(log_per_phy, p->apic_id);
+      p->logical_id = __kmp_get_logical_id(log_per_phy, p->apic_id);
+    }
+#ifdef KMP_DEBUG
+    if ((buf.edx >> 29) & 1) {
+      /* ATHROTL - Automatic Throttle Control */
+      KA_TRACE(trace_level, (" ATHROTL"));
+    }
+    KA_TRACE(trace_level, (" ]\n"));
+
+    for (i = 2; i <= max_arg; ++i) {
+      __kmp_x86_cpuid(i, 0, &buf);
+      KA_TRACE(trace_level,
+               ("INFO: CPUID %d: EAX=0x%08X EBX=0x%08X ECX=0x%08X EDX=0x%08X\n",
+                i, buf.eax, buf.ebx, buf.ecx, buf.edx));
+    }
+#endif
+#if KMP_USE_ADAPTIVE_LOCKS
+    p->rtm = 0;
+    if (max_arg >= 7) { // leaf 7 is valid whenever the max basic leaf is >= 7
+      /* RTM bit CPUID.07:EBX, bit 11 */
+      __kmp_x86_cpuid(7, 0, &buf);
+      p->rtm = (buf.ebx >> 11) & 1;
+      KA_TRACE(trace_level, (" RTM"));
+    }
+#endif
+  }
+
+  { // Parse CPU brand string for frequency, saving the string for later.
+    int i;
+    kmp_cpuid_t *base = (kmp_cpuid_t *)&p->name[0];
+
+    // Get CPU brand string.
+    for (i = 0; i < 3; ++i) {
+      __kmp_x86_cpuid(0x80000002 + i, 0, base + i);
+    }
+    p->name[sizeof(p->name) - 1] = 0; // Just in case. ;-)
+    KA_TRACE(trace_level, ("cpu brand string: \"%s\"\n", &p->name[0]));
+
+    // Parse frequency.
+    p->frequency = __kmp_parse_frequency(strrchr(&p->name[0], ' '));
+    KA_TRACE(trace_level,
+             ("cpu frequency from brand string: %" KMP_UINT64_SPEC "\n",
+              p->frequency));
+  }
+}
+
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+void __kmp_expand_host_name(char *buffer, size_t size) {
+  KMP_DEBUG_ASSERT(size >= sizeof(unknown));
+#if KMP_OS_WINDOWS
+  {
+    DWORD s = size;
+    // Fall back to "unknown" when the computer name cannot be retrieved.
+    if (!GetComputerNameA(buffer, &s))
+      KMP_STRCPY_S(buffer, size, unknown);
+  }
+#else
+  buffer[size - 2] = 0; // sentinel, checked below to detect a too-long name
+  if (gethostname(buffer, size) || buffer[size - 2] != 0)
+    KMP_STRCPY_S(buffer, size, unknown);
+#endif
+}
+
+/* Expand the meta characters in the filename:
+ * Currently defined characters are:
+ * %H the hostname
+ * %P the number of threads used.
+ * %I the unique identifier for this run.
+ */
+
+void __kmp_expand_file_name(char *result, size_t rlen, char *pattern) {
+  char *pos = result, *end = result + rlen - 1;
+  char buffer[256];
+  int default_cpu_width = 1;
+  int snp_result;
+
+  KMP_DEBUG_ASSERT(rlen > 0);
+  *end = 0;
+  { // default %P width = number of decimal digits in __kmp_xproc
+    int i;
+    for (i = __kmp_xproc; i >= 10; i /= 10, ++default_cpu_width)
+      ;
+  }
+
+  if (pattern != NULL) {
+    while (*pattern != '\0' && pos < end) {
+      if (*pattern != '%') {
+        *pos++ = *pattern++;
+      } else {
+        char *old_pattern = pattern;
+        int width = 1;
+        int cpu_width = default_cpu_width;
+
+        ++pattern;
+
+        if (*pattern >= '0' && *pattern <= '9') {
+          width = 0;
+          do { // stop accumulating past the limit so width cannot overflow
+            width = (width > 1024) ? width : (width * 10) + *pattern - '0';
+          } while (*++pattern >= '0' && *pattern <= '9');
+          if (width < 0 || width > 1024)
+            width = 1;
+
+          cpu_width = width;
+        }
+
+        switch (*pattern) {
+        case 'H':
+        case 'h': {
+          __kmp_expand_host_name(buffer, sizeof(buffer));
+          KMP_STRNCPY(pos, buffer, end - pos + 1);
+          if (*end == 0) {
+            while (*pos)
+              ++pos;
+            ++pattern;
+          } else
+            pos = end;
+        } break;
+        case 'P':
+        case 'p': {
+          snp_result = KMP_SNPRINTF(pos, end - pos + 1, "%0*d", cpu_width,
+                                    __kmp_dflt_team_nth);
+          if (snp_result >= 0 && snp_result <= end - pos) {
+            while (*pos)
+              ++pos;
+            ++pattern;
+          } else
+            pos = end;
+        } break;
+        case 'I':
+        case 'i': {
+          pid_t id = getpid();
+#if KMP_ARCH_X86_64 && defined(__MINGW32__)
+          snp_result = KMP_SNPRINTF(pos, end - pos + 1, "%0*lld", width, id);
+#else
+          snp_result = KMP_SNPRINTF(pos, end - pos + 1, "%0*d", width, id);
+#endif
+          if (snp_result >= 0 && snp_result <= end - pos) {
+            while (*pos)
+              ++pos;
+            ++pattern;
+          } else
+            pos = end;
+          break;
+        }
+        case '%': {
+          *pos++ = '%';
+          ++pattern;
+          break;
+        }
+        default: { // unknown specifier: emit '%' and rescan what followed it
+          *pos++ = '%';
+          pattern = old_pattern + 1;
+          break;
+        }
+        }
+      }
+    }
+    /* TODO: How do we get rid of this? */
+    if (*pattern != '\0')
+      KMP_FATAL(FileNameTooLong);
+  }
+
+  *pos = '\0';
+}
diff --git a/final/runtime/src/kmp_version.cpp b/final/runtime/src/kmp_version.cpp
new file mode 100644
index 0000000..7464d19
--- /dev/null
+++ b/final/runtime/src/kmp_version.cpp
@@ -0,0 +1,198 @@
+/*
+ * kmp_version.cpp
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+#include "kmp_io.h"
+#include "kmp_version.h"
+
+// Replace with snapshot date YYYYMMDD for promotion build.
+#define KMP_VERSION_BUILD 20140926
+
+// Helper macros to convert value of macro to string literal.
+#define _stringer(x) #x
+#define stringer(x) _stringer(x)
+
+// Detect compiler.
+#if KMP_COMPILER_ICC
+#if __INTEL_COMPILER == 1010
+#define KMP_COMPILER "Intel(R) C++ Compiler 10.1"
+#elif __INTEL_COMPILER == 1100
+#define KMP_COMPILER "Intel(R) C++ Compiler 11.0"
+#elif __INTEL_COMPILER == 1110
+#define KMP_COMPILER "Intel(R) C++ Compiler 11.1"
+#elif __INTEL_COMPILER == 1200
+#define KMP_COMPILER "Intel(R) C++ Compiler 12.0"
+#elif __INTEL_COMPILER == 1210
+#define KMP_COMPILER "Intel(R) C++ Compiler 12.1"
+#elif __INTEL_COMPILER == 1300
+#define KMP_COMPILER "Intel(R) C++ Compiler 13.0"
+#elif __INTEL_COMPILER == 1310
+#define KMP_COMPILER "Intel(R) C++ Compiler 13.1"
+#elif __INTEL_COMPILER == 1400
+#define KMP_COMPILER "Intel(R) C++ Compiler 14.0"
+#elif __INTEL_COMPILER == 1410
+#define KMP_COMPILER "Intel(R) C++ Compiler 14.1"
+#elif __INTEL_COMPILER == 1500
+#define KMP_COMPILER "Intel(R) C++ Compiler 15.0"
+#elif __INTEL_COMPILER == 1600
+#define KMP_COMPILER "Intel(R) C++ Compiler 16.0"
+#elif __INTEL_COMPILER == 1700
+#define KMP_COMPILER "Intel(R) C++ Compiler 17.0"
+#elif __INTEL_COMPILER == 1800
+#define KMP_COMPILER "Intel(R) C++ Compiler 18.0"
+#elif __INTEL_COMPILER == 1900
+#define KMP_COMPILER "Intel(R) C++ Compiler 19.0"
+#elif __INTEL_COMPILER >= 9900
+#define KMP_COMPILER "Intel(R) C++ Compiler mainline"
+#endif
+#elif KMP_COMPILER_CLANG
+#define KMP_COMPILER                                                           \
+  "Clang " stringer(__clang_major__) "." stringer(__clang_minor__)
+#elif KMP_COMPILER_GCC
+#define KMP_COMPILER "GCC " stringer(__GNUC__) "." stringer(__GNUC_MINOR__)
+#elif KMP_COMPILER_MSVC
+#define KMP_COMPILER "MSVC " stringer(_MSC_FULL_VER)
+#endif
+#ifndef KMP_COMPILER
+#warning "Unknown compiler"
+#define KMP_COMPILER "unknown compiler"
+#endif
+
+// Detect library type (perf, stub).
+#ifdef KMP_STUB
+#define KMP_LIB_TYPE "stub"
+#else
+#define KMP_LIB_TYPE "performance"
+#endif // KMP_STUB
+
+// Detect link type (static, dynamic).
+#if KMP_DYNAMIC_LIB
+#define KMP_LINK_TYPE "dynamic"
+#else
+#define KMP_LINK_TYPE "static"
+#endif // KMP_DYNAMIC_LIB
+
+// Finally, define strings.
+#define KMP_LIBRARY KMP_LIB_TYPE " library (" KMP_LINK_TYPE ")"
+#define KMP_COPYRIGHT ""
+
+int const __kmp_version_major = KMP_VERSION_MAJOR;
+int const __kmp_version_minor = KMP_VERSION_MINOR;
+int const __kmp_version_build = KMP_VERSION_BUILD;
+int const __kmp_openmp_version = 201611;
+
+/* Do NOT change the format of this string!  Intel(R) Thread Profiler checks for
+   a specific format; the recognition routine there needs to be changed first
+   if this format is ever changed. */
+char const __kmp_copyright[] = KMP_VERSION_PREFIX KMP_LIBRARY
+    " ver. " stringer(KMP_VERSION_MAJOR) "." stringer(
+        KMP_VERSION_MINOR) "." stringer(KMP_VERSION_BUILD) " " KMP_COPYRIGHT;
+
+char const __kmp_version_copyright[] = KMP_VERSION_PREFIX KMP_COPYRIGHT;
+char const __kmp_version_lib_ver[] =
+    KMP_VERSION_PREFIX "version: " stringer(KMP_VERSION_MAJOR) "." stringer(
+        KMP_VERSION_MINOR) "." stringer(KMP_VERSION_BUILD);
+char const __kmp_version_lib_type[] =
+    KMP_VERSION_PREFIX "library type: " KMP_LIB_TYPE;
+char const __kmp_version_link_type[] =
+    KMP_VERSION_PREFIX "link type: " KMP_LINK_TYPE;
+char const __kmp_version_build_time[] = KMP_VERSION_PREFIX "build time: "
+                                                           "no_timestamp";
+#if KMP_MIC2
+char const __kmp_version_target_env[] =
+    KMP_VERSION_PREFIX "target environment: MIC2";
+#endif
+char const __kmp_version_build_compiler[] =
+    KMP_VERSION_PREFIX "build compiler: " KMP_COMPILER;
+
+// Called at serial initialization time.
+static int __kmp_version_1_printed = FALSE;
+
+void __kmp_print_version_1(void) {
+  if (__kmp_version_1_printed) {
+    return;
+  }
+  __kmp_version_1_printed = TRUE;
+  // Prints the version banner once; the stub library prints nothing.
+#ifndef KMP_STUB
+  kmp_str_buf_t buffer;
+  __kmp_str_buf_init(&buffer);
+  // Print version strings skipping initial magic.
+  __kmp_str_buf_print(&buffer, "%s\n",
+                      &__kmp_version_lib_ver[KMP_VERSION_MAGIC_LEN]);
+  __kmp_str_buf_print(&buffer, "%s\n",
+                      &__kmp_version_lib_type[KMP_VERSION_MAGIC_LEN]);
+  __kmp_str_buf_print(&buffer, "%s\n",
+                      &__kmp_version_link_type[KMP_VERSION_MAGIC_LEN]);
+  __kmp_str_buf_print(&buffer, "%s\n",
+                      &__kmp_version_build_time[KMP_VERSION_MAGIC_LEN]);
+#if KMP_MIC2 // same guard as the definition of __kmp_version_target_env
+  __kmp_str_buf_print(&buffer, "%s\n",
+                      &__kmp_version_target_env[KMP_VERSION_MAGIC_LEN]);
+#endif
+  __kmp_str_buf_print(&buffer, "%s\n",
+                      &__kmp_version_build_compiler[KMP_VERSION_MAGIC_LEN]);
+#if defined(KMP_GOMP_COMPAT)
+  __kmp_str_buf_print(&buffer, "%s\n",
+                      &__kmp_version_alt_comp[KMP_VERSION_MAGIC_LEN]);
+#endif /* defined(KMP_GOMP_COMPAT) */
+  __kmp_str_buf_print(&buffer, "%s\n",
+                      &__kmp_version_omp_api[KMP_VERSION_MAGIC_LEN]);
+  __kmp_str_buf_print(&buffer, "%sdynamic error checking: %s\n",
+                      KMP_VERSION_PREF_STR,
+                      (__kmp_env_consistency_check ? "yes" : "no"));
+#ifdef KMP_DEBUG
+  for (int i = bs_plain_barrier; i < bs_last_barrier; ++i) {
+    __kmp_str_buf_print(
+        &buffer, "%s%s barrier branch bits: gather=%u, release=%u\n",
+        KMP_VERSION_PREF_STR, __kmp_barrier_type_name[i],
+        __kmp_barrier_gather_branch_bits[i],
+        __kmp_barrier_release_branch_bits[i]); // __kmp_str_buf_print
+  }
+  for (int i = bs_plain_barrier; i < bs_last_barrier; ++i) {
+    __kmp_str_buf_print(
+        &buffer, "%s%s barrier pattern: gather=%s, release=%s\n",
+        KMP_VERSION_PREF_STR, __kmp_barrier_type_name[i],
+        __kmp_barrier_pattern_name[__kmp_barrier_gather_pattern[i]],
+        __kmp_barrier_pattern_name
+            [__kmp_barrier_release_pattern[i]]); // __kmp_str_buf_print
+  }
+  __kmp_str_buf_print(&buffer, "%s\n",
+                      &__kmp_version_lock[KMP_VERSION_MAGIC_LEN]);
+#endif
+  __kmp_str_buf_print(
+      &buffer, "%sthread affinity support: %s\n", KMP_VERSION_PREF_STR,
+#if KMP_AFFINITY_SUPPORTED
+      (KMP_AFFINITY_CAPABLE()
+           ? (__kmp_affinity_type == affinity_none ? "not used" : "yes")
+           : "no")
+#else
+      "no"
+#endif
+          );
+  __kmp_printf("%s", buffer.str);
+  __kmp_str_buf_free(&buffer);
+  K_DIAG(1, ("KMP_VERSION is true\n"));
+#endif // KMP_STUB
+} // __kmp_print_version_1
+
+// Called at parallel initialization time.
+static int __kmp_version_2_printed = FALSE;
+
+void __kmp_print_version_2(void) {
+  if (__kmp_version_2_printed) {
+    return;
+  }
+  __kmp_version_2_printed = TRUE;
+} // __kmp_print_version_2
+
+// end of file //
diff --git a/final/runtime/src/kmp_version.h b/final/runtime/src/kmp_version.h
new file mode 100644
index 0000000..9e726b3
--- /dev/null
+++ b/final/runtime/src/kmp_version.h
@@ -0,0 +1,66 @@
+/*
+ * kmp_version.h -- version number for this release
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_VERSION_H
+#define KMP_VERSION_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+#ifndef KMP_VERSION_MAJOR
+#error KMP_VERSION_MAJOR macro is not defined.
+#endif
+#define KMP_VERSION_MINOR 0
+/* Using "magic" prefix in all the version strings is rather convenient to get
+   static version info from binaries by using standard utilities "strings" and
+   "grep", e. g.:
+        $ strings libomp.so | grep "@(#)"
+   gives clean list of all version strings in the library. Leading zero helps
+   to keep version string separate from printable characters which may occur
+   just before version string. */
+#define KMP_VERSION_MAGIC_STR "\x00@(#) "
+#define KMP_VERSION_MAGIC_LEN 6 // Length of KMP_VERSION_MAGIC_STR.
+#define KMP_VERSION_PREF_STR "Intel(R) OMP "
+#define KMP_VERSION_PREFIX KMP_VERSION_MAGIC_STR KMP_VERSION_PREF_STR
+
+/* declare all the version string constants for KMP_VERSION env. variable */
+extern int const __kmp_version_major;
+extern int const __kmp_version_minor;
+extern int const __kmp_version_build;
+extern int const __kmp_openmp_version;
+extern char const
+    __kmp_copyright[]; // Old variable, kept for compatibility with ITC and ITP.
+extern char const __kmp_version_copyright[];
+extern char const __kmp_version_lib_ver[];
+extern char const __kmp_version_lib_type[];
+extern char const __kmp_version_link_type[];
+extern char const __kmp_version_build_time[];
+extern char const __kmp_version_target_env[];
+extern char const __kmp_version_build_compiler[];
+extern char const __kmp_version_alt_comp[];
+extern char const __kmp_version_omp_api[];
+// ??? extern char const __kmp_version_debug[];
+extern char const __kmp_version_lock[];
+extern char const __kmp_version_nested_stats_reporting[];
+extern char const __kmp_version_ftnstdcall[];
+extern char const __kmp_version_ftncdecl[];
+extern char const __kmp_version_ftnextra[];
+
+void __kmp_print_version_1(void);
+void __kmp_print_version_2(void);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif /* KMP_VERSION_H */
diff --git a/final/runtime/src/kmp_wait_release.cpp b/final/runtime/src/kmp_wait_release.cpp
new file mode 100644
index 0000000..7d12c74
--- /dev/null
+++ b/final/runtime/src/kmp_wait_release.cpp
@@ -0,0 +1,25 @@
+/*
+ * kmp_wait_release.cpp -- Wait/Release implementation
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp_wait_release.h"
+
+void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64 *flag,
+                   int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
+  if (final_spin)
+    __kmp_wait_template<kmp_flag_64, TRUE>(
+        this_thr, flag USE_ITT_BUILD_ARG(itt_sync_obj));
+  else
+    __kmp_wait_template<kmp_flag_64, FALSE>(
+        this_thr, flag USE_ITT_BUILD_ARG(itt_sync_obj));
+}
+
+void __kmp_release_64(kmp_flag_64 *flag) { __kmp_release_template(flag); }
diff --git a/final/runtime/src/kmp_wait_release.h b/final/runtime/src/kmp_wait_release.h
new file mode 100644
index 0000000..bb6bdf5
--- /dev/null
+++ b/final/runtime/src/kmp_wait_release.h
@@ -0,0 +1,932 @@
+/*
+ * kmp_wait_release.h -- Wait/Release implementation
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_WAIT_RELEASE_H
+#define KMP_WAIT_RELEASE_H
+
+#include "kmp.h"
+#include "kmp_itt.h"
+#include "kmp_stats.h"
+#if OMPT_SUPPORT
+#include "ompt-specific.h"
+#endif
+
+/*!
+@defgroup WAIT_RELEASE Wait/Release operations
+
+The definitions and functions here implement the lowest level thread
+synchronizations of suspending a thread and awaking it. They are used to build
+higher level operations such as barriers and fork/join.
+*/
+
+/*!
+@ingroup WAIT_RELEASE
+@{
+*/
+
+/*!
+ * The flag_type describes the storage used for the flag.
+ */
+enum flag_type {
+  flag32, /**< 32 bit flags */
+  flag64, /**< 64 bit flags */
+  flag_oncore /**< special 64-bit flag for on-core barrier (hierarchical) */
+};
+
+/*!
+ * Base class for wait/release volatile flag
+ */
+template <typename P> class kmp_flag_native {
+  volatile P *loc;
+  flag_type t;
+
+public:
+  typedef P flag_t;
+  kmp_flag_native(volatile P *p, flag_type ft) : loc(p), t(ft) {}
+  volatile P *get() { return loc; }
+  void *get_void_p() { return RCAST(void *, CCAST(P *, loc)); }
+  void set(volatile P *new_loc) { loc = new_loc; }
+  flag_type get_type() { return t; }
+  P load() { return *loc; }
+  void store(P val) { *loc = val; }
+};
+
+/*!
+ * Base class for wait/release atomic flag
+ */
+template <typename P> class kmp_flag {
+  std::atomic<P>
+      *loc; /**< Pointer to the flag storage that is modified by another thread
+             */
+  flag_type t; /**< "Type" of the flag in loc */
+public:
+  typedef P flag_t;
+  kmp_flag(std::atomic<P> *p, flag_type ft) : loc(p), t(ft) {}
+  /*!
+   * @result the pointer to the actual flag
+   */
+  std::atomic<P> *get() { return loc; }
+  /*!
+   * @result void* pointer to the actual flag
+   */
+  void *get_void_p() { return RCAST(void *, loc); }
+  /*!
+   * @param new_loc in   set loc to point at new_loc
+   */
+  void set(std::atomic<P> *new_loc) { loc = new_loc; }
+  /*!
+   * @result the flag_type
+   */
+  flag_type get_type() { return t; }
+  /*!
+   * @result flag value
+   */
+  P load() { return loc->load(std::memory_order_acquire); }
+  /*!
+   * @param val the new flag value to be stored
+   */
+  void store(P val) { loc->store(val, std::memory_order_release); }
+  // Derived classes must provide the following:
+  /*
+  kmp_info_t * get_waiter(kmp_uint32 i);
+  kmp_uint32 get_num_waiters();
+  bool done_check();
+  bool done_check_val(P old_loc);
+  bool notdone_check();
+  P internal_release();
+  void suspend(int th_gtid);
+  void resume(int th_gtid);
+  P set_sleeping();
+  P unset_sleeping();
+  bool is_sleeping();
+  bool is_any_sleeping();
+  bool is_sleeping_val(P old_loc);
+  int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin,
+                    int *thread_finished
+                    USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32
+                    is_constrained);
+  */
+};
+
+#if OMPT_SUPPORT
+OMPT_NOINLINE
+static void __ompt_implicit_task_end(kmp_info_t *this_thr,
+                                     ompt_state_t ompt_state,
+                                     ompt_data_t *tId) {
+  int ds_tid = this_thr->th.th_info.ds.ds_tid;
+  if (ompt_state == ompt_state_wait_barrier_implicit) {
+    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
+#if OMPT_OPTIONAL
+    void *codeptr = NULL;
+    if (ompt_enabled.ompt_callback_sync_region_wait) {
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
+          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, tId,
+          codeptr);
+    }
+    if (ompt_enabled.ompt_callback_sync_region) {
+      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
+          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, tId,
+          codeptr);
+    }
+#endif
+    if (!KMP_MASTER_TID(ds_tid)) {
+      if (ompt_enabled.ompt_callback_implicit_task) {
+        ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+            ompt_scope_end, NULL, tId, 0, ds_tid, ompt_task_implicit);
+      }
+      // return to idle state
+      this_thr->th.ompt_thread_info.state = ompt_state_idle;
+    } else {
+      this_thr->th.ompt_thread_info.state = ompt_state_overhead;
+    }
+  }
+}
+#endif
+
+/* Spin wait loop that first does pause/yield, then sleep. A thread that calls
+   __kmp_wait_*  must make certain that another thread calls __kmp_release
+   to wake it back up to prevent deadlocks!
+
+   NOTE: We may not belong to a team at this point.  */
+template <class C, int final_spin, bool cancellable = false,
+          bool sleepable = true>
+static inline bool
+__kmp_wait_template(kmp_info_t *this_thr,
+                    C *flag USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
+#if USE_ITT_BUILD && USE_ITT_NOTIFY
+  volatile void *spin = flag->get();
+#endif
+  kmp_uint32 spins;
+  int th_gtid;
+  int tasks_completed = FALSE;
+  int oversubscribed;
+#if !KMP_USE_MONITOR
+  kmp_uint64 poll_count;
+  kmp_uint64 hibernate_goal;
+#else
+  kmp_uint32 hibernate;
+#endif
+
+  KMP_FSYNC_SPIN_INIT(spin, NULL);
+  if (flag->done_check()) {
+    KMP_FSYNC_SPIN_ACQUIRED(CCAST(void *, spin));
+    return false;
+  }
+  th_gtid = this_thr->th.th_info.ds.ds_gtid;
+  if (cancellable) {
+    kmp_team_t *team = this_thr->th.th_team;
+    if (team && team->t.t_cancel_request == cancel_parallel)
+      return true;
+  }
+#if KMP_OS_UNIX
+  if (final_spin)
+    KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, true);
+#endif
+  KA_TRACE(20,
+           ("__kmp_wait_sleep: T#%d waiting for flag(%p)\n", th_gtid, flag));
+#if KMP_STATS_ENABLED
+  stats_state_e thread_state = KMP_GET_THREAD_STATE();
+#endif
+
+/* OMPT Behavior:
+THIS function is called from
+  __kmp_barrier (2 times)  (implicit or explicit barrier in parallel regions)
+            these have join / fork behavior
+
+       In these cases, we don't change the state or trigger events in THIS
+function.
+       Events are triggered in the calling code (__kmp_barrier):
+
+                state := ompt_state_overhead
+            barrier-begin
+            barrier-wait-begin
+                state := ompt_state_wait_barrier
+          call join-barrier-implementation (finally arrive here)
+          {}
+          call fork-barrier-implementation (finally arrive here)
+          {}
+                state := ompt_state_overhead
+            barrier-wait-end
+            barrier-end
+                state := ompt_state_work_parallel
+
+
+  __kmp_fork_barrier  (after thread creation, before executing implicit task)
+          call fork-barrier-implementation (finally arrive here)
+          {} // worker arrive here with state = ompt_state_idle
+
+
+  __kmp_join_barrier  (implicit barrier at end of parallel region)
+                state := ompt_state_barrier_implicit
+            barrier-begin
+            barrier-wait-begin
+          call join-barrier-implementation (finally arrive here
+final_spin=FALSE)
+          {
+          }
+  __kmp_fork_barrier  (implicit barrier at end of parallel region)
+          call fork-barrier-implementation (finally arrive here final_spin=TRUE)
+
+       Worker after task-team is finished:
+            barrier-wait-end
+            barrier-end
+            implicit-task-end
+            idle-begin
+                state := ompt_state_idle
+
+       Before leaving, if state = ompt_state_idle
+            idle-end
+                state := ompt_state_overhead
+*/
+#if OMPT_SUPPORT
+  ompt_state_t ompt_entry_state;
+  ompt_data_t *tId;
+  if (ompt_enabled.enabled) {
+    ompt_entry_state = this_thr->th.ompt_thread_info.state;
+    if (!final_spin || ompt_entry_state != ompt_state_wait_barrier_implicit ||
+        KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid)) {
+      ompt_lw_taskteam_t *team =
+          this_thr->th.th_team->t.ompt_serialized_team_info;
+      if (team) {
+        tId = &(team->ompt_task_info.task_data);
+      } else {
+        tId = OMPT_CUR_TASK_DATA(this_thr);
+      }
+    } else {
+      tId = &(this_thr->th.ompt_thread_info.task_data);
+    }
+    if (final_spin && (__kmp_tasking_mode == tskm_immediate_exec ||
+                       this_thr->th.th_task_team == NULL)) {
+      // implicit task is done. Either no taskqueue, or task-team finished
+      __ompt_implicit_task_end(this_thr, ompt_entry_state, tId);
+    }
+  }
+#endif
+
+  KMP_INIT_YIELD(spins); // Setup for waiting
+
+  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
+      __kmp_pause_status == kmp_soft_paused) {
+#if KMP_USE_MONITOR
+// The worker threads cannot rely on the team struct existing at this point.
+// Use the bt values cached in the thread struct instead.
+#ifdef KMP_ADJUST_BLOCKTIME
+    if (__kmp_pause_status == kmp_soft_paused ||
+        (__kmp_zero_bt && !this_thr->th.th_team_bt_set))
+      // Force immediate suspend if not set by user and more threads than
+      // available procs
+      hibernate = 0;
+    else
+      hibernate = this_thr->th.th_team_bt_intervals;
+#else
+    hibernate = this_thr->th.th_team_bt_intervals;
+#endif /* KMP_ADJUST_BLOCKTIME */
+
+    /* If the blocktime is nonzero, we want to make sure that we spin wait for
+       the entirety of the specified #intervals, plus up to one interval more.
+       This increment make certain that this thread doesn't go to sleep too
+       soon.  */
+    if (hibernate != 0)
+      hibernate++;
+
+    // Add in the current time value.
+    hibernate += TCR_4(__kmp_global.g.g_time.dt.t_value);
+    KF_TRACE(20, ("__kmp_wait_sleep: T#%d now=%d, hibernate=%d, intervals=%d\n",
+                  th_gtid, __kmp_global.g.g_time.dt.t_value, hibernate,
+                  hibernate - __kmp_global.g.g_time.dt.t_value));
+#else
+    if (__kmp_pause_status == kmp_soft_paused) {
+      // Force immediate suspend
+      hibernate_goal = KMP_NOW();
+    } else
+      hibernate_goal = KMP_NOW() + this_thr->th.th_team_bt_intervals;
+    poll_count = 0;
+#endif // KMP_USE_MONITOR
+  }
+
+  oversubscribed = (TCR_4(__kmp_nth) > __kmp_avail_proc);
+  KMP_MB();
+
+  // Main wait spin loop
+  while (flag->notdone_check()) {
+    kmp_task_team_t *task_team = NULL;
+    if (__kmp_tasking_mode != tskm_immediate_exec) {
+      task_team = this_thr->th.th_task_team;
+      /* If the thread's task team pointer is NULL, it means one of 3 things:
+         1) A newly-created thread is first being released by
+         __kmp_fork_barrier(), and its task team has not been set up yet.
+         2) All tasks have been executed to completion.
+         3) Tasking is off for this region.  This could be because we are in a
+         serialized region (perhaps the outer one), or else tasking was manually
+         disabled (KMP_TASKING=0).  */
+      if (task_team != NULL) {
+        if (TCR_SYNC_4(task_team->tt.tt_active)) {
+          if (KMP_TASKING_ENABLED(task_team))
+            flag->execute_tasks(
+                this_thr, th_gtid, final_spin,
+                &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
+          else
+            this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
+        } else {
+          KMP_DEBUG_ASSERT(!KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid));
+#if OMPT_SUPPORT
+          // task-team is done now, other cases should be catched above
+          if (final_spin && ompt_enabled.enabled)
+            __ompt_implicit_task_end(this_thr, ompt_entry_state, tId);
+#endif
+          this_thr->th.th_task_team = NULL;
+          this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
+        }
+      } else {
+        this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
+      } // if
+    } // if
+
+    KMP_FSYNC_SPIN_PREPARE(CCAST(void *, spin));
+    if (TCR_4(__kmp_global.g.g_done)) {
+      if (__kmp_global.g.g_abort)
+        __kmp_abort_thread();
+      break;
+    }
+
+    // If we are oversubscribed, or have waited a bit (and
+    // KMP_LIBRARY=throughput), then yield
+    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+
+#if KMP_STATS_ENABLED
+    // Check if thread has been signalled to idle state
+    // This indicates that the logical "join-barrier" has finished
+    if (this_thr->th.th_stats->isIdle() &&
+        KMP_GET_THREAD_STATE() == FORK_JOIN_BARRIER) {
+      KMP_SET_THREAD_STATE(IDLE);
+      KMP_PUSH_PARTITIONED_TIMER(OMP_idle);
+    }
+#endif
+    // Check if the barrier surrounding this wait loop has been cancelled
+    if (cancellable) {
+      kmp_team_t *team = this_thr->th.th_team;
+      if (team && team->t.t_cancel_request == cancel_parallel)
+        break;
+    }
+
+    // Don't suspend if KMP_BLOCKTIME is set to "infinite"
+    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
+        __kmp_pause_status != kmp_soft_paused)
+      continue;
+
+    // Don't suspend if there is a likelihood of new tasks being spawned.
+    if ((task_team != NULL) && TCR_4(task_team->tt.tt_found_tasks))
+      continue;
+
+#if KMP_USE_MONITOR
+    // If we have waited a bit more, fall asleep
+    if (TCR_4(__kmp_global.g.g_time.dt.t_value) < hibernate)
+      continue;
+#else
+    if (KMP_BLOCKING(hibernate_goal, poll_count++))
+      continue;
+#endif
+    // Don't suspend if wait loop designated non-sleepable
+    // in template parameters
+    if (!sleepable)
+      continue;
+
+    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
+        __kmp_pause_status != kmp_soft_paused)
+      continue;
+
+    KF_TRACE(50, ("__kmp_wait_sleep: T#%d suspend time reached\n", th_gtid));
+
+#if KMP_OS_UNIX
+    if (final_spin)
+      KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, false);
+#endif
+    flag->suspend(th_gtid);
+#if KMP_OS_UNIX
+    if (final_spin)
+      KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, true);
+#endif
+
+    if (TCR_4(__kmp_global.g.g_done)) {
+      if (__kmp_global.g.g_abort)
+        __kmp_abort_thread();
+      break;
+    } else if (__kmp_tasking_mode != tskm_immediate_exec &&
+               this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
+      this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
+    }
+    // TODO: If thread is done with work and times out, disband/free
+  }
+
+#if OMPT_SUPPORT
+  ompt_state_t ompt_exit_state = this_thr->th.ompt_thread_info.state;
+  if (ompt_enabled.enabled && ompt_exit_state != ompt_state_undefined) {
+#if OMPT_OPTIONAL
+    if (final_spin) {
+      __ompt_implicit_task_end(this_thr, ompt_exit_state, tId);
+      ompt_exit_state = this_thr->th.ompt_thread_info.state;
+    }
+#endif
+    if (ompt_exit_state == ompt_state_idle) {
+      this_thr->th.ompt_thread_info.state = ompt_state_overhead;
+    }
+  }
+#endif
+#if KMP_STATS_ENABLED
+  // If we were put into idle state, pop that off the state stack
+  if (KMP_GET_THREAD_STATE() == IDLE) {
+    KMP_POP_PARTITIONED_TIMER();
+    KMP_SET_THREAD_STATE(thread_state);
+    this_thr->th.th_stats->resetIdleFlag();
+  }
+#endif
+
+#if KMP_OS_UNIX
+  if (final_spin)
+    KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, false);
+#endif
+  KMP_FSYNC_SPIN_ACQUIRED(CCAST(void *, spin));
+  if (cancellable) {
+    kmp_team_t *team = this_thr->th.th_team;
+    if (team && team->t.t_cancel_request == cancel_parallel) {
+      if (tasks_completed) {
+        // undo the previous decrement of unfinished_threads so that the
+        // thread can decrement at the join barrier with no problem
+        kmp_task_team_t *task_team = this_thr->th.th_task_team;
+        std::atomic<kmp_int32> *unfinished_threads =
+            &(task_team->tt.tt_unfinished_threads);
+        KMP_ATOMIC_INC(unfinished_threads);
+      }
+      return true;
+    }
+  }
+  return false;
+}
+
+/* Release the flag and wake up any thread sleeping on it.
+
+   The flag is first moved to its released state via flag->internal_release().
+   Unless the blocktime is infinite (KMP_MAX_BLOCKTIME), the flag's sleep
+   bit(s) are then checked, and every waiter recorded in the flag is resumed.
+   A thread that calls __kmp_wait_template relies on this function being
+   called to wake it up if it went to sleep; otherwise it can deadlock.
+
+   flag: flag object to release. Its flag location must be non-NULL (checked
+   by KMP_DEBUG_ASSERT).  */
+template <class C> static inline void __kmp_release_template(C *flag) {
+#ifdef KMP_DEBUG
+  // gtid is referenced only by the KF_TRACE calls below.
+  int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1;
+#endif
+  // The flag location is a pointer, so it is printed with %p.
+  KF_TRACE(20, ("__kmp_release: T#%d releasing flag(%p)\n", gtid,
+                flag->get_void_p()));
+  KMP_DEBUG_ASSERT(flag->get());
+  KMP_FSYNC_RELEASING(flag->get_void_p());
+
+  flag->internal_release();
+
+  // Print both the flag location and the value it holds after the release;
+  // every argument has a matching conversion in the format string.
+  KF_TRACE(100, ("__kmp_release: T#%d set new spin(%p)=%d\n", gtid,
+                 flag->get_void_p(), flag->load()));
+
+  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
+    // Only need to check sleep stuff if infinite block time not set.
+    // Are *any* threads waiting on flag sleeping?
+    if (flag->is_any_sleeping()) {
+      for (unsigned int i = 0; i < flag->get_num_waiters(); ++i) {
+        // if sleeping waiter exists at i, sets current_waiter to i inside flag
+        kmp_info_t *waiter = flag->get_waiter(i);
+        if (waiter) {
+          int wait_gtid = waiter->th.th_info.ds.ds_gtid;
+          // Wake up thread if needed
+          KF_TRACE(50, ("__kmp_release: T#%d waking up thread T#%d since sleep "
+                        "flag(%p) set\n",
+                        gtid, wait_gtid, flag->get_void_p()));
+          flag->resume(wait_gtid); // unsets flag's current_waiter when done
+        }
+      }
+    }
+  }
+}
+
+/* Per-width helpers for flag words. The primary template is deliberately
+   empty; the explicit specializations supply the members.  */
+template <typename FlagType> struct flag_traits {};
+
+/* Traits for 32-bit flag words: the flag_type tag (flag32) plus the 32-bit
+   read and test-then-modify primitives.  */
+template <> struct flag_traits<kmp_uint32> {
+  using flag_t = kmp_uint32;
+  static const flag_type t = flag32;
+  // Pass the value through the 4-byte TCR_4 read macro.
+  static inline flag_t tcr(flag_t value) { return TCR_4(value); }
+  // Add 4 to *loc via KMP_TEST_THEN_ADD4_32 and return that macro's result.
+  static inline flag_t test_then_add4(volatile flag_t *loc) {
+    volatile kmp_int32 *signed_loc = RCAST(volatile kmp_int32 *, loc);
+    return KMP_TEST_THEN_ADD4_32(signed_loc);
+  }
+  // OR bits into *loc via KMP_TEST_THEN_OR32 and return that macro's result.
+  static inline flag_t test_then_or(volatile flag_t *loc, flag_t bits) {
+    return KMP_TEST_THEN_OR32(loc, bits);
+  }
+  // AND bits into *loc via KMP_TEST_THEN_AND32 and return that macro's result.
+  static inline flag_t test_then_and(volatile flag_t *loc, flag_t bits) {
+    return KMP_TEST_THEN_AND32(loc, bits);
+  }
+};
+
+/* Traits for 64-bit flag words: the flag_type tag (flag64) plus the 64-bit
+   read and test-then-modify primitives.  */
+template <> struct flag_traits<kmp_uint64> {
+  using flag_t = kmp_uint64;
+  static const flag_type t = flag64;
+  // Pass the value through the 8-byte TCR_8 read macro.
+  static inline flag_t tcr(flag_t value) { return TCR_8(value); }
+  // Add 4 to *loc via KMP_TEST_THEN_ADD4_64 and return that macro's result.
+  static inline flag_t test_then_add4(volatile flag_t *loc) {
+    volatile kmp_int64 *signed_loc = RCAST(volatile kmp_int64 *, loc);
+    return KMP_TEST_THEN_ADD4_64(signed_loc);
+  }
+  // OR bits into *loc via KMP_TEST_THEN_OR64 and return that macro's result.
+  static inline flag_t test_then_or(volatile flag_t *loc, flag_t bits) {
+    return KMP_TEST_THEN_OR64(loc, bits);
+  }
+  // AND bits into *loc via KMP_TEST_THEN_AND64 and return that macro's result.
+  static inline flag_t test_then_and(volatile flag_t *loc, flag_t bits) {
+    return KMP_TEST_THEN_AND64(loc, bits);
+  }
+};
+
+// Basic flag that does not use C11 Atomics
+template <typename FlagType>
+class kmp_basic_flag_native : public kmp_flag_native<FlagType> {
+  typedef flag_traits<FlagType> traits_type;
+  FlagType checker; /**< Value to compare flag to to check if flag has been
+                       released. */
+  kmp_info_t
+      *waiting_threads[1]; /**< Array of threads sleeping on this flag. */
+  kmp_uint32
+      num_waiting_threads; /**< Number of threads sleeping on this flag. */
+public:
+  // Every constructor initializes all three data members, so none of them is
+  // ever read while indeterminate, whichever constructor was used.
+  /*!
+   * @param p in   location of the flag
+   *
+   * No waiter is recorded and checker is zero-initialized.
+   */
+  kmp_basic_flag_native(volatile FlagType *p)
+      : kmp_flag_native<FlagType>(p, traits_type::t), checker(),
+        num_waiting_threads(0) {
+    waiting_threads[0] = NULL;
+  }
+  /*!
+   * @param p in     location of the flag
+   * @param thr in   thread recorded as the single waiter
+   *
+   * checker is zero-initialized.
+   */
+  kmp_basic_flag_native(volatile FlagType *p, kmp_info_t *thr)
+      : kmp_flag_native<FlagType>(p, traits_type::t), checker(),
+        num_waiting_threads(1) {
+    waiting_threads[0] = thr;
+  }
+  /*!
+   * @param p in   location of the flag
+   * @param c in   value the flag is compared to by the done checks
+   *
+   * No waiter is recorded.
+   */
+  kmp_basic_flag_native(volatile FlagType *p, FlagType c)
+      : kmp_flag_native<FlagType>(p, traits_type::t), checker(c),
+        num_waiting_threads(0) {
+    waiting_threads[0] = NULL;
+  }
+  /*!
+   * @param i in   index into waiting_threads
+   * @result the thread that is waiting at index i
+   */
+  kmp_info_t *get_waiter(kmp_uint32 i) {
+    KMP_DEBUG_ASSERT(i < num_waiting_threads);
+    return waiting_threads[i];
+  }
+  /*!
+   * @result num_waiting_threads
+   */
+  kmp_uint32 get_num_waiters() { return num_waiting_threads; }
+  /*!
+   * @param thr in   the thread which is now waiting
+   *
+   * Insert a waiting thread at index 0.
+   */
+  void set_waiter(kmp_info_t *thr) {
+    waiting_threads[0] = thr;
+    num_waiting_threads = 1;
+  }
+  /*!
+   * @result true if the flag object has been released.
+   */
+  bool done_check() { return traits_type::tcr(*(this->get())) == checker; }
+  /*!
+   * @param old_loc in   old value of flag
+   * @result true if the flag's old value indicates it was released.
+   */
+  bool done_check_val(FlagType old_loc) { return old_loc == checker; }
+  /*!
+   * @result true if the flag object is not yet released.
+   * Used in __kmp_wait_template like:
+   * @code
+   * while (flag.notdone_check()) { pause(); }
+   * @endcode
+   */
+  bool notdone_check() { return traits_type::tcr(*(this->get())) != checker; }
+  /*!
+   * Trigger all waiting threads to run by modifying flag to release state.
+   * The flag is advanced by 4 through traits_type::test_then_add4(); the
+   * value returned by that call is discarded.
+   */
+  void internal_release() {
+    (void)traits_type::test_then_add4((volatile FlagType *)this->get());
+  }
+  /*!
+   * @result Actual flag value before sleep bit(s) set.
+   * Notes that there is at least one thread sleeping on the flag by setting
+   * sleep bit(s).
+   */
+  FlagType set_sleeping() {
+    return traits_type::test_then_or((volatile FlagType *)this->get(),
+                                     KMP_BARRIER_SLEEP_STATE);
+  }
+  /*!
+   * @result Actual flag value before sleep bit(s) cleared.
+   * Notes that there are no longer threads sleeping on the flag by clearing
+   * sleep bit(s).
+   */
+  FlagType unset_sleeping() {
+    return traits_type::test_then_and((volatile FlagType *)this->get(),
+                                      ~KMP_BARRIER_SLEEP_STATE);
+  }
+  /*!
+   * @param old_loc in   old value of flag
+   * Test whether there are threads sleeping on the flag's old value in old_loc.
+   */
+  bool is_sleeping_val(FlagType old_loc) {
+    return old_loc & KMP_BARRIER_SLEEP_STATE;
+  }
+  /*!
+   * Test whether there are threads sleeping on the flag.
+   */
+  bool is_sleeping() { return is_sleeping_val(*(this->get())); }
+  /*!
+   * Same test as is_sleeping(); this is the name __kmp_release_template calls.
+   */
+  bool is_any_sleeping() { return is_sleeping_val(*(this->get())); }
+  /*!
+   * @result always NULL for this flag type.
+   */
+  kmp_uint8 *get_stolen() { return NULL; }
+  /*!
+   * @result always bs_last_barrier for this flag type.
+   */
+  enum barrier_type get_bt() { return bs_last_barrier; }
+};
+
+// Basic flag whose location is a std::atomic<FlagType>
+template <typename FlagType> class kmp_basic_flag : public kmp_flag<FlagType> {
+  typedef flag_traits<FlagType> traits_type;
+  FlagType checker; /**< Value to compare flag to to check if flag has been
+                       released. */
+  kmp_info_t
+      *waiting_threads[1]; /**< Array of threads sleeping on this flag. */
+  kmp_uint32
+      num_waiting_threads; /**< Number of threads sleeping on this flag. */
+public:
+  // Every constructor initializes all three data members, so none of them is
+  // ever read while indeterminate, whichever constructor was used.
+  /*!
+   * @param p in   location of the flag
+   *
+   * No waiter is recorded and checker is zero-initialized.
+   */
+  kmp_basic_flag(std::atomic<FlagType> *p)
+      : kmp_flag<FlagType>(p, traits_type::t), checker(),
+        num_waiting_threads(0) {
+    waiting_threads[0] = NULL;
+  }
+  /*!
+   * @param p in     location of the flag
+   * @param thr in   thread recorded as the single waiter
+   *
+   * checker is zero-initialized.
+   */
+  kmp_basic_flag(std::atomic<FlagType> *p, kmp_info_t *thr)
+      : kmp_flag<FlagType>(p, traits_type::t), checker(),
+        num_waiting_threads(1) {
+    waiting_threads[0] = thr;
+  }
+  /*!
+   * @param p in   location of the flag
+   * @param c in   value the flag is compared to by the done checks
+   *
+   * No waiter is recorded.
+   */
+  kmp_basic_flag(std::atomic<FlagType> *p, FlagType c)
+      : kmp_flag<FlagType>(p, traits_type::t), checker(c),
+        num_waiting_threads(0) {
+    waiting_threads[0] = NULL;
+  }
+  /*!
+   * @param i in   index into waiting_threads
+   * @result the thread that is waiting at index i
+   */
+  kmp_info_t *get_waiter(kmp_uint32 i) {
+    KMP_DEBUG_ASSERT(i < num_waiting_threads);
+    return waiting_threads[i];
+  }
+  /*!
+   * @result num_waiting_threads
+   */
+  kmp_uint32 get_num_waiters() { return num_waiting_threads; }
+  /*!
+   * @param thr in   the thread which is now waiting
+   *
+   * Insert a waiting thread at index 0.
+   */
+  void set_waiter(kmp_info_t *thr) {
+    waiting_threads[0] = thr;
+    num_waiting_threads = 1;
+  }
+  /*!
+   * @result true if the flag object has been released.
+   */
+  bool done_check() { return this->load() == checker; }
+  /*!
+   * @param old_loc in   old value of flag
+   * @result true if the flag's old value indicates it was released.
+   */
+  bool done_check_val(FlagType old_loc) { return old_loc == checker; }
+  /*!
+   * @result true if the flag object is not yet released.
+   * Used in __kmp_wait_template like:
+   * @code
+   * while (flag.notdone_check()) { pause(); }
+   * @endcode
+   */
+  bool notdone_check() { return this->load() != checker; }
+  /*!
+   * Trigger all waiting threads to run by modifying flag to release state.
+   * The flag is advanced by 4 through KMP_ATOMIC_ADD; nothing is returned.
+   */
+  void internal_release() { KMP_ATOMIC_ADD(this->get(), 4); }
+  /*!
+   * @result Actual flag value before sleep bit(s) set.
+   * Notes that there is at least one thread sleeping on the flag by setting
+   * sleep bit(s).
+   */
+  FlagType set_sleeping() {
+    return KMP_ATOMIC_OR(this->get(), KMP_BARRIER_SLEEP_STATE);
+  }
+  /*!
+   * @result Actual flag value before sleep bit(s) cleared.
+   * Notes that there are no longer threads sleeping on the flag by clearing
+   * sleep bit(s).
+   */
+  FlagType unset_sleeping() {
+    return KMP_ATOMIC_AND(this->get(), ~KMP_BARRIER_SLEEP_STATE);
+  }
+  /*!
+   * @param old_loc in   old value of flag
+   * Test whether there are threads sleeping on the flag's old value in old_loc.
+   */
+  bool is_sleeping_val(FlagType old_loc) {
+    return old_loc & KMP_BARRIER_SLEEP_STATE;
+  }
+  /*!
+   * Test whether there are threads sleeping on the flag.
+   */
+  bool is_sleeping() { return is_sleeping_val(this->load()); }
+  /*!
+   * Same test as is_sleeping(); this is the name __kmp_release_template calls.
+   */
+  bool is_any_sleeping() { return is_sleeping_val(this->load()); }
+  /*!
+   * @result always NULL for this flag type.
+   */
+  kmp_uint8 *get_stolen() { return NULL; }
+  /*!
+   * @result always bs_last_barrier for this flag type.
+   */
+  enum barrier_type get_bt() { return bs_last_barrier; }
+};
+
+/* 32-bit flag built on kmp_basic_flag<kmp_uint32>. Every operation forwards
+   to the matching 32-bit runtime routine or to the wait/release templates.  */
+class kmp_flag_32 : public kmp_basic_flag<kmp_uint32> {
+  typedef kmp_basic_flag<kmp_uint32> base_type;
+
+public:
+  kmp_flag_32(std::atomic<kmp_uint32> *loc) : base_type(loc) {}
+  kmp_flag_32(std::atomic<kmp_uint32> *loc, kmp_info_t *waiter)
+      : base_type(loc, waiter) {}
+  kmp_flag_32(std::atomic<kmp_uint32> *loc, kmp_uint32 check_val)
+      : base_type(loc, check_val) {}
+  // Forwards to __kmp_suspend_32 with this flag.
+  void suspend(int gtid) { __kmp_suspend_32(gtid, this); }
+  // Forwards to __kmp_resume_32 with this flag.
+  void resume(int gtid) { __kmp_resume_32(gtid, this); }
+  // Forwards to __kmp_execute_tasks_32 with this flag and returns its result.
+  int execute_tasks(kmp_info_t *thr, kmp_int32 gtid, int final_spin,
+                    int *finished USE_ITT_BUILD_ARG(void *itt_obj),
+                    kmp_int32 constrained) {
+    return __kmp_execute_tasks_32(thr, gtid, this, final_spin,
+                                  finished USE_ITT_BUILD_ARG(itt_obj),
+                                  constrained);
+  }
+  // Wait on this flag. final_spin is a run-time value but the wait template
+  // takes it as a template argument, so each case has its own instantiation.
+  void wait(kmp_info_t *thr, int final_spin USE_ITT_BUILD_ARG(void *itt_obj)) {
+    if (!final_spin) {
+      __kmp_wait_template<kmp_flag_32, FALSE>(thr,
+                                              this USE_ITT_BUILD_ARG(itt_obj));
+    } else {
+      __kmp_wait_template<kmp_flag_32, TRUE>(thr,
+                                             this USE_ITT_BUILD_ARG(itt_obj));
+    }
+  }
+  // Release this flag through __kmp_release_template.
+  void release() { __kmp_release_template(this); }
+  // Tag identifying this flag as a 32-bit flag.
+  flag_type get_ptr_type() { return flag32; }
+};
+
+/* 64-bit flag built on kmp_basic_flag_native<kmp_uint64>. Every operation
+   forwards to the matching 64-bit runtime routine or to the wait/release
+   templates.  */
+class kmp_flag_64 : public kmp_basic_flag_native<kmp_uint64> {
+  typedef kmp_basic_flag_native<kmp_uint64> base_type;
+
+public:
+  kmp_flag_64(volatile kmp_uint64 *loc) : base_type(loc) {}
+  kmp_flag_64(volatile kmp_uint64 *loc, kmp_info_t *waiter)
+      : base_type(loc, waiter) {}
+  kmp_flag_64(volatile kmp_uint64 *loc, kmp_uint64 check_val)
+      : base_type(loc, check_val) {}
+  // Forwards to __kmp_suspend_64 with this flag.
+  void suspend(int gtid) { __kmp_suspend_64(gtid, this); }
+  // Forwards to __kmp_resume_64 with this flag.
+  void resume(int gtid) { __kmp_resume_64(gtid, this); }
+  // Forwards to __kmp_execute_tasks_64 with this flag and returns its result.
+  int execute_tasks(kmp_info_t *thr, kmp_int32 gtid, int final_spin,
+                    int *finished USE_ITT_BUILD_ARG(void *itt_obj),
+                    kmp_int32 constrained) {
+    return __kmp_execute_tasks_64(thr, gtid, this, final_spin,
+                                  finished USE_ITT_BUILD_ARG(itt_obj),
+                                  constrained);
+  }
+  // Wait on this flag. final_spin is a run-time value but the wait template
+  // takes it as a template argument, so each case has its own instantiation.
+  void wait(kmp_info_t *thr, int final_spin USE_ITT_BUILD_ARG(void *itt_obj)) {
+    if (!final_spin) {
+      __kmp_wait_template<kmp_flag_64, FALSE>(thr,
+                                              this USE_ITT_BUILD_ARG(itt_obj));
+    } else {
+      __kmp_wait_template<kmp_flag_64, TRUE>(thr,
+                                             this USE_ITT_BUILD_ARG(itt_obj));
+    }
+  }
+  // Same as wait(), but instantiates the wait template with two extra
+  // arguments (true, false) and hands its result back to the caller.
+  // NOTE(review): judging by this method's name the extra arguments are
+  // cancellable=true, sleepable=false -- confirm against the template header.
+  bool wait_cancellable_nosleep(kmp_info_t *thr,
+                                int final_spin
+                                    USE_ITT_BUILD_ARG(void *itt_obj)) {
+    if (!final_spin) {
+      return __kmp_wait_template<kmp_flag_64, FALSE, true, false>(
+          thr, this USE_ITT_BUILD_ARG(itt_obj));
+    }
+    return __kmp_wait_template<kmp_flag_64, TRUE, true, false>(
+        thr, this USE_ITT_BUILD_ARG(itt_obj));
+  }
+  // Release this flag through __kmp_release_template.
+  void release() { __kmp_release_template(this); }
+  // Tag identifying this flag as a 64-bit flag.
+  flag_type get_ptr_type() { return flag64; }
+};
+
+// Hierarchical 64-bit on-core barrier instantiation
+class kmp_flag_oncore : public kmp_flag_native<kmp_uint64> {
+  kmp_uint64 checker; /**< Value the byte of interest is compared to in
+                         done_check_val(). */
+  kmp_info_t *waiting_threads[1]; /**< Thread recorded by set_waiter(). */
+  kmp_uint32 num_waiting_threads; /**< Number of recorded waiters (0 or 1). */
+  kmp_uint32
+      offset; /**< Portion of flag that is of interest for an operation. */
+  bool flag_switch; /**< Indicates a switch in flag location. */
+  enum barrier_type bt; /**< Barrier type. */
+  kmp_info_t *this_thr; /**< Thread that may be redirected to different flag
+                           location. */
+#if USE_ITT_BUILD
+  void *
+      itt_sync_obj; /**< ITT object that must be passed to new flag location. */
+#endif
+  // Access the byte at index byte_idx inside the 64-bit word at loc.
+  unsigned char &byteref(volatile kmp_uint64 *loc, size_t byte_idx) {
+    return (RCAST(unsigned char *, CCAST(kmp_uint64 *, loc)))[byte_idx];
+  }
+
+public:
+  // Every constructor initializes all data members, so none of them is ever
+  // read while indeterminate. Members a constructor has no argument for get
+  // 0 / NULL / false, and bt gets bs_last_barrier.
+  kmp_flag_oncore(volatile kmp_uint64 *p)
+      : kmp_flag_native<kmp_uint64>(p, flag_oncore), checker(0),
+        num_waiting_threads(0), offset(0), flag_switch(false),
+        bt(bs_last_barrier),
+        this_thr(NULL) USE_ITT_BUILD_ARG(itt_sync_obj(NULL)) {
+    waiting_threads[0] = NULL;
+  }
+  kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint32 idx)
+      : kmp_flag_native<kmp_uint64>(p, flag_oncore), checker(0),
+        num_waiting_threads(0), offset(idx), flag_switch(false),
+        bt(bs_last_barrier),
+        this_thr(NULL) USE_ITT_BUILD_ARG(itt_sync_obj(NULL)) {
+    waiting_threads[0] = NULL;
+  }
+  kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint64 c, kmp_uint32 idx,
+                  enum barrier_type bar_t,
+                  kmp_info_t *thr USE_ITT_BUILD_ARG(void *itt))
+      : kmp_flag_native<kmp_uint64>(p, flag_oncore), checker(c),
+        num_waiting_threads(0), offset(idx), flag_switch(false), bt(bar_t),
+        this_thr(thr) USE_ITT_BUILD_ARG(itt_sync_obj(itt)) {
+    waiting_threads[0] = NULL;
+  }
+  // Return the waiter stored at index i (i must be < num_waiting_threads).
+  kmp_info_t *get_waiter(kmp_uint32 i) {
+    KMP_DEBUG_ASSERT(i < num_waiting_threads);
+    return waiting_threads[i];
+  }
+  kmp_uint32 get_num_waiters() { return num_waiting_threads; }
+  // Record thr as the single waiter at index 0.
+  void set_waiter(kmp_info_t *thr) {
+    waiting_threads[0] = thr;
+    num_waiting_threads = 1;
+  }
+  // True if the byte at `offset` within old_loc equals checker.
+  bool done_check_val(kmp_uint64 old_loc) {
+    return byteref(&old_loc, offset) == checker;
+  }
+  bool done_check() { return done_check_val(*get()); }
+  // Returns true while this thread's byte is not 1 and no flag switch has been
+  // requested. If a switch was requested, waits on the thread's own b_go flag
+  // before returning false.
+  bool notdone_check() {
+    // Calculate flag_switch
+    if (this_thr->th.th_bar[bt].bb.wait_flag == KMP_BARRIER_SWITCH_TO_OWN_FLAG)
+      flag_switch = true;
+    if (byteref(get(), offset) != 1 && !flag_switch)
+      return true;
+    else if (flag_switch) {
+      this_thr->th.th_bar[bt].bb.wait_flag = KMP_BARRIER_SWITCHING;
+      kmp_flag_64 flag(&this_thr->th.th_bar[bt].bb.b_go,
+                       (kmp_uint64)KMP_BARRIER_STATE_BUMP);
+      __kmp_wait_64(this_thr, &flag, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
+    }
+    return false;
+  }
+  // Set the byte at `offset` to 1: with a plain store when the blocktime is
+  // KMP_MAX_BLOCKTIME, otherwise by OR-ing a one-byte mask into the word with
+  // KMP_TEST_THEN_OR64.
+  void internal_release() {
+    // Other threads can write their own bytes simultaneously.
+    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
+      byteref(get(), offset) = 1;
+    } else {
+      kmp_uint64 mask = 0;
+      byteref(&mask, offset) = 1;
+      KMP_TEST_THEN_OR64(get(), mask);
+    }
+  }
+  // Set the sleep bit(s); returns the result of KMP_TEST_THEN_OR64.
+  kmp_uint64 set_sleeping() {
+    return KMP_TEST_THEN_OR64(get(), KMP_BARRIER_SLEEP_STATE);
+  }
+  // Clear the sleep bit(s); returns the result of KMP_TEST_THEN_AND64.
+  kmp_uint64 unset_sleeping() {
+    return KMP_TEST_THEN_AND64(get(), ~KMP_BARRIER_SLEEP_STATE);
+  }
+  // True if the sleep bit(s) are set in old_loc.
+  bool is_sleeping_val(kmp_uint64 old_loc) {
+    return old_loc & KMP_BARRIER_SLEEP_STATE;
+  }
+  bool is_sleeping() { return is_sleeping_val(*get()); }
+  bool is_any_sleeping() { return is_sleeping_val(*get()); }
+  // Wait on this flag. Note that the this_thr parameter hides the this_thr
+  // data member inside this method, while itt_sync_obj refers to the member.
+  void wait(kmp_info_t *this_thr, int final_spin) {
+    if (final_spin)
+      __kmp_wait_template<kmp_flag_oncore, TRUE>(
+          this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
+    else
+      __kmp_wait_template<kmp_flag_oncore, FALSE>(
+          this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj));
+  }
+  void release() { __kmp_release_template(this); }
+  void suspend(int th_gtid) { __kmp_suspend_oncore(th_gtid, this); }
+  void resume(int th_gtid) { __kmp_resume_oncore(th_gtid, this); }
+  // Forwards to __kmp_execute_tasks_oncore with this flag.
+  int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin,
+                    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
+                    kmp_int32 is_constrained) {
+    return __kmp_execute_tasks_oncore(
+        this_thr, gtid, this, final_spin,
+        thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
+  }
+  kmp_uint8 *get_stolen() { return NULL; }
+  enum barrier_type get_bt() { return bt; }
+  flag_type get_ptr_type() { return flag_oncore; }
+};
+
+// Wake up the thread identified by gtid. The volatile void* flag is usually
+// the th_sleep_loc associated with gtid. It is only inspected for its flag
+// type; the matching resume routine is then called with a NULL flag argument.
+// A NULL flag makes this a no-op.
+static inline void __kmp_null_resume_wrapper(int gtid, volatile void *flag) {
+  if (flag) {
+    // The type tag is read through a kmp_flag_64 view of the object, whatever
+    // its actual flag class is.
+    kmp_flag_64 *flag_view = RCAST(kmp_flag_64 *, CCAST(void *, flag));
+    switch (flag_view->get_type()) {
+    case flag32:
+      __kmp_resume_32(gtid, NULL);
+      break;
+    case flag64:
+      __kmp_resume_64(gtid, NULL);
+      break;
+    case flag_oncore:
+      __kmp_resume_oncore(gtid, NULL);
+      break;
+    }
+  }
+}
+
+/*!
+@}
+*/
+
+#endif // KMP_WAIT_RELEASE_H
diff --git a/final/runtime/src/kmp_wrapper_getpid.h b/final/runtime/src/kmp_wrapper_getpid.h
new file mode 100644
index 0000000..70db857
--- /dev/null
+++ b/final/runtime/src/kmp_wrapper_getpid.h
@@ -0,0 +1,75 @@
+/*
+ * kmp_wrapper_getpid.h -- getpid() declaration.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_WRAPPER_GETPID_H
+#define KMP_WRAPPER_GETPID_H
+
+#if KMP_OS_UNIX
+
+// On Unix-like systems (Linux* OS and OS X*) getpid() is declared in standard
+// headers.
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+// Select the OS-specific expansion of __kmp_gettid().
+#if KMP_OS_DARWIN
+// OS X
+#define __kmp_gettid() syscall(SYS_thread_selfid)
+#elif KMP_OS_FREEBSD
+#include <pthread_np.h>
+#define __kmp_gettid() pthread_getthreadid_np()
+#elif KMP_OS_NETBSD
+#include <lwp.h>
+#define __kmp_gettid() _lwp_self()
+#elif defined(SYS_gettid)
+// Hopefully other Unix systems define SYS_gettid syscall for getting os thread
+// id
+#define __kmp_gettid() syscall(SYS_gettid)
+#else
+// Fallback: no thread-id call is known here, so __kmp_gettid() expands to
+// getpid() and a compile-time warning is emitted.
+#warning No gettid found, use getpid instead
+#define __kmp_gettid() getpid()
+#endif
+
+#elif KMP_OS_WINDOWS
+
+// On Windows* OS _getpid() returns int (not pid_t) and is declared in
+// "process.h".
+#include <process.h>
+// Let us simulate Unix: provide pid_t (only when KMP_MSVC_COMPAT is set) and
+// map getpid onto _getpid.
+#if KMP_MSVC_COMPAT
+typedef int pid_t;
+#endif
+#define getpid _getpid
+#define __kmp_gettid() GetCurrentThreadId()
+
+#else
+
+#error Unknown or unsupported OS.
+
+#endif
+
+/* TODO: All the libomp source code uses the pid_t type for storing the result
+   of getpid(), which is good. But it is often printed as "%d", which is not
+   good because it ignores the pid_t definition (may pid_t be longer than
+   int?). It seems all pid prints should be rewritten as:
+
+   printf( "%" KMP_UINT64_SPEC, (kmp_uint64) pid );
+
+   or (at least) as
+
+   printf( "%" KMP_UINT32_SPEC, (kmp_uint32) pid );
+
+   (kmp_uint32, kmp_uint64, KMP_UINT64_SPEC, and KMP_UINT32_SPEC are defined in
+   "kmp_os.h".)  */
+
+#endif // KMP_WRAPPER_GETPID_H
+
+// end of file //
diff --git a/final/runtime/src/kmp_wrapper_malloc.h b/final/runtime/src/kmp_wrapper_malloc.h
new file mode 100644
index 0000000..a50387c
--- /dev/null
+++ b/final/runtime/src/kmp_wrapper_malloc.h
@@ -0,0 +1,196 @@
+/*
+ * kmp_wrapper_malloc.h -- Wrappers for memory allocation routines
+ *                         (malloc(), free(), and others).
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_WRAPPER_MALLOC_H
+#define KMP_WRAPPER_MALLOC_H
+
+/* This header serves 3 purposes:
+   1. Declaring standard memory allocation routines in OS-independent way.
+   2. Passing source location info through memory allocation wrappers.
+   3. Enabling native memory debugging capabilities.
+
+   1. Declaring standard memory allocation routines in OS-independent way.
+   -----------------------------------------------------------------------
+   On Linux* OS, alloca() function is declared in <alloca.h> header, while on
+   Windows* OS there is no <alloca.h> header, function _alloca() (note
+   underscore!) is declared in <malloc.h>. This header eliminates these
+   differences, so client code including "kmp_wrapper_malloc.h" can rely on
+   the following routines:
+
+        malloc
+        calloc
+        realloc
+        free
+        alloca
+
+   in OS-independent way. It also enables memory tracking capabilities in debug
+   build. (Currently it is available only on Windows* OS.)
+
+   2. Passing source location info through memory allocation wrappers.
+   -------------------------------------------------------------------
+   Some tools may help debugging memory errors, for example, report memory
+   leaks. However, memory allocation wrappers may hinder source location.
+   For example:
+
+   void * aligned_malloc( int size ) {
+     void * ptr = malloc( size ); // All the memory leaks will be reported at
+                                  // this line.
+     // some adjustments...
+     return ptr;
+   };
+
+   ptr = aligned_malloc( size ); // Memory leak will *not* be detected here. :-(
+
+   To overcome the problem, information about original source location should
+   be passed through all the memory allocation wrappers, for example:
+
+   void * aligned_malloc( int size, char const * file, int line ) {
+     void * ptr = _malloc_dbg( size, file, line );
+     // some adjustments...
+     return ptr;
+   };
+   void * ptr = aligned_malloc( size, __FILE__, __LINE__ );
+
+   This is a good idea for debug, but passing additional arguments impacts
+   performance. Disabling extra arguments in release version of the software
+   introduces too much conditional compilation, which makes code unreadable.
+   This header defines a few macros and functions facilitating it:
+
+   void * _aligned_malloc( int size KMP_SRC_LOC_DECL ) {
+     void * ptr = malloc_src_loc( size KMP_SRC_LOC_PARM );
+     // some adjustments...
+     return ptr;
+   };
+   #define aligned_malloc( size ) _aligned_malloc( (size) KMP_SRC_LOC_CURR )
+   // Use macro instead of direct call to function.
+
+   void * ptr = aligned_malloc( size );  // Bingo! Memory leak will be
+                                         // reported at this line.
+
+   3. Enabling native memory debugging capabilities.
+   -------------------------------------------------
+   Some platforms may offer memory debugging capabilities. For example, debug
+   version of Microsoft RTL tracks all memory allocations and can report memory
+   leaks. This header enables this, and makes report more useful (see "Passing
+   source location info through memory allocation wrappers").
+*/
+
+#include <stdlib.h>
+
+#include "kmp_os.h"
+
+// Include alloca() declaration.
+#if KMP_OS_WINDOWS
+#include <malloc.h> // Windows* OS: _alloca() declared in "malloc.h".
+#if KMP_MSVC_COMPAT
+#define alloca _alloca // Allow to use alloca() with no underscore.
+#endif
+#elif KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_OPENBSD
+// Declared in "stdlib.h", which is already included above.
+#elif KMP_OS_UNIX
+#include <alloca.h> // Linux* OS and OS X*: alloca() declared in <alloca.h>.
+#else
+#error Unknown or unsupported OS.
+#endif
+
+/* KMP_SRC_LOC_DECL -- Declaring source location parameters, to be used in
+   function declaration.
+   KMP_SRC_LOC_PARM -- Source location parameters, to be used to pass
+   parameters to underlying levels.
+   KMP_SRC_LOC_CURR -- Source location arguments describing current location,
+   to be used at top-level.
+
+   When KMP_DEBUG is set, each macro expands to a leading comma followed by
+   its arguments; otherwise it expands to nothing. No comma is therefore
+   written before the macro at the point of use.
+
+   Typical usage:
+   void * _aligned_malloc( int size KMP_SRC_LOC_DECL ) {
+     // Note: The comma before KMP_SRC_LOC_DECL is omitted on purpose.
+     KE_TRACE( 25, ( "called from %s:%d\n" KMP_SRC_LOC_PARM ) );
+     ...
+   }
+   #define aligned_malloc( size ) _aligned_malloc( (size) KMP_SRC_LOC_CURR )
+   // Use macro instead of direct call to function -- macro passes info
+   // about current source location to the func.
+*/
+#if KMP_DEBUG
+#define KMP_SRC_LOC_DECL , char const *_file_, int _line_
+#define KMP_SRC_LOC_PARM , _file_, _line_
+#define KMP_SRC_LOC_CURR , __FILE__, __LINE__
+#else
+#define KMP_SRC_LOC_DECL
+#define KMP_SRC_LOC_PARM
+#define KMP_SRC_LOC_CURR
+#endif // KMP_DEBUG
+
+/* malloc_src_loc() and free_src_loc() are pseudo-functions (really macros)
+   which accept extra arguments (source location info) in debug mode. They
+   should be used in place of malloc() and free(); this allows enabling native
+   memory debugging capabilities (if any).
+
+   Typical usage:
+   ptr = malloc_src_loc( size KMP_SRC_LOC_PARM );
+   // Inside memory allocation wrapper, or
+   ptr = malloc_src_loc( size KMP_SRC_LOC_CURR );
+   // Outside of memory allocation wrapper.
+*/
+#define malloc_src_loc(args) _malloc_src_loc(args)
+#define free_src_loc(args) _free_src_loc(args)
+/* Depending on build mode (debug or release), malloc_src_loc is declared with
+   1 or 3 parameters, but calls to malloc_src_loc() are always the same:
+
+   ... malloc_src_loc( size KMP_SRC_LOC_PARM ); // or KMP_SRC_LOC_CURR
+
+   Without the extra level, the compiler issues the warning/error "too few
+   arguments in macro invocation". Declaring two macros, malloc_src_loc() and
+   _malloc_src_loc(), overcomes the problem. */
+
+#if KMP_DEBUG
+
+#if KMP_OS_WINDOWS && _DEBUG
+// KMP_DEBUG != _DEBUG. MS debug RTL is available only if _DEBUG is defined.
+
+// Windows* OS has native memory debugging capabilities. Enable them.
+
+#include <crtdbg.h>
+
+// Redirect the standard allocation routines to their _dbg counterparts,
+// passing the block type and, where accepted, the current source location.
+#define KMP_MEM_BLOCK _CLIENT_BLOCK
+#define malloc(size) _malloc_dbg((size), KMP_MEM_BLOCK, __FILE__, __LINE__)
+#define calloc(num, size)                                                      \
+  _calloc_dbg((num), (size), KMP_MEM_BLOCK, __FILE__, __LINE__)
+#define realloc(ptr, size)                                                     \
+  _realloc_dbg((ptr), (size), KMP_MEM_BLOCK, __FILE__, __LINE__)
+#define free(ptr) _free_dbg((ptr), KMP_MEM_BLOCK)
+
+// The source location supplied by the caller is forwarded to _malloc_dbg;
+// _free_src_loc accepts it but does not use it.
+#define _malloc_src_loc(size, file, line)                                      \
+  _malloc_dbg((size), KMP_MEM_BLOCK, (file), (line))
+#define _free_src_loc(ptr, file, line) _free_dbg((ptr), KMP_MEM_BLOCK)
+
+#else
+
+// Linux* OS, OS X*, or non-debug Windows* OS.
+
+// The source location arguments are accepted but ignored.
+#define _malloc_src_loc(size, file, line) malloc((size))
+#define _free_src_loc(ptr, file, line) free((ptr))
+
+#endif
+
+#else
+
+// In release build malloc_src_loc() and free_src_loc() do not have extra
+// parameters.
+#define _malloc_src_loc(size) malloc((size))
+#define _free_src_loc(ptr) free((ptr))
+
+#endif // KMP_DEBUG
+
+#endif // KMP_WRAPPER_MALLOC_H
+
+// end of file //
diff --git a/final/runtime/src/libomp.rc.var b/final/runtime/src/libomp.rc.var
new file mode 100644
index 0000000..958cd04
--- /dev/null
+++ b/final/runtime/src/libomp.rc.var
@@ -0,0 +1,69 @@
+// libomp.rc.var
+
+//
+////===----------------------------------------------------------------------===//
+////
+//// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+//// See https://llvm.org/LICENSE.txt for license information.
+//// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+////
+////===----------------------------------------------------------------------===//
+//
+
+#include "winresrc.h"
+#include "kmp_config.h"
+
+LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US    // English (U.S.) resources
+#pragma code_page(1252)
+
+VS_VERSION_INFO VERSIONINFO
+    // Parts of FILEVERSION and PRODUCTVERSION are 16-bit fields, entire build date yyyymmdd
+    // does not fit into one version part, so we need to split it into yyyy and mmdd:
+    FILEVERSION    @LIBOMP_VERSION_MAJOR@,@LIBOMP_VERSION_MINOR@,@LIBOMP_VERSION_BUILD_YEAR@,@LIBOMP_VERSION_BUILD_MONTH_DAY@
+    PRODUCTVERSION @LIBOMP_VERSION_MAJOR@,@LIBOMP_VERSION_MINOR@,@LIBOMP_VERSION_BUILD_YEAR@,@LIBOMP_VERSION_BUILD_MONTH_DAY@
+    FILEFLAGSMASK  VS_FFI_FILEFLAGSMASK
+    FILEFLAGS      0
+#if KMP_DEBUG
+        | VS_FF_DEBUG
+#endif
+#if @LIBOMP_VERSION_BUILD@ == 0
+        | VS_FF_PRIVATEBUILD | VS_FF_PRERELEASE
+#endif
+    FILEOS          VOS_NT_WINDOWS32    // Windows* Server* 2003, XP*, 2000, or NT*
+    FILETYPE        VFT_DLL
+    BEGIN
+        BLOCK "StringFileInfo"
+        BEGIN
+            BLOCK "040904b0"            // U.S. English, Unicode (0x04b0 == 1200)
+            BEGIN
+
+                // FileDescription and LegalCopyright should be short.
+                VALUE "FileDescription",  "LLVM* OpenMP* Runtime Library\0"
+                // Following values may be relatively long.
+                VALUE "CompanyName",      "LLVM\0"
+                // VALUE "LegalTrademarks",  "\0"  // Not used for now.
+                VALUE "ProductName",      "LLVM* OpenMP* Runtime Library\0"
+                VALUE "ProductVersion",   "@LIBOMP_VERSION_MAJOR@.@LIBOMP_VERSION_MINOR@\0"
+                VALUE "FileVersion",      "@LIBOMP_VERSION_BUILD@\0"
+                VALUE "InternalName",     "@LIBOMP_LIB_FILE@\0"
+                VALUE "OriginalFilename", "@LIBOMP_LIB_FILE@\0"
+                VALUE "Comments",
+                    "LLVM* OpenMP* @LIBOMP_LEGAL_TYPE@ Library "
+                    "version @LIBOMP_VERSION_MAJOR@.@LIBOMP_VERSION_MINOR@.@LIBOMP_VERSION_BUILD@ "
+                    "for @LIBOMP_LEGAL_ARCH@ architecture built on @LIBOMP_BUILD_DATE@.\0"
+#if @LIBOMP_VERSION_BUILD@ == 0
+                    VALUE "PrivateBuild",
+                        "This is a development build.\0"
+#endif
+                // VALUE "SpecialBuild",     "\0"    // Not used for now.
+
+            END
+        END
+        BLOCK "VarFileInfo"
+        BEGIN
+            VALUE "Translation", 1033, 1200
+            // 1033 -- U.S. English, 1200 -- Unicode
+        END
+    END
+
+// end of file //
diff --git a/final/runtime/src/ompt-event-specific.h b/final/runtime/src/ompt-event-specific.h
new file mode 100644
index 0000000..da6a0e4
--- /dev/null
+++ b/final/runtime/src/ompt-event-specific.h
@@ -0,0 +1,106 @@
+/******************************************************************************
+ * File: ompt-event-specific.h
+ *
+ * Description:
+ *
+ *   specify which of the OMPT events are implemented by this runtime system
+ *   and the level of their implementation by a runtime system.
+ *****************************************************************************/
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __OMPT_EVENT_SPECIFIC_H__
+#define __OMPT_EVENT_SPECIFIC_H__
+
+#define _ompt_tokenpaste_helper(x, y) x##y
+#define _ompt_tokenpaste(x, y) _ompt_tokenpaste_helper(x, y)
+#define ompt_event_implementation_status(e) _ompt_tokenpaste(e, _implemented)
+
+/*----------------------------------------------------------------------------
+ | Specify whether an event may occur or not, and whether event callbacks
+ | never, sometimes, or always occur.
+ |
+ | The values for these constants are defined in section 6.1.2 of
+ | the OMPT TR. They are exposed to tools through ompt_set_callback.
+ +--------------------------------------------------------------------------*/
+
+#define ompt_event_UNIMPLEMENTED ompt_set_never
+#define ompt_event_MAY_CONVENIENT ompt_set_sometimes
+#define ompt_event_MAY_ALWAYS ompt_set_always
+
+#if OMPT_OPTIONAL
+#define ompt_event_MAY_ALWAYS_OPTIONAL ompt_event_MAY_ALWAYS
+#else
+#define ompt_event_MAY_ALWAYS_OPTIONAL ompt_event_UNIMPLEMENTED
+#endif
+
+/*----------------------------------------------------------------------------
+ | Mandatory Events
+ +--------------------------------------------------------------------------*/
+
+#define ompt_callback_thread_begin_implemented ompt_event_MAY_ALWAYS
+#define ompt_callback_thread_end_implemented ompt_event_MAY_ALWAYS
+
+#define ompt_callback_parallel_begin_implemented ompt_event_MAY_ALWAYS
+#define ompt_callback_parallel_end_implemented ompt_event_MAY_ALWAYS
+
+#define ompt_callback_task_create_implemented ompt_event_MAY_ALWAYS
+#define ompt_callback_task_schedule_implemented ompt_event_MAY_ALWAYS
+
+#define ompt_callback_implicit_task_implemented ompt_event_MAY_ALWAYS
+
+#define ompt_callback_target_implemented ompt_event_UNIMPLEMENTED
+#define ompt_callback_target_data_op_implemented ompt_event_UNIMPLEMENTED
+#define ompt_callback_target_submit_implemented ompt_event_UNIMPLEMENTED
+
+#define ompt_callback_control_tool_implemented ompt_event_MAY_ALWAYS
+
+#define ompt_callback_device_initialize_implemented ompt_event_UNIMPLEMENTED
+#define ompt_callback_device_finalize_implemented ompt_event_UNIMPLEMENTED
+
+#define ompt_callback_device_load_implemented ompt_event_UNIMPLEMENTED
+#define ompt_callback_device_unload_implemented ompt_event_UNIMPLEMENTED
+
+/*----------------------------------------------------------------------------
+ | Optional Events
+ +--------------------------------------------------------------------------*/
+
+#define ompt_callback_sync_region_wait_implemented                             \
+  ompt_event_MAY_ALWAYS_OPTIONAL
+
+#define ompt_callback_mutex_released_implemented ompt_event_MAY_ALWAYS_OPTIONAL
+
+#define ompt_callback_dependences_implemented                             \
+  ompt_event_MAY_ALWAYS_OPTIONAL
+#define ompt_callback_task_dependence_implemented ompt_event_MAY_ALWAYS_OPTIONAL
+
+#define ompt_callback_work_implemented ompt_event_MAY_ALWAYS_OPTIONAL
+
+#define ompt_callback_master_implemented ompt_event_MAY_ALWAYS_OPTIONAL
+
+#define ompt_callback_target_map_implemented ompt_event_UNIMPLEMENTED
+
+#define ompt_callback_sync_region_implemented ompt_event_MAY_ALWAYS_OPTIONAL
+
+#define ompt_callback_lock_init_implemented ompt_event_MAY_ALWAYS_OPTIONAL
+#define ompt_callback_lock_destroy_implemented ompt_event_MAY_ALWAYS_OPTIONAL
+
+#define ompt_callback_mutex_acquire_implemented ompt_event_MAY_ALWAYS_OPTIONAL
+#define ompt_callback_mutex_acquired_implemented ompt_event_MAY_ALWAYS_OPTIONAL
+#define ompt_callback_nest_lock_implemented ompt_event_MAY_ALWAYS_OPTIONAL
+
+#define ompt_callback_flush_implemented ompt_event_MAY_ALWAYS_OPTIONAL
+
+#define ompt_callback_cancel_implemented ompt_event_MAY_ALWAYS_OPTIONAL
+
+#define ompt_callback_reduction_implemented ompt_event_UNIMPLEMENTED
+
+#define ompt_callback_dispatch_implemented ompt_event_UNIMPLEMENTED
+
+#endif
diff --git a/final/runtime/src/ompt-general.cpp b/final/runtime/src/ompt-general.cpp
new file mode 100644
index 0000000..00bf606
--- /dev/null
+++ b/final/runtime/src/ompt-general.cpp
@@ -0,0 +1,732 @@
+/*
+ * ompt-general.cpp -- OMPT implementation of interface functions
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+/*****************************************************************************
+ * system include files
+ ****************************************************************************/
+
+#include <assert.h>
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#if KMP_OS_UNIX
+#include <dlfcn.h>
+#endif
+
+/*****************************************************************************
+ * ompt include files
+ ****************************************************************************/
+
+#include "ompt-specific.cpp"
+
+/*****************************************************************************
+ * macros
+ ****************************************************************************/
+
+#define ompt_get_callback_success 1
+#define ompt_get_callback_failure 0
+
+#define no_tool_present 0
+
+#define OMPT_API_ROUTINE static
+
+#ifndef OMPT_STR_MATCH
+#define OMPT_STR_MATCH(haystack, needle) (!strcasecmp(haystack, needle))
+#endif
+
+/*****************************************************************************
+ * types
+ ****************************************************************************/
+
+typedef struct {
+  const char *state_name;
+  ompt_state_t state_id;
+} ompt_state_info_t;
+
+typedef struct {
+  const char *name;
+  kmp_mutex_impl_t id;
+} kmp_mutex_impl_info_t;
+
+enum tool_setting_e {
+  omp_tool_error,
+  omp_tool_unset,
+  omp_tool_disabled,
+  omp_tool_enabled
+};
+
+/*****************************************************************************
+ * global variables
+ ****************************************************************************/
+
+ompt_callbacks_active_t ompt_enabled;
+
+ompt_state_info_t ompt_state_info[] = {
+#define ompt_state_macro(state, code) {#state, state},
+    FOREACH_OMPT_STATE(ompt_state_macro)
+#undef ompt_state_macro
+};
+
+kmp_mutex_impl_info_t kmp_mutex_impl_info[] = {
+#define kmp_mutex_impl_macro(name, id) {#name, name},
+    FOREACH_KMP_MUTEX_IMPL(kmp_mutex_impl_macro)
+#undef kmp_mutex_impl_macro
+};
+
+ompt_callbacks_internal_t ompt_callbacks;
+
+static ompt_start_tool_result_t *ompt_start_tool_result = NULL;
+
+/*****************************************************************************
+ * forward declarations
+ ****************************************************************************/
+
+static ompt_interface_fn_t ompt_fn_lookup(const char *s);
+
+OMPT_API_ROUTINE ompt_data_t *ompt_get_thread_data(void);
+
+/*****************************************************************************
+ * initialization and finalization (private operations)
+ ****************************************************************************/
+
+typedef ompt_start_tool_result_t *(*ompt_start_tool_t)(unsigned int,
+                                                       const char *);
+
+#if KMP_OS_DARWIN
+
+// While Darwin supports weak symbols, the library that wishes to provide a new
+// implementation has to link against this runtime which defeats the purpose
+// of having tools that are agnostic of the underlying runtime implementation.
+//
+// Fortunately, the linker includes all symbols of an executable in the global
+// symbol table by default so dlsym() even finds static implementations of
+// ompt_start_tool. For this to work on Linux, -Wl,--export-dynamic needs to be
+// passed when building the application which we don't want to rely on.
+
+static ompt_start_tool_result_t *ompt_tool_darwin(unsigned int omp_version,
+                                                  const char *runtime_version) {
+  // Look up ompt_start_tool among the symbols reachable via RTLD_DEFAULT and,
+  // if one exists, hand back whatever it returns; otherwise report NULL.
+  void *sym = dlsym(RTLD_DEFAULT, "ompt_start_tool");
+  if (sym == NULL)
+    return NULL;
+
+  ompt_start_tool_t tool_entry = (ompt_start_tool_t)sym;
+  return tool_entry(omp_version, runtime_version);
+}
+
+#elif OMPT_HAVE_WEAK_ATTRIBUTE
+
+// On Unix-like systems that support weak symbols the following implementation
+// of ompt_start_tool() will be used in case no tool-supplied implementation of
+// this function is present in the address space of a process.
+
+_OMP_EXTERN OMPT_WEAK_ATTRIBUTE ompt_start_tool_result_t *
+ompt_start_tool(unsigned int omp_version, const char *runtime_version) {
+  // Weak default: a tool's own ompt_start_tool may still exist further along
+  // the symbol search order, e.g. when the runtime library is linked before
+  // the tool (since glibc 2.2 a strong symbol does not override a weak one
+  // found earlier unless LD_DYNAMIC_WEAK is set). Delegate to the next
+  // definition if dlsym(RTLD_NEXT) finds one; otherwise report no tool.
+  void *next_sym = dlsym(RTLD_NEXT, "ompt_start_tool");
+  if (next_sym == NULL)
+    return NULL;
+
+  ompt_start_tool_t delegate = (ompt_start_tool_t)next_sym;
+  return delegate(omp_version, runtime_version);
+}
+
+#elif OMPT_HAVE_PSAPI
+
+// On Windows, the ompt_tool_windows function is used to find the
+// ompt_start_tool symbol across all modules loaded by a process. If
+// ompt_start_tool is found, ompt_start_tool's return value is used to
+// initialize the tool. Otherwise, NULL is returned and OMPT won't be enabled.
+
+#include <psapi.h>
+#pragma comment(lib, "psapi.lib")
+
+// The number of loaded modules to start enumeration with EnumProcessModules()
+#define NUM_MODULES 128
+
+static ompt_start_tool_result_t *
+ompt_tool_windows(unsigned int omp_version, const char *runtime_version) {
+  DWORD i, needed, new_size, buf_size = NUM_MODULES * sizeof(HMODULE);
+  HANDLE process = GetCurrentProcess();
+  HMODULE *modules = (HMODULE *)malloc(buf_size);
+  ompt_start_tool_t ompt_tool_p = NULL;
+
+#if OMPT_DEBUG
+  printf("ompt_tool_windows(): looking for ompt_start_tool\n");
+#endif
+  // Any failure (malloc or enumeration) means "no tool"; free(NULL) is a no-op.
+  if (!modules || !EnumProcessModules(process, modules, buf_size, &needed)) {
+    free(modules);
+    return NULL;
+  }
+  // Check if NUM_MODULES is enough to list all modules
+  if (needed > buf_size) {
+#if OMPT_DEBUG
+    printf("ompt_tool_windows(): resize buffer to %d bytes\n", needed);
+#endif
+    buf_size = needed;
+    HMODULE *grown = (HMODULE *)realloc(modules, buf_size);
+    // If realloc fails the old block is still ours and must be freed.
+    if (!grown || !EnumProcessModules(process, grown, buf_size, &needed)) {
+      free(grown ? grown : modules);
+      return NULL;
+    }
+    modules = grown;
+  }
+  // Clamp: needed can exceed buf_size if modules were loaded in between.
+  new_size = (needed < buf_size ? needed : buf_size) / sizeof(HMODULE);
+  for (i = 0; i < new_size; ++i) {
+    (FARPROC &)ompt_tool_p = GetProcAddress(modules[i], "ompt_start_tool");
+    if (ompt_tool_p) {
+#if OMPT_DEBUG
+      TCHAR modName[MAX_PATH];
+      if (GetModuleFileName(modules[i], modName, MAX_PATH))
+        printf("ompt_tool_windows(): ompt_start_tool found in module %s\n",
+               modName);
+#endif
+      free(modules);
+      return (*ompt_tool_p)(omp_version, runtime_version);
+    }
+#if OMPT_DEBUG
+    else {
+      TCHAR modName[MAX_PATH];
+      if (GetModuleFileName(modules[i], modName, MAX_PATH))
+        printf("ompt_tool_windows(): ompt_start_tool not found in module %s\n",
+               modName);
+    }
+#endif
+  }
+  free(modules);
+  return NULL;
+}
+#else
+#error Activation of OMPT is not supported on this platform.
+#endif
+
+static ompt_start_tool_result_t *
+ompt_try_start_tool(unsigned int omp_version, const char *runtime_version) {
+  ompt_start_tool_result_t *ret = NULL; // stays NULL if no tool is found
+  ompt_start_tool_t start_tool = NULL;
+#if KMP_OS_WINDOWS
+  // Cannot use colon to describe a list of absolute paths on Windows
+  const char *sep = ";";
+#else
+  const char *sep = ":";
+#endif
+  // Lookup order: (1) platform-specific in-process search, (2) OMP_TOOL_LIBRARIES
+#if KMP_OS_DARWIN
+  // Try in the current address space
+  ret = ompt_tool_darwin(omp_version, runtime_version);
+#elif OMPT_HAVE_WEAK_ATTRIBUTE
+  ret = ompt_start_tool(omp_version, runtime_version);
+#elif OMPT_HAVE_PSAPI
+  ret = ompt_tool_windows(omp_version, runtime_version);
+#else
+#error Activation of OMPT is not supported on this platform.
+#endif
+  if (ret)
+    return ret;
+
+  // Fall back to the tool-libraries-var ICV: each sep-separated entry is tried
+  const char *tool_libs = getenv("OMP_TOOL_LIBRARIES");
+  if (tool_libs) {
+    char *libs = __kmp_str_format("%s", tool_libs); // copy; freed below
+    char *buf; // token state handed to __kmp_str_token
+    char *fname = __kmp_str_token(libs, sep, &buf);
+    while (fname) {
+#if KMP_OS_UNIX
+      void *h = dlopen(fname, RTLD_LAZY); // NOTE(review): h is never closed here
+      if (h) {
+        start_tool = (ompt_start_tool_t)dlsym(h, "ompt_start_tool");
+#elif KMP_OS_WINDOWS
+      HMODULE h = LoadLibrary(fname); // NOTE(review): h is never freed here
+      if (h) {
+        start_tool = (ompt_start_tool_t)GetProcAddress(h, "ompt_start_tool");
+#else
+#error Activation of OMPT is not supported on this platform.
+#endif
+        if (start_tool && (ret = (*start_tool)(omp_version, runtime_version)))
+          break; // first library whose ompt_start_tool returns non-NULL wins
+      } // closes the if (h) opened in whichever platform branch was compiled
+      fname = __kmp_str_token(NULL, sep, &buf);
+    }
+    __kmp_str_free(&libs);
+  }
+  return ret;
+}
+
+void ompt_pre_init() {
+  // One-time pre-initialization: classify the OMP_TOOL environment variable
+  // and, unless it disables tools or is invalid, look for a tool to start.
+  // Calls after the first return immediately.
+  static int ompt_pre_initialized = 0;
+
+  if (ompt_pre_initialized)
+    return;
+
+  ompt_pre_initialized = 1;
+
+  // OMP_TOOL unset or empty -> omp_tool_unset; "disabled" / "enabled" (as
+  // matched by OMPT_STR_MATCH) -> the corresponding setting; any other value
+  // keeps the initial omp_tool_error.
+  const char *ompt_env_var = getenv("OMP_TOOL");
+  tool_setting_e tool_setting = omp_tool_error;
+
+  if (!ompt_env_var || !strcmp(ompt_env_var, ""))
+    tool_setting = omp_tool_unset;
+  else if (OMPT_STR_MATCH(ompt_env_var, "disabled"))
+    tool_setting = omp_tool_disabled;
+  else if (OMPT_STR_MATCH(ompt_env_var, "enabled"))
+    tool_setting = omp_tool_enabled;
+
+#if OMPT_DEBUG
+  printf("ompt_pre_init(): tool_setting = %d\n", tool_setting);
+#endif
+  switch (tool_setting) {
+  case omp_tool_disabled:
+    break;
+
+  case omp_tool_unset:
+  case omp_tool_enabled:
+    // Both settings attempt to find a tool; the result (possibly NULL) is
+    // stored in ompt_start_tool_result. The tool's initialize() is not called
+    // here, so every flag in ompt_enabled is cleared for now.
+    ompt_start_tool_result =
+        ompt_try_start_tool(__kmp_openmp_version, ompt_get_runtime_version());
+
+    memset(&ompt_enabled, 0, sizeof(ompt_enabled));
+    break;
+
+  case omp_tool_error:
+    fprintf(stderr, "Warning: OMP_TOOL has invalid value \"%s\".\n"
+                    "  legal values are (NULL,\"\",\"disabled\","
+                    "\"enabled\").\n",
+            ompt_env_var);
+    break;
+  }
+#if OMPT_DEBUG
+  // ompt_enabled is a bit-field struct: print its 'enabled' bit, not the struct
+  printf("ompt_pre_init(): ompt_enabled = %d\n", ompt_enabled.enabled);
+#endif
+}
+
+extern "C" int omp_get_initial_device(void);
+
+void ompt_post_init() {
+  //--------------------------------------------------
+  // Execute the post-initialization logic only once.
+  //--------------------------------------------------
+  static int ompt_post_initialized = 0;
+
+  if (ompt_post_initialized)
+    return;
+
+  ompt_post_initialized = 1;
+
+  //--------------------------------------------------
+  // Initialize the tool if so indicated.
+  //--------------------------------------------------
+  if (ompt_start_tool_result) {
+    ompt_enabled.enabled = !!ompt_start_tool_result->initialize(
+        ompt_fn_lookup, omp_get_initial_device(), &(ompt_start_tool_result->tool_data));
+
+    if (!ompt_enabled.enabled) {
+      // tool not enabled, zero out the bitmap, and done
+      memset(&ompt_enabled, 0, sizeof(ompt_enabled));
+      return;
+    }
+
+    kmp_info_t *root_thread = ompt_get_thread();
+
+    ompt_set_thread_state(root_thread, ompt_state_overhead);
+
+    if (ompt_enabled.ompt_callback_thread_begin) {
+      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
+          ompt_thread_initial, __ompt_get_thread_data_internal());
+    }
+    ompt_data_t *task_data;
+    ompt_data_t *parallel_data;
+    __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
+    if (ompt_enabled.ompt_callback_implicit_task) {
+      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
+          ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
+    }
+
+    ompt_set_thread_state(root_thread, ompt_state_work_serial);
+  }
+}
+
+void ompt_fini() {
+  // An active tool gets its finalize() call before all flags are cleared.
+  if (ompt_enabled.enabled)
+    ompt_start_tool_result->finalize(&ompt_start_tool_result->tool_data);
+
+  memset(&ompt_enabled, 0, sizeof(ompt_callbacks_active_t));
+}
+
+/*****************************************************************************
+ * interface operations
+ ****************************************************************************/
+
+/*****************************************************************************
+ * state
+ ****************************************************************************/
+
+OMPT_API_ROUTINE int ompt_enumerate_states(int current_state, int *next_state,
+                                           const char **next_state_name) {
+  // Reports the entry that follows current_state in ompt_state_info and
+  // returns 1; returns 0 when current_state is the last entry or not listed.
+  const static int count = sizeof(ompt_state_info) / sizeof(ompt_state_info_t);
+  for (int idx = 1; idx < count; ++idx) {
+    if (ompt_state_info[idx - 1].state_id != current_state)
+      continue;
+    const ompt_state_info_t *successor = &ompt_state_info[idx];
+    *next_state = successor->state_id;
+    *next_state_name = successor->state_name;
+    return 1;
+  }
+  return 0;
+}
+
+OMPT_API_ROUTINE int ompt_enumerate_mutex_impls(int current_impl,
+                                                int *next_impl,
+                                                const char **next_impl_name) {
+  // Yields the entry after current_impl in kmp_mutex_impl_info; 0 if none.
+  const static int count =
+      sizeof(kmp_mutex_impl_info) / sizeof(kmp_mutex_impl_info_t);
+  for (int pos = 0; pos + 1 < count; ++pos) {
+    if (kmp_mutex_impl_info[pos].id == current_impl) {
+      *next_impl = kmp_mutex_impl_info[pos + 1].id;
+      *next_impl_name = kmp_mutex_impl_info[pos + 1].name;
+      return 1;
+    }
+  }
+  return 0;
+}
+
+/*****************************************************************************
+ * callbacks
+ ****************************************************************************/
+
+OMPT_API_ROUTINE ompt_set_result_t ompt_set_callback(ompt_callbacks_t which,
+                                       ompt_callback_t callback) {
+  switch (which) {
+
+#define ompt_event_macro(event_name, callback_type, event_id)                  \
+  case event_name:                                                             \
+    if (ompt_event_implementation_status(event_name)) {                        \
+      ompt_callbacks.ompt_callback(event_name) = (callback_type)callback;      \
+      ompt_enabled.event_name = (callback != 0);                               \
+    }                                                                          \
+    if (callback)                                                              \
+      return ompt_event_implementation_status(event_name);                     \
+    else                                                                       \
+      return ompt_set_always;
+
+    FOREACH_OMPT_EVENT(ompt_event_macro)
+
+#undef ompt_event_macro
+
+  default:
+    return ompt_set_error;
+  }
+}
+
+OMPT_API_ROUTINE int ompt_get_callback(ompt_callbacks_t which,
+                                       ompt_callback_t *callback) {
+  if (!ompt_enabled.enabled)
+    return ompt_get_callback_failure;
+
+  switch (which) {
+
+#define ompt_event_macro(event_name, callback_type, event_id)                  \
+  case event_name:                                                             \
+    if (ompt_event_implementation_status(event_name)) {                        \
+      ompt_callback_t mycb =                                                   \
+          (ompt_callback_t)ompt_callbacks.ompt_callback(event_name);           \
+      if (ompt_enabled.event_name && mycb) {                                   \
+        *callback = mycb;                                                      \
+        return ompt_get_callback_success;                                      \
+      }                                                                        \
+    }                                                                          \
+    return ompt_get_callback_failure;
+
+    FOREACH_OMPT_EVENT(ompt_event_macro)
+
+#undef ompt_event_macro
+
+  default:
+    return ompt_get_callback_failure;
+  }
+}
+
+/*****************************************************************************
+ * parallel regions
+ ****************************************************************************/
+
+OMPT_API_ROUTINE int ompt_get_parallel_info(int ancestor_level,
+                                            ompt_data_t **parallel_data,
+                                            int *team_size) {
+  if (!ompt_enabled.enabled)
+    return 0;
+  return __ompt_get_parallel_info_internal(ancestor_level, parallel_data,
+                                           team_size);
+}
+
+OMPT_API_ROUTINE int ompt_get_state(ompt_wait_id_t *wait_id) {
+  // Returns the state obtained from __ompt_get_state_internal();
+  // ompt_state_work_serial stands in when OMPT is not enabled or the
+  // internal state is ompt_state_undefined.
+  if (ompt_enabled.enabled) {
+    int internal_state = __ompt_get_state_internal(wait_id);
+    if (internal_state != ompt_state_undefined)
+      return internal_state;
+  }
+  return ompt_state_work_serial;
+}
+
+/*****************************************************************************
+ * tasks
+ ****************************************************************************/
+
+OMPT_API_ROUTINE ompt_data_t *ompt_get_thread_data(void) {
+  if (!ompt_enabled.enabled)
+    return NULL;
+  return __ompt_get_thread_data_internal();
+}
+
+OMPT_API_ROUTINE int ompt_get_task_info(int ancestor_level, int *type,
+                                        ompt_data_t **task_data,
+                                        ompt_frame_t **task_frame,
+                                        ompt_data_t **parallel_data,
+                                        int *thread_num) {
+  if (!ompt_enabled.enabled)
+    return 0;
+  return __ompt_get_task_info_internal(ancestor_level, type, task_data,
+                                       task_frame, parallel_data, thread_num);
+}
+
+OMPT_API_ROUTINE int ompt_get_task_memory(void **addr, size_t *size,
+                                          int block) {
+  return __ompt_get_task_memory_internal(addr, size, block);
+}
+
+/*****************************************************************************
+ * num_procs
+ ****************************************************************************/
+
+OMPT_API_ROUTINE int ompt_get_num_procs(void) {
+  // copied from kmp_ftn_entry.h (but modified: OMPT can only be called when
+  // runtime is initialized)
+  return __kmp_avail_proc;
+}
+
+/*****************************************************************************
+ * places
+ ****************************************************************************/
+
+OMPT_API_ROUTINE int ompt_get_num_places(void) {
+// copied from kmp_ftn_entry.h (but modified)
+#if !KMP_AFFINITY_SUPPORTED
+  return 0;
+#else
+  if (!KMP_AFFINITY_CAPABLE())
+    return 0;
+  return __kmp_affinity_num_masks;
+#endif
+}
+
+OMPT_API_ROUTINE int ompt_get_place_proc_ids(int place_num, int ids_size,
+                                             int *ids) {
+// Adapted from kmp_ftn_entry.h. Returns the number of processors in place
+// place_num, or 0 if unavailable; fills ids only if all of them fit ids_size.
+#if !KMP_AFFINITY_SUPPORTED
+  return 0;
+#else
+  int i, count, pass;
+  if (!KMP_AFFINITY_CAPABLE())
+    return 0;
+  if (place_num < 0 || place_num >= (int)__kmp_affinity_num_masks)
+    return 0;
+  /* TODO: Is this safe for asynchronous call from signal handler during runtime
+   * shutdown? */
+  kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks, place_num);
+  // Pass 0 only counts; pass 1 runs only if everything fits and fills ids.
+  // This replaces a variable-length stack array sized by the caller-supplied
+  // ids_size (not standard C++, undefined for ids_size <= 0, unbounded stack).
+  for (pass = 0; pass < 2; pass++) {
+    count = 0;
+    KMP_CPU_SET_ITERATE(i, mask) {
+      if (KMP_CPU_ISSET(i, __kmp_affin_fullMask) && KMP_CPU_ISSET(i, mask)) {
+        if (pass == 1)
+          ids[count] = i;
+        count++;
+      }
+    }
+    if (ids_size < count)
+      break;
+  }
+  return count;
+#endif
+}
+
+OMPT_API_ROUTINE int ompt_get_place_num(void) {
+// copied from kmp_ftn_entry.h (but modified)
+#if !KMP_AFFINITY_SUPPORTED
+  return -1;
+#else
+  if (!ompt_enabled.enabled || __kmp_get_gtid() < 0)
+    return -1;
+
+  int gtid;
+  kmp_info_t *thread;
+  if (!KMP_AFFINITY_CAPABLE())
+    return -1;
+  gtid = __kmp_entry_gtid();
+  thread = __kmp_thread_from_gtid(gtid);
+  if (thread == NULL || thread->th.th_current_place < 0)
+    return -1;
+  return thread->th.th_current_place;
+#endif
+}
+
+OMPT_API_ROUTINE int ompt_get_partition_place_nums(int place_nums_size,
+                                                   int *place_nums) {
+// Adapted from kmp_ftn_entry.h. Returns the number of places in the calling
+// thread's place partition, or 0 if it cannot be determined (OMPT disabled,
+// no gtid for the caller, affinity unavailable, no thread, negative bounds).
+// place_nums is filled only when place_nums_size can hold every place.
+#if !KMP_AFFINITY_SUPPORTED
+  return 0;
+#else
+  if (!ompt_enabled.enabled || __kmp_get_gtid() < 0)
+    return 0;
+
+  if (!KMP_AFFINITY_CAPABLE())
+    return 0;
+  kmp_info_t *thread = __kmp_thread_from_gtid(__kmp_entry_gtid());
+  if (thread == NULL)
+    return 0;
+  int first_place = thread->th.th_first_place;
+  int last_place = thread->th.th_last_place;
+  if (first_place < 0 || last_place < 0)
+    return 0;
+  // The two bounds are accepted in either order; normalize to start <= end.
+  int start = (first_place <= last_place) ? first_place : last_place;
+  int end = (first_place <= last_place) ? last_place : first_place;
+  int num_places = end - start + 1;
+  // Copy only if all num_places entries fit. The previous test
+  // (end - start <= place_nums_size) let through the case
+  // place_nums_size == num_places - 1 and wrote one element past the buffer.
+  if (num_places <= place_nums_size) {
+    for (int i = 0; i < num_places; ++i)
+      place_nums[i] = start + i;
+  }
+  return num_places;
+#endif
+}
+
+/*****************************************************************************
+ * proc_id
+ ****************************************************************************/
+
+OMPT_API_ROUTINE int ompt_get_proc_id(void) {
+  if (!ompt_enabled.enabled || __kmp_get_gtid() < 0)
+    return -1;
+#if KMP_OS_LINUX
+  return sched_getcpu();
+#elif KMP_OS_WINDOWS
+  PROCESSOR_NUMBER pn;
+  GetCurrentProcessorNumberEx(&pn);
+  return 64 * pn.Group + pn.Number;
+#else
+  return -1;
+#endif
+}
+
+/*****************************************************************************
+ * compatibility
+ ****************************************************************************/
+
+/*
+ * Currently unused function
+OMPT_API_ROUTINE int ompt_get_ompt_version() { return OMPT_VERSION; }
+*/
+
+/*****************************************************************************
+ * application-facing API
+ ****************************************************************************/
+
+/*----------------------------------------------------------------------------
+ | control
+ ---------------------------------------------------------------------------*/
+
+int __kmp_control_tool(uint64_t command, uint64_t modifier, void *arg) {
+  // Forwards the request to the tool's control_tool callback.
+  // Returns -2 when no tool is active and -1 when the tool is active but has
+  // not registered a control_tool callback; otherwise the callback's result.
+  if (!ompt_enabled.enabled)
+    return -2;
+
+  if (!ompt_enabled.ompt_callback_control_tool)
+    return -1;
+
+  return ompt_callbacks.ompt_callback(ompt_callback_control_tool)(
+      command, modifier, arg, OMPT_LOAD_RETURN_ADDRESS(__kmp_entry_gtid()));
+}
+
+/*****************************************************************************
+ * misc
+ ****************************************************************************/
+
+OMPT_API_ROUTINE uint64_t ompt_get_unique_id(void) {
+  return __ompt_get_unique_id_internal();
+}
+
+OMPT_API_ROUTINE void ompt_finalize_tool(void) { __kmp_internal_end_atexit(); }
+
+/*****************************************************************************
+ * Target
+ ****************************************************************************/
+
+OMPT_API_ROUTINE int ompt_get_target_info(uint64_t *device_num,
+                                          ompt_id_t *target_id,
+                                          ompt_id_t *host_op_id) {
+  return 0; // thread is not in a target region
+}
+
+OMPT_API_ROUTINE int ompt_get_num_devices(void) {
+  return 1; // only one device (the current device) is available
+}
+
+/*****************************************************************************
+ * API inquiry for tool
+ ****************************************************************************/
+
+static ompt_interface_fn_t ompt_fn_lookup(const char *s) {
+  // Maps an inquiry-function name to its entry point; 0 for unknown names.
+#define ompt_interface_fn(fn)                                                  \
+  fn##_t fn##_f = fn;                                                          \
+  if (strcmp(s, #fn) == 0)                                                     \
+    return (ompt_interface_fn_t)fn##_f;
+  // The typed local makes the compiler check fn against its fn##_t type.
+  FOREACH_OMPT_INQUIRY_FN(ompt_interface_fn)
+#undef ompt_interface_fn
+  return (ompt_interface_fn_t)0;
+}
diff --git a/final/runtime/src/ompt-internal.h b/final/runtime/src/ompt-internal.h
new file mode 100644
index 0000000..5a6beaf
--- /dev/null
+++ b/final/runtime/src/ompt-internal.h
@@ -0,0 +1,126 @@
+/*
+ * ompt-internal.h - header of OMPT internal data structures
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __OMPT_INTERNAL_H__
+#define __OMPT_INTERNAL_H__
+
+#include "ompt-event-specific.h"
+#include "omp-tools.h"
+
+#define OMPT_VERSION 1
+
+#define _OMP_EXTERN extern "C"
+
+// Yields ompt_parallel_invoker_program when x equals fork_context_gnu and
+// ompt_parallel_invoker_runtime otherwise. The argument is parenthesized so
+// that an expression argument (e.g. a conditional) expands correctly.
+#define OMPT_INVOKER(x)                                                        \
+  (((x) == fork_context_gnu) ? ompt_parallel_invoker_program                   \
+                             : ompt_parallel_invoker_runtime)
+
+// Member name used for event e inside the callback table: the event name with
+// the suffix _callback pasted on.
+#define ompt_callback(e) e##_callback
+
+// Table holding one callback slot per OMPT event. FOREACH_OMPT_EVENT expands
+// ompt_event_macro once per event; each expansion declares a member of that
+// event's callback type whose name is given by ompt_callback(event).
+typedef struct ompt_callbacks_internal_s {
+#define ompt_event_macro(event, callback, eventid)                             \
+  callback ompt_callback(event);
+
+  FOREACH_OMPT_EVENT(ompt_event_macro)
+
+#undef ompt_event_macro
+} ompt_callbacks_internal_t;
+
+// Bit set of activation flags: a global `enabled` bit followed by one
+// single-bit field per OMPT event (named after the event), generated by
+// expanding ompt_event_macro through FOREACH_OMPT_EVENT.
+typedef struct ompt_callbacks_active_s {
+  unsigned int enabled : 1;
+#define ompt_event_macro(event, callback, eventid) unsigned int event : 1;
+
+  FOREACH_OMPT_EVENT(ompt_event_macro)
+
+#undef ompt_event_macro
+} ompt_callbacks_active_t;
+
+// Compute the OMPT task-flag detail bits for a task from its td_flags:
+//   - ompt_task_undeferred when task_serial or tasking_ser is set,
+//   - ompt_task_untied     when tiedness is clear,
+//   - ompt_task_final      when final is set,
+//   - ompt_task_mergeable  when merged_if0 is set.
+// info must be a pointer to an object with a td_flags member. Both the
+// argument and the whole expansion are parenthesized so that the macro
+// composes safely with neighbouring operators.
+#define TASK_TYPE_DETAILS_FORMAT(info)                                         \
+  ((((info)->td_flags.task_serial || (info)->td_flags.tasking_ser)             \
+        ? ompt_task_undeferred                                                 \
+        : 0x0) |                                                               \
+   ((!((info)->td_flags.tiedness)) ? ompt_task_untied : 0x0) |                 \
+   ((info)->td_flags.final ? ompt_task_final : 0x0) |                          \
+   ((info)->td_flags.merged_if0 ? ompt_task_mergeable : 0x0))
+
+// OMPT bookkeeping attached to a task (and to each lightweight task team).
+typedef struct {
+  // Frame record handed out to tools through the task-info inquiry.
+  ompt_frame_t frame;
+  // Data word handed out to tools through the task-info inquiry.
+  ompt_data_t task_data;
+  // When non-NULL, the ancestor walk for scheduling queries follows this link
+  // before falling back to the regular parent task.
+  struct kmp_taskdata *scheduling_parent;
+  int thread_num;
+  // Dependence records; ndeps is presumably the number of entries in deps --
+  // TODO confirm against the code that fills them in.
+  int ndeps;
+  ompt_dependence_t *deps;
+} ompt_task_info_t;
+
+// OMPT bookkeeping attached to a team (and to each lightweight task team).
+typedef struct {
+  // Data word handed out to tools through the parallel-info inquiry.
+  ompt_data_t parallel_data;
+  // Code pointer recorded when the region is set up (see
+  // __ompt_lw_taskteam_init, which stores its codeptr argument here).
+  void *master_return_address;
+} ompt_team_info_t;
+
+// Lightweight stand-in for a team plus its single task, used for serialized
+// parallel regions. Instances form a singly linked list through `parent`.
+typedef struct ompt_lw_taskteam_s {
+  ompt_team_info_t ompt_team_info;
+  ompt_task_info_t ompt_task_info;
+  // Non-zero when this object was heap-allocated by __ompt_lw_taskteam_link
+  // and therefore has to be freed by __ompt_lw_taskteam_unlink.
+  int heap;
+  // Next (older) lightweight task team, or NULL at the end of the list.
+  struct ompt_lw_taskteam_s *parent;
+} ompt_lw_taskteam_t;
+
+// Per-thread OMPT state.
+typedef struct {
+  // Data word handed out through __ompt_get_thread_data_internal().
+  ompt_data_t thread_data;
+  ompt_data_t task_data; /* stored here from implicit barrier-begin until
+                            implicit-task-end */
+  void *return_address; /* stored here on entry of runtime */
+  // Current state and wait id, reported by __ompt_get_state_internal().
+  ompt_state_t state;
+  ompt_wait_id_t wait_id;
+  int ompt_task_yielded;
+  void *idle_frame;
+} ompt_thread_info_t;
+
+extern ompt_callbacks_internal_t ompt_callbacks;
+
+// Allocator pair for the OMPT dependence arrays. Only defined when both
+// OMPT_SUPPORT and OMPT_OPTIONAL are enabled; maps to the runtime's fast
+// allocator when USE_FAST_MEMORY is set and to the per-thread malloc/free
+// otherwise. Allocation and release must use the matching pair.
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+#if USE_FAST_MEMORY
+#define KMP_OMPT_DEPS_ALLOC __kmp_fast_allocate
+#define KMP_OMPT_DEPS_FREE __kmp_fast_free
+#else
+#define KMP_OMPT_DEPS_ALLOC __kmp_thread_malloc
+#define KMP_OMPT_DEPS_FREE __kmp_thread_free
+#endif
+#endif /* OMPT_SUPPORT && OMPT_OPTIONAL */
+
+// Declarations with C linkage shared between the OMPT layer and the rest of
+// the runtime.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// OMPT initialization / finalization hooks.
+void ompt_pre_init(void);
+void ompt_post_init(void);
+void ompt_fini(void);
+
+// Thin wrappers around the compiler builtins that yield the return address /
+// frame address `level` frames up the call stack (0 = the current function).
+#define OMPT_GET_RETURN_ADDRESS(level) __builtin_return_address(level)
+#define OMPT_GET_FRAME_ADDRESS(level) __builtin_frame_address(level)
+
+int __kmp_control_tool(uint64_t command, uint64_t modifier, void *arg);
+
+// Global activation flags: `enabled` plus one bit per OMPT event.
+extern ompt_callbacks_active_t ompt_enabled;
+
+// UNLIKELY(x): branch-prediction hint (a plain pass-through on Windows).
+// OMPT_NOINLINE: compiler-specific "never inline" attribute.
+#if KMP_OS_WINDOWS
+#define UNLIKELY(x) (x)
+#define OMPT_NOINLINE __declspec(noinline)
+#else
+#define UNLIKELY(x) __builtin_expect(!!(x), 0)
+#define OMPT_NOINLINE __attribute__((noinline))
+#endif
+
+// Note: a linkage block is closed by a bare brace; a trailing ';' would be a
+// stray empty declaration.
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/final/runtime/src/ompt-specific.cpp b/final/runtime/src/ompt-specific.cpp
new file mode 100644
index 0000000..63153d2
--- /dev/null
+++ b/final/runtime/src/ompt-specific.cpp
@@ -0,0 +1,505 @@
+/*
+ * ompt-specific.cpp -- OMPT internal functions
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//******************************************************************************
+// include files
+//******************************************************************************
+
+#include "kmp.h"
+#include "ompt-specific.h"
+
+#if KMP_OS_UNIX
+#include <dlfcn.h>
+#endif
+
+// Storage-class keyword for per-thread variables: the __declspec spelling on
+// Windows, the __thread keyword elsewhere.
+#if KMP_OS_WINDOWS
+#define THREAD_LOCAL __declspec(thread)
+#else
+#define THREAD_LOCAL __thread
+#endif
+
+// OMPT spelling of the runtime's weak-symbol attribute macro.
+#define OMPT_WEAK_ATTRIBUTE KMP_WEAK_ATTRIBUTE
+
+//******************************************************************************
+// macros
+//******************************************************************************
+
+// Head of the list of lightweight (serialized) task teams attached to a team.
+#define LWT_FROM_TEAM(team) (team)->t.ompt_serialized_team_info
+
+// Number of high-order bits of a unique id that carry the per-thread tag
+// (see __ompt_get_unique_id_internal).
+#define OMPT_THREAD_ID_BITS 16
+
+//******************************************************************************
+// private operations
+//******************************************************************************
+
+//----------------------------------------------------------
+// traverse the team and task hierarchy
+// note: __ompt_get_teaminfo and __ompt_get_task_info_object
+//       traverse the hierarchy similarly and need to be
+//       kept consistent
+//----------------------------------------------------------
+
+// Look up the team info `depth` levels above the calling thread's current
+// team.
+//
+// depth: number of ancestor levels to climb (0 = current team).
+// size:  optional output; receives 1 for a lightweight team or the team's
+//        t_nproc for a regular team. Not written when NULL is returned.
+// Returns the team info, or NULL when the thread or its team is unknown or
+// the hierarchy is shallower than `depth`.
+//
+// Each level first consumes the lightweight (serialized) teams attached to
+// the current regular team and only then moves to the parent team. The walk
+// mirrors __ompt_get_task_info_object and must stay consistent with it.
+ompt_team_info_t *__ompt_get_teaminfo(int depth, int *size) {
+  kmp_info_t *self = ompt_get_thread();
+  if (!self)
+    return NULL;
+
+  kmp_team *cur_team = self->th.th_team;
+  if (cur_team == NULL)
+    return NULL;
+
+  ompt_lw_taskteam_t *cur_lwt = NULL;
+  ompt_lw_taskteam_t *pending_lwt = LWT_FROM_TEAM(cur_team);
+
+  for (; depth > 0; --depth) {
+    // step along the lightweight chain, if we are on it
+    if (cur_lwt)
+      cur_lwt = cur_lwt->parent;
+
+    // still on the lightweight chain, or nothing left to climb
+    if (cur_lwt || !cur_team)
+      continue;
+
+    if (pending_lwt) {
+      // enter the lightweight chain of the current regular team
+      cur_lwt = pending_lwt;
+      pending_lwt = NULL;
+      continue;
+    }
+
+    // lightweight teams exhausted: move to the parent regular team
+    cur_team = cur_team->t.t_parent;
+    if (cur_team)
+      pending_lwt = LWT_FROM_TEAM(cur_team);
+  }
+
+  if (cur_lwt) {
+    // a lightweight team has exactly one task
+    if (size)
+      *size = 1;
+    return &cur_lwt->ompt_team_info;
+  }
+
+  if (!cur_team)
+    return NULL;
+
+  if (size)
+    *size = cur_team->t.t_nproc;
+  return &cur_team->t.ompt_team_info;
+}
+
+// Return the OMPT task info found `depth` levels above the calling thread's
+// current task, or NULL when the thread is unknown to the runtime, has no
+// current task, or the hierarchy is shallower than `depth`.
+//
+// Each level first consumes the lightweight (serialized) task teams attached
+// to the current task's team and only then moves to the parent task. The walk
+// mirrors __ompt_get_teaminfo and must stay consistent with it.
+ompt_task_info_t *__ompt_get_task_info_object(int depth) {
+  kmp_info_t *thr = ompt_get_thread();
+  if (!thr)
+    return NULL;
+
+  kmp_taskdata_t *taskdata = thr->th.th_current_task;
+  // Same guard as in __ompt_get_task_info_internal: without a current task
+  // there is no td_team to read the lightweight-team list from.
+  if (taskdata == NULL)
+    return NULL;
+
+  ompt_lw_taskteam_t *lwt = NULL,
+                     *next_lwt = LWT_FROM_TEAM(taskdata->td_team);
+
+  while (depth > 0) {
+    // next lightweight team (if any)
+    if (lwt)
+      lwt = lwt->parent;
+
+    // next heavyweight team (if any) after
+    // lightweight teams are exhausted
+    if (!lwt && taskdata) {
+      if (next_lwt) {
+        lwt = next_lwt;
+        next_lwt = NULL;
+      } else {
+        taskdata = taskdata->td_parent;
+        if (taskdata) {
+          next_lwt = LWT_FROM_TEAM(taskdata->td_team);
+        }
+      }
+    }
+    depth--;
+  }
+
+  if (lwt)
+    return &lwt->ompt_task_info;
+  if (taskdata)
+    return &taskdata->ompt_task_info;
+  return NULL;
+}
+
+// Return the OMPT task info found `depth` levels above the calling thread's
+// current task along the *scheduling* hierarchy, or NULL when the thread is
+// unknown to the runtime, has no current task, or the hierarchy is shallower
+// than `depth`.
+//
+// At each level the scheduling parent of the task is preferred; only when it
+// is absent does the walk consume the lightweight (serialized) task teams and
+// then the regular parent task.
+ompt_task_info_t *__ompt_get_scheduling_taskinfo(int depth) {
+  kmp_info_t *thr = ompt_get_thread();
+  if (!thr)
+    return NULL;
+
+  kmp_taskdata_t *taskdata = thr->th.th_current_task;
+  // Same guard as in __ompt_get_task_info_internal: without a current task
+  // there is no td_team to read the lightweight-team list from.
+  if (taskdata == NULL)
+    return NULL;
+
+  ompt_lw_taskteam_t *lwt = NULL,
+                     *next_lwt = LWT_FROM_TEAM(taskdata->td_team);
+
+  while (depth > 0) {
+    // next lightweight team (if any)
+    if (lwt)
+      lwt = lwt->parent;
+
+    // next heavyweight team (if any) after
+    // lightweight teams are exhausted
+    if (!lwt && taskdata) {
+      // first try scheduling parent (for explicit task scheduling)
+      if (taskdata->ompt_task_info.scheduling_parent) {
+        taskdata = taskdata->ompt_task_info.scheduling_parent;
+      } else if (next_lwt) {
+        lwt = next_lwt;
+        next_lwt = NULL;
+      } else {
+        // then go for implicit tasks
+        taskdata = taskdata->td_parent;
+        if (taskdata) {
+          next_lwt = LWT_FROM_TEAM(taskdata->td_team);
+        }
+      }
+    }
+    depth--;
+  }
+
+  if (lwt)
+    return &lwt->ompt_task_info;
+  if (taskdata)
+    return &taskdata->ompt_task_info;
+  return NULL;
+}
+
+//******************************************************************************
+// interface operations
+//******************************************************************************
+
+//----------------------------------------------------------
+// thread support
+//----------------------------------------------------------
+
+// Return the address of the calling thread's OMPT thread_data word, or NULL
+// when the caller has no non-negative global thread id or no thread
+// descriptor.
+ompt_data_t *__ompt_get_thread_data_internal() {
+  if (__kmp_get_gtid() < 0)
+    return NULL;
+
+  kmp_info_t *self = ompt_get_thread();
+  return self ? &(self->th.ompt_thread_info.thread_data) : NULL;
+}
+
+//----------------------------------------------------------
+// state support
+//----------------------------------------------------------
+
+// Record `variable` (converted to an integer wait id) as the wait id of the
+// calling thread. Does nothing when the caller has no thread descriptor.
+void __ompt_thread_assign_wait_id(void *variable) {
+  kmp_info_t *self = ompt_get_thread();
+  if (!self)
+    return;
+
+  const uintptr_t raw_id = (uintptr_t)variable;
+  self->th.ompt_thread_info.wait_id = (ompt_wait_id_t)raw_id;
+}
+
+// Report the OMPT state of the calling thread.
+//
+// omp_wait_id: optional output; receives the thread's current wait id. It is
+//              not written when the caller has no thread descriptor.
+// Returns the thread's state, or ompt_state_undefined when the caller has no
+// thread descriptor.
+int __ompt_get_state_internal(ompt_wait_id_t *omp_wait_id) {
+  kmp_info_t *self = ompt_get_thread();
+  if (!self)
+    return ompt_state_undefined;
+
+  if (omp_wait_id)
+    *omp_wait_id = self->th.ompt_thread_info.wait_id;
+
+  return self->th.ompt_thread_info.state;
+}
+
+//----------------------------------------------------------
+// parallel region support
+//----------------------------------------------------------
+
+// Look up the parallel region `ancestor_level` levels above the current one.
+//
+// parallel_data: optional output; receives the address of the region's data
+//                word, or NULL when no such region exists.
+// team_size:     optional output; forwarded to __ompt_get_teaminfo, which
+//                fills it in only when a region is found.
+// Returns 2 when the region exists and 0 otherwise (including when the caller
+// has no non-negative global thread id, in which case no output is written).
+int __ompt_get_parallel_info_internal(int ancestor_level,
+                                      ompt_data_t **parallel_data,
+                                      int *team_size) {
+  if (__kmp_get_gtid() < 0)
+    return 0;
+
+  // team_size may be NULL; __ompt_get_teaminfo accepts that.
+  ompt_team_info_t *region = __ompt_get_teaminfo(ancestor_level, team_size);
+
+  if (parallel_data)
+    *parallel_data = region ? &(region->parallel_data) : NULL;
+
+  if (!region)
+    return 0;
+  return 2;
+}
+
+//----------------------------------------------------------
+// lightweight task team support
+//----------------------------------------------------------
+
+// Prepare *lwt for a new serialized parallel region.
+//
+// lwt:      object to initialize.
+// thr/gtid: not used by this function.
+// ompt_pid: the region's data word; copied by value into the team info.
+// codeptr:  stored as the team's master_return_address.
+//
+// The task part is reset (zero data value, empty frames, no scheduling
+// parent, no dependences) and the object is marked as not heap-allocated and
+// not yet linked.
+void __ompt_lw_taskteam_init(ompt_lw_taskteam_t *lwt, kmp_info_t *thr, int gtid,
+                             ompt_data_t *ompt_pid, void *codeptr) {
+  // team part: take over the caller's region data and code pointer
+  ompt_team_info_t *team_part = &lwt->ompt_team_info;
+  team_part->parallel_data = *ompt_pid;
+  team_part->master_return_address = codeptr;
+
+  // task part: start from a clean slate
+  ompt_task_info_t *task_part = &lwt->ompt_task_info;
+  task_part->task_data.value = 0;
+  task_part->frame.enter_frame = ompt_data_none;
+  task_part->frame.exit_frame = ompt_data_none;
+  task_part->scheduling_parent = NULL;
+  task_part->deps = NULL;
+  task_part->ndeps = 0;
+
+  // list bookkeeping
+  lwt->heap = 0;
+  lwt->parent = 0;
+}
+
+// Make the team/task info held in *lwt the current OMPT info of thread thr
+// when it enters a serialized parallel region.
+//
+// lwt:     initialized lightweight task team carrying the new region's info.
+// thr:     the thread entering the region.
+// on_heap: non-zero if the saved info must live in a heap object because *lwt
+//          cannot be kept (e.g. it lives on the caller's stack).
+//
+// If the thread's team is already serialized more than once, the info that is
+// currently stored in the team and in the current task is saved into a
+// lightweight task team (either *lwt itself or a fresh heap copy), the new
+// values take its place, and the saved object is pushed onto the team's list
+// of serialized team infos. Otherwise the new values are simply copied into
+// the team and the current task and *lwt is not linked anywhere.
+void __ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt, kmp_info_t *thr,
+                             int on_heap) {
+  ompt_lw_taskteam_t *link_lwt = lwt;
+  if (thr->th.th_team->t.t_serialized >
+      1) { // we already have a team, so link the new team and swap values
+    if (on_heap) { // the lw_taskteam cannot stay on stack, allocate it on heap
+      link_lwt =
+          (ompt_lw_taskteam_t *)__kmp_allocate(sizeof(ompt_lw_taskteam_t));
+    }
+    // remembered so that __ompt_lw_taskteam_unlink knows whether to free it
+    link_lwt->heap = on_heap;
+
+    // would be swap in the (on_stack) case.
+    // The new values are read from lwt before link_lwt is overwritten, which
+    // matters when link_lwt == lwt.
+    ompt_team_info_t tmp_team = lwt->ompt_team_info;
+    link_lwt->ompt_team_info = *OMPT_CUR_TEAM_INFO(thr);
+    *OMPT_CUR_TEAM_INFO(thr) = tmp_team;
+
+    ompt_task_info_t tmp_task = lwt->ompt_task_info;
+    link_lwt->ompt_task_info = *OMPT_CUR_TASK_INFO(thr);
+    *OMPT_CUR_TASK_INFO(thr) = tmp_task;
+
+    // link the taskteam into the list of taskteams:
+    ompt_lw_taskteam_t *my_parent =
+        thr->th.th_team->t.ompt_serialized_team_info;
+    link_lwt->parent = my_parent;
+    thr->th.th_team->t.ompt_serialized_team_info = link_lwt;
+  } else {
+    // this is the first serialized team, so we just store the values in the
+    // team and drop the taskteam-object
+    *OMPT_CUR_TEAM_INFO(thr) = lwt->ompt_team_info;
+    *OMPT_CUR_TASK_INFO(thr) = lwt->ompt_task_info;
+  }
+}
+
+// Undo the most recent __ompt_lw_taskteam_link for thread thr.
+//
+// Pops the head of the team's list of serialized team infos, exchanges its
+// saved team/task info with the thread's current team/task info, and frees
+// the popped object if it was heap-allocated. Does nothing when the list is
+// empty.
+void __ompt_lw_taskteam_unlink(kmp_info_t *thr) {
+  ompt_lw_taskteam_t *top = thr->th.th_team->t.ompt_serialized_team_info;
+  if (!top)
+    return;
+
+  // pop
+  thr->th.th_team->t.ompt_serialized_team_info = top->parent;
+
+  // exchange saved and current team info
+  ompt_team_info_t saved_team = top->ompt_team_info;
+  top->ompt_team_info = *OMPT_CUR_TEAM_INFO(thr);
+  *OMPT_CUR_TEAM_INFO(thr) = saved_team;
+
+  // exchange saved and current task info
+  ompt_task_info_t saved_task = top->ompt_task_info;
+  top->ompt_task_info = *OMPT_CUR_TASK_INFO(thr);
+  *OMPT_CUR_TASK_INFO(thr) = saved_task;
+
+  // objects that were not heap-allocated stay owned by whoever provided them
+  if (top->heap) {
+    __kmp_free(top);
+  }
+}
+
+//----------------------------------------------------------
+// task support
+//----------------------------------------------------------
+
+// Task inquiry backing the tool-facing task-info entry point.
+//
+// ancestor_level: number of levels to climb from the current task (0 = the
+//                 current task); negative values yield 0.
+// type:           optional output; task kind plus detail flags. Written only
+//                 when a task is found.
+// task_data:      optional output; address of the task's data word, or NULL.
+// task_frame:     optional output; address of the task's frame record, or
+//                 NULL.
+// parallel_data:  optional output; address of the enclosing region's data
+//                 word, or NULL when no team is associated with the level.
+// thread_num:     optional output; thread number at that level. Left
+//                 untouched when the walk has climbed past the outermost
+//                 team.
+// Returns 2 when a task exists at the requested level and 0 otherwise.
+//
+// The walk is the one used by __ompt_get_scheduling_taskinfo, extended to
+// track the team alongside the task.
+int __ompt_get_task_info_internal(int ancestor_level, int *type,
+                                  ompt_data_t **task_data,
+                                  ompt_frame_t **task_frame,
+                                  ompt_data_t **parallel_data,
+                                  int *thread_num) {
+  if (__kmp_get_gtid() < 0)
+    return 0;
+
+  if (ancestor_level < 0)
+    return 0;
+
+  // copied from __ompt_get_scheduling_taskinfo
+  ompt_task_info_t *info = NULL;
+  ompt_team_info_t *team_info = NULL;
+  kmp_info_t *thr = ompt_get_thread();
+  int level = ancestor_level;
+
+  if (thr) {
+    kmp_taskdata_t *taskdata = thr->th.th_current_task;
+    if (taskdata == NULL)
+      return 0;
+    kmp_team *team = thr->th.th_team, *prev_team = NULL;
+    if (team == NULL)
+      return 0;
+    ompt_lw_taskteam_t *lwt = NULL,
+                       *next_lwt = LWT_FROM_TEAM(taskdata->td_team),
+                       *prev_lwt = NULL;
+
+    while (ancestor_level > 0) {
+      // needed for thread_num
+      prev_team = team;
+      prev_lwt = lwt;
+      // next lightweight team (if any)
+      if (lwt)
+        lwt = lwt->parent;
+
+      // next heavyweight team (if any) after
+      // lightweight teams are exhausted
+      if (!lwt && taskdata) {
+        // first try scheduling parent (for explicit task scheduling)
+        if (taskdata->ompt_task_info.scheduling_parent) {
+          taskdata = taskdata->ompt_task_info.scheduling_parent;
+        } else if (next_lwt) {
+          lwt = next_lwt;
+          next_lwt = NULL;
+        } else {
+          // then go for implicit tasks
+          taskdata = taskdata->td_parent;
+          if (team == NULL)
+            return 0;
+          team = team->t.t_parent;
+          if (taskdata) {
+            next_lwt = LWT_FROM_TEAM(taskdata->td_team);
+          }
+        }
+      }
+      ancestor_level--;
+    }
+
+    if (lwt) {
+      info = &lwt->ompt_task_info;
+      team_info = &lwt->ompt_team_info;
+      if (type) {
+        *type = ompt_task_implicit;
+      }
+    } else if (taskdata) {
+      info = &taskdata->ompt_task_info;
+      // The team pointer climbs together with td_parent and can run out
+      // before the task chain does; never form an address from a NULL team.
+      team_info = team ? &team->t.ompt_team_info : NULL;
+      if (type) {
+        if (taskdata->td_parent) {
+          *type = (taskdata->td_flags.tasktype ? ompt_task_explicit
+                                               : ompt_task_implicit) |
+                  TASK_TYPE_DETAILS_FORMAT(taskdata);
+        } else {
+          *type = ompt_task_initial;
+        }
+      }
+    }
+    if (task_data) {
+      *task_data = info ? &info->task_data : NULL;
+    }
+    if (task_frame) {
+      // OpenMP spec asks for the scheduling task to be returned.
+      *task_frame = info ? &info->frame : NULL;
+    }
+    if (parallel_data) {
+      *parallel_data = team_info ? &(team_info->parallel_data) : NULL;
+    }
+    if (thread_num) {
+      if (level == 0)
+        *thread_num = __kmp_get_tid();
+      else if (prev_lwt)
+        *thread_num = 0;
+      else if (prev_team)
+        *thread_num = prev_team->t.t_master_tid;
+      // else: the walk went past the outermost team (prev_team is NULL), so
+      // there is no team to take a thread number from; leave it untouched.
+    }
+    return info ? 2 : 0;
+  }
+  return 0;
+}
+
+// Report the memory block that follows the task header of the current
+// explicit task.
+//
+// addr:     output; receives the start address of the block.
+// size:     output; receives the size of the block in bytes.
+// blocknum: index of the requested block; only block 0 is supported.
+// Returns 1 and fills both outputs on success. Returns 0, leaving the outputs
+// untouched, when blocknum is not 0, an output pointer is NULL, the caller
+// has no thread descriptor or current task, the current task is not an
+// explicit task, or the computed size is negative.
+int __ompt_get_task_memory_internal(void **addr, size_t *size, int blocknum) {
+  if (blocknum != 0)
+    return 0; // support only a single block
+
+  if (addr == NULL || size == NULL)
+    return 0; // nowhere to store the result
+
+  kmp_info_t *thr = ompt_get_thread();
+  if (!thr)
+    return 0;
+
+  kmp_taskdata_t *taskdata = thr->th.th_current_task;
+  if (taskdata == NULL)
+    return 0; // same guard as __ompt_get_task_info_internal
+
+  if (taskdata->td_flags.tasktype != TASK_EXPLICIT)
+    return 0; // support only explicit task
+
+  kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata);
+
+  void *ret_addr;
+  int64_t ret_size = taskdata->td_size_alloc - sizeof(kmp_taskdata_t);
+
+  // kmp_task_t->data1 is an optional member
+  if (taskdata->td_flags.destructors_thunk)
+    ret_addr = &task->data1 + 1;
+  else
+    ret_addr = &task->part_id + 1;
+
+  // subtract the part of kmp_task_t that precedes the reported block
+  ret_size -= (char *)(ret_addr) - (char *)(task);
+  if (ret_size < 0)
+    return 0;
+
+  *addr = ret_addr;
+  *size = ret_size;
+  return 1;
+}
+
+//----------------------------------------------------------
+// team support
+//----------------------------------------------------------
+
+// Store ompt_pid as the parallel_data word of the given team.
+void __ompt_team_assign_id(kmp_team_t *team, ompt_data_t ompt_pid) {
+  ompt_team_info_t *team_part = &team->t.ompt_team_info;
+  team_part->parallel_data = ompt_pid;
+}
+
+//----------------------------------------------------------
+// misc
+//----------------------------------------------------------
+
+// Produce an id that is unique within the process.
+//
+// On its first call, each thread draws a tag from a shared counter and places
+// it in the top OMPT_THREAD_ID_BITS bits of its thread-local id. Every call
+// (including the first) then increments the thread-local id and returns it,
+// so the low bits count the ids handed out by that thread.
+static uint64_t __ompt_get_unique_id_internal() {
+  static uint64_t next_thread_tag = 1; // shared by all threads
+  static THREAD_LOCAL uint64_t last_id = 0; // 0 means "no tag drawn yet"
+
+  if (last_id == 0) {
+    const uint64_t tag =
+        KMP_TEST_THEN_INC64((kmp_int64 *)&next_thread_tag);
+    const size_t tag_shift = sizeof(uint64_t) * 8 - OMPT_THREAD_ID_BITS;
+    last_id = tag << tag_shift;
+  }
+
+  last_id += 1;
+  return last_id;
+}
+
+// Translate a runtime barrier type into the OMPT sync-region kind.
+//
+// bt:  runtime barrier type.
+// thr: thread whose th_ident (source location record) is consulted for plain
+//      barriers.
+//
+// Fork/join barriers are implicit. Barriers that are neither fork/join nor
+// plain are implementation barriers. A plain barrier without a location
+// record is reported as a generic barrier; otherwise the location flags
+// decide between explicit, implicit and (if neither flag is set)
+// implementation.
+ompt_sync_region_t __ompt_get_barrier_kind(enum barrier_type bt,
+                                           kmp_info_t *thr) {
+  if (bt == bs_forkjoin_barrier)
+    return ompt_sync_region_barrier_implicit;
+
+  if (bt == bs_plain_barrier) {
+    if (!thr->th.th_ident)
+      return ompt_sync_region_barrier;
+
+    const kmp_int32 loc_flags = thr->th.th_ident->flags;
+
+    if ((loc_flags & KMP_IDENT_BARRIER_EXPL) != 0)
+      return ompt_sync_region_barrier_explicit;
+
+    if ((loc_flags & KMP_IDENT_BARRIER_IMPL) != 0)
+      return ompt_sync_region_barrier_implicit;
+  }
+
+  return ompt_sync_region_barrier_implementation;
+}
diff --git a/final/runtime/src/ompt-specific.h b/final/runtime/src/ompt-specific.h
new file mode 100644
index 0000000..86fd928
--- /dev/null
+++ b/final/runtime/src/ompt-specific.h
@@ -0,0 +1,105 @@
+/*
+ * ompt-specific.h - header of OMPT internal functions implementation
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPT_SPECIFIC_H
+#define OMPT_SPECIFIC_H
+
+#include "kmp.h"
+
+/*****************************************************************************
+ * forward declarations
+ ****************************************************************************/
+
+// Store ompt_pid as the team's parallel_data word.
+void __ompt_team_assign_id(kmp_team_t *team, ompt_data_t ompt_pid);
+// Record `variable` as the wait id of the calling thread.
+void __ompt_thread_assign_wait_id(void *variable);
+
+// Initialize a lightweight task team for a serialized parallel region.
+void __ompt_lw_taskteam_init(ompt_lw_taskteam_t *lwt, kmp_info_t *thr,
+                             int gtid, ompt_data_t *ompt_pid, void *codeptr);
+
+// Make the info in *lwt current for thr, saving the previous info if needed.
+void __ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt, kmp_info_t *thr,
+                             int on_heap);
+
+// Undo the most recent __ompt_lw_taskteam_link for thr.
+void __ompt_lw_taskteam_unlink(kmp_info_t *thr);
+
+// Team info `depth` levels up; optionally reports the team size. NULL if none.
+ompt_team_info_t *__ompt_get_teaminfo(int depth, int *size);
+
+// Task info `depth` levels up, or NULL if none.
+ompt_task_info_t *__ompt_get_task_info_object(int depth);
+
+// Parallel-region inquiry; returns 2 when the region exists, 0 otherwise.
+int __ompt_get_parallel_info_internal(int ancestor_level,
+                                      ompt_data_t **parallel_data,
+                                      int *team_size);
+
+// Task inquiry; returns 2 when the task exists, 0 otherwise.
+int __ompt_get_task_info_internal(int ancestor_level, int *type,
+                                  ompt_data_t **task_data,
+                                  ompt_frame_t **task_frame,
+                                  ompt_data_t **parallel_data, int *thread_num);
+
+// Address of the calling thread's thread_data word, or NULL.
+ompt_data_t *__ompt_get_thread_data_internal();
+
+/*
+ * Unused currently
+static uint64_t __ompt_get_unique_id_internal();
+*/
+
+ompt_sync_region_t __ompt_get_barrier_kind(enum barrier_type, kmp_info_t *);
+
+/*****************************************************************************
+ * macros
+ ****************************************************************************/
+
+// Accessors for the OMPT info of a thread's current task and current team.
+// thr is a kmp_info_t pointer expression; it is parenthesized so that any
+// pointer-valued expression can be passed.
+//   OMPT_CUR_TASK_INFO - address of the current task's ompt_task_info
+//   OMPT_CUR_TASK_DATA - address of the current task's task_data word
+//   OMPT_CUR_TEAM_INFO - address of the current team's ompt_team_info
+//   OMPT_CUR_TEAM_DATA - address of the current team's parallel_data word
+#define OMPT_CUR_TASK_INFO(thr) (&((thr)->th.th_current_task->ompt_task_info))
+#define OMPT_CUR_TASK_DATA(thr)                                                \
+  (&((thr)->th.th_current_task->ompt_task_info.task_data))
+#define OMPT_CUR_TEAM_INFO(thr) (&((thr)->th.th_team->t.ompt_team_info))
+#define OMPT_CUR_TEAM_DATA(thr)                                                \
+  (&((thr)->th.th_team->t.ompt_team_info.parallel_data))
+
+// OMPT-prefixed aliases for runtime configuration macros and helpers.
+#define OMPT_HAVE_WEAK_ATTRIBUTE KMP_HAVE_WEAK_ATTRIBUTE
+#define OMPT_HAVE_PSAPI KMP_HAVE_PSAPI
+// Forwards to __kmp_str_match with 0 as its second argument.
+#define OMPT_STR_MATCH(haystack, needle) __kmp_str_match(haystack, 0, needle)
+
+// Fetch and clear the return address stored for thread gtid.
+//
+// Returns whatever was stored in the thread's ompt_thread_info.return_address
+// and resets that slot to NULL, so a second call without an intervening store
+// yields NULL. gtid is used to index __kmp_threads without validation.
+inline void *__ompt_load_return_address(int gtid) {
+  kmp_info_t *owner = __kmp_threads[gtid];
+  void *stored_address = owner->th.ompt_thread_info.return_address;
+  owner->th.ompt_thread_info.return_address = NULL;
+  return stored_address;
+}
+
+#define OMPT_STORE_RETURN_ADDRESS(gtid)                                        \
+  if (ompt_enabled.enabled && gtid >= 0 && __kmp_threads[gtid] &&              \
+      !__kmp_threads[gtid]->th.ompt_thread_info.return_address)                \
+  __kmp_threads[gtid]->th.ompt_thread_info.return_address =                    \
+      __builtin_return_address(0)
+#define OMPT_LOAD_RETURN_ADDRESS(gtid) __ompt_load_return_address(gtid)
+
+//******************************************************************************
+// inline functions
+//******************************************************************************
+
+// Map a global thread id to its thread descriptor; negative ids yield NULL.
+inline kmp_info_t *ompt_get_thread_gtid(int gtid) {
+  if (gtid < 0)
+    return NULL;
+  return __kmp_thread_from_gtid(gtid);
+}
+
+// Thread descriptor of the calling thread, or NULL when its global thread id
+// is negative.
+inline kmp_info_t *ompt_get_thread() {
+  return ompt_get_thread_gtid(__kmp_get_gtid());
+}
+
+// Overwrite the OMPT state recorded for `thread`. The pointer is not checked.
+inline void ompt_set_thread_state(kmp_info_t *thread, ompt_state_t state) {
+  ompt_state_t *state_slot = &thread->th.ompt_thread_info.state;
+  *state_slot = state;
+}
+
+// Runtime version text: the library version string with its first
+// KMP_VERSION_MAGIC_LEN characters skipped.
+inline const char *ompt_get_runtime_version() {
+  const char *version_text = __kmp_version_lib_ver + (KMP_VERSION_MAGIC_LEN);
+  return version_text;
+}
+
+#endif
diff --git a/final/runtime/src/test-touch.c b/final/runtime/src/test-touch.c
new file mode 100644
index 0000000..71e05e7
--- /dev/null
+++ b/final/runtime/src/test-touch.c
@@ -0,0 +1,30 @@
+// test-touch.c //
+
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+
+// Hand-written declarations of the three OpenMP routines called by main(),
+// given C linkage when the file is compiled as C++. The parameter lists are
+// spelled (void): in C an empty list declares a function without a prototype,
+// while (void) means "no arguments" in both C and C++.
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern double omp_get_wtime(void);
+extern int    omp_get_num_threads(void);
+extern int    omp_get_max_threads(void);
+#ifdef __cplusplus
+}
+#endif
+
+// Call each declared OpenMP routine once and discard the result; the program
+// only needs to build, link and exit with status 0.
+int main() {
+    (void)omp_get_wtime();
+    (void)omp_get_num_threads();
+    (void)omp_get_max_threads();
+
+    return 0;
+}
+
+// end of file //
diff --git a/final/runtime/src/thirdparty/ittnotify/disable_warnings.h b/final/runtime/src/thirdparty/ittnotify/disable_warnings.h
new file mode 100644
index 0000000..6b06035
--- /dev/null
+++ b/final/runtime/src/thirdparty/ittnotify/disable_warnings.h
@@ -0,0 +1,29 @@
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "ittnotify_config.h"
+
+/* Warnings silenced when building for the Windows ITT platform. */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+
+#pragma warning (disable: 593)   /* parameter "XXXX" was set but never used                 */
+#pragma warning (disable: 344)   /* typedef name has already been declared (with same type) */
+#pragma warning (disable: 174)   /* expression has no effect                                */
+#pragma warning (disable: 4127)  /* conditional expression is constant                      */
+#pragma warning (disable: 4306)  /* conversion from '?' to '?' of greater size              */
+
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/* Additional warnings silenced when __INTEL_COMPILER is defined, on any platform. */
+#if defined __INTEL_COMPILER
+
+#pragma warning (disable: 869)  /* parameter "XXXXX" was never referenced                  */
+#pragma warning (disable: 1418) /* external function definition with no prior declaration  */
+#pragma warning (disable: 1419) /* external declaration in primary source file             */
+
+#endif /* __INTEL_COMPILER */
diff --git a/final/runtime/src/thirdparty/ittnotify/ittnotify.h b/final/runtime/src/thirdparty/ittnotify/ittnotify.h
new file mode 100644
index 0000000..ed46cd7
--- /dev/null
+++ b/final/runtime/src/thirdparty/ittnotify/ittnotify.h
@@ -0,0 +1,4075 @@
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _ITTNOTIFY_H_
+#define _ITTNOTIFY_H_
+
+/**
+@file
+@brief Public User API functions and types
+@mainpage
+
+The ITT API is used to annotate a user's program with additional information
+that can be used by correctness and performance tools. The user inserts
+calls in their program. Those calls generate information that is collected
+at runtime, and used by Intel(R) Threading Tools.
+
+@section API Concepts
+The following general concepts are used throughout the API.
+
+@subsection Unicode Support
+Many API functions take character string arguments. On Windows, there
+are two versions of each such function. The function name is suffixed
+by W if Unicode support is enabled, and by A otherwise. Any API function
+that takes a character string argument adheres to this convention.
+
+@subsection Conditional Compilation
+Many users prefer having an option to modify ITT API code when linking it
+inside their runtimes. ITT API header file provides a mechanism to replace
+ITT API function names inside your code with empty strings. To do this,
+define the macro INTEL_NO_ITTNOTIFY_API during compilation and remove the
+static library from the linker script.
+
+@subsection Domains
+[see domains]
+Domains provide a way to separate notification for different modules or
+libraries in a program. Domains are specified by dotted character strings,
+e.g. TBB.Internal.Control.
+
+A mechanism (to be specified) is provided to enable and disable
+domains. By default, all domains are enabled.
+@subsection Named Entities and Instances
+Named entities (frames, regions, tasks, and markers) communicate
+information about the program to the analysis tools. A named entity often
+refers to a section of program code, or to some set of logical concepts
+that the programmer wants to group together.
+
+Named entities relate to the programmer's static view of the program. When
+the program actually executes, many instances of a given named entity
+may be created.
+
+The API annotations denote instances of named entities. The actual
+named entities are displayed using the analysis tools. In other words,
+the named entities come into existence when instances are created.
+
+Instances of named entities may have instance identifiers (IDs). Some
+API calls use instance identifiers to create relationships between
+different instances of named entities. Other API calls associate data
+with instances of named entities.
+
+Some named entities must always have instance IDs. In particular, regions
+and frames always have IDs. Tasks and markers need IDs only if the ID is
+needed in another API call (such as adding a relation or metadata).
+
+The lifetime of instance IDs is distinct from the lifetime of
+instances. This allows various relationships to be specified separate
+from the actual execution of instances. This flexibility comes at the
+expense of extra API calls.
+
+The same ID may not be reused for different instances, unless a previous
+[ref] __itt_id_destroy call for that ID has been issued.
+*/
+
+/** @cond exclude_from_documentation */
+#ifndef ITT_OS_WIN
+#  define ITT_OS_WIN   1
+#endif /* ITT_OS_WIN */
+
+#ifndef ITT_OS_LINUX
+#  define ITT_OS_LINUX 2
+#endif /* ITT_OS_LINUX */
+
+#ifndef ITT_OS_MAC
+#  define ITT_OS_MAC   3
+#endif /* ITT_OS_MAC */
+
+#ifndef ITT_OS_FREEBSD
+#  define ITT_OS_FREEBSD   4
+#endif /* ITT_OS_FREEBSD */
+
+#ifndef ITT_OS
+#  if defined WIN32 || defined _WIN32
+#    define ITT_OS ITT_OS_WIN
+#  elif defined( __APPLE__ ) && defined( __MACH__ )
+#    define ITT_OS ITT_OS_MAC
+#  elif defined( __FreeBSD__ )
+#    define ITT_OS ITT_OS_FREEBSD
+#  else
+#    define ITT_OS ITT_OS_LINUX
+#  endif
+#endif /* ITT_OS */
+
+#ifndef ITT_PLATFORM_WIN
+#  define ITT_PLATFORM_WIN 1
+#endif /* ITT_PLATFORM_WIN */
+
+#ifndef ITT_PLATFORM_POSIX
+#  define ITT_PLATFORM_POSIX 2
+#endif /* ITT_PLATFORM_POSIX */
+
+#ifndef ITT_PLATFORM_MAC
+#  define ITT_PLATFORM_MAC 3
+#endif /* ITT_PLATFORM_MAC */
+
+#ifndef ITT_PLATFORM_FREEBSD
+#  define ITT_PLATFORM_FREEBSD 4
+#endif /* ITT_PLATFORM_FREEBSD */
+
+#ifndef ITT_PLATFORM
+#  if ITT_OS==ITT_OS_WIN
+#    define ITT_PLATFORM ITT_PLATFORM_WIN
+#  elif ITT_OS==ITT_OS_MAC
+#    define ITT_PLATFORM ITT_PLATFORM_MAC
+#  elif ITT_OS==ITT_OS_FREEBSD
+#    define ITT_PLATFORM ITT_PLATFORM_FREEBSD
+#  else
+#    define ITT_PLATFORM ITT_PLATFORM_POSIX
+#  endif
+#endif /* ITT_PLATFORM */
+
+#if defined(_UNICODE) && !defined(UNICODE)
+#define UNICODE
+#endif
+
+#include <stddef.h>
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#include <tchar.h>
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#include <stdint.h>
+#if defined(UNICODE) || defined(_UNICODE)
+#include <wchar.h>
+#endif /* UNICODE || _UNICODE */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#ifndef ITTAPI_CDECL
+#  if ITT_PLATFORM==ITT_PLATFORM_WIN
+#    define ITTAPI_CDECL __cdecl
+#  else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#    if defined _M_IX86 || defined __i386__
+#      define ITTAPI_CDECL __attribute__ ((cdecl))
+#    else  /* _M_IX86 || __i386__ */
+#      define ITTAPI_CDECL /* actual only on x86 platform */
+#    endif /* _M_IX86 || __i386__ */
+#  endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* ITTAPI_CDECL */
+
+#ifndef STDCALL
+#  if ITT_PLATFORM==ITT_PLATFORM_WIN
+#    define STDCALL __stdcall
+#  else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#    if defined _M_IX86 || defined __i386__
+#      define STDCALL __attribute__ ((stdcall))
+#    else  /* _M_IX86 || __i386__ */
+#      define STDCALL /* supported only on x86 platform */
+#    endif /* _M_IX86 || __i386__ */
+#  endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* STDCALL */
+
+#define ITTAPI    ITTAPI_CDECL
+#define LIBITTAPI ITTAPI_CDECL
+
+/* TODO: Temporary for compatibility! */
+#define ITTAPI_CALL    ITTAPI_CDECL
+#define LIBITTAPI_CALL ITTAPI_CDECL
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+/* use __forceinline (VC++ specific) */
+#define ITT_INLINE           __forceinline
+#define ITT_INLINE_ATTRIBUTE /* nothing */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+/*
+ * Generally, functions are not inlined unless optimization is specified.
+ * For functions declared inline, this attribute inlines the function even
+ * if no optimization level was specified.
+ */
+#ifdef __STRICT_ANSI__
+#define ITT_INLINE           static
+#define ITT_INLINE_ATTRIBUTE __attribute__((unused))
+#else  /* __STRICT_ANSI__ */
+#define ITT_INLINE           static inline
+#define ITT_INLINE_ATTRIBUTE __attribute__((always_inline, unused))
+#endif /* __STRICT_ANSI__ */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+/** @endcond */
+
+#ifdef INTEL_ITTNOTIFY_ENABLE_LEGACY
+#  if ITT_PLATFORM==ITT_PLATFORM_WIN
+#    pragma message("WARNING!!! Deprecated API is used. Please undefine INTEL_ITTNOTIFY_ENABLE_LEGACY macro")
+#  else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#    warning "Deprecated API is used. Please undefine INTEL_ITTNOTIFY_ENABLE_LEGACY macro"
+#  endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#  include "legacy/ittnotify.h"
+#endif /* INTEL_ITTNOTIFY_ENABLE_LEGACY */
+
+/** @cond exclude_from_documentation */
+/* Helper macro for joining tokens */
+#define ITT_JOIN_AUX(p,n) p##n
+#define ITT_JOIN(p,n)     ITT_JOIN_AUX(p,n)
+
+#ifdef ITT_MAJOR
+#undef ITT_MAJOR
+#endif
+#ifdef ITT_MINOR
+#undef ITT_MINOR
+#endif
+#define ITT_MAJOR     3
+#define ITT_MINOR     0
+
+/* Standard versioning of a token with major and minor version numbers */
+#define ITT_VERSIONIZE(x)    \
+    ITT_JOIN(x,              \
+    ITT_JOIN(_,              \
+    ITT_JOIN(ITT_MAJOR,      \
+    ITT_JOIN(_, ITT_MINOR))))
+
+#ifndef INTEL_ITTNOTIFY_PREFIX
+#  define INTEL_ITTNOTIFY_PREFIX __itt_
+#endif /* INTEL_ITTNOTIFY_PREFIX */
+#ifndef INTEL_ITTNOTIFY_POSTFIX
+#  define INTEL_ITTNOTIFY_POSTFIX _ptr_
+#endif /* INTEL_ITTNOTIFY_POSTFIX */
+
+#define ITTNOTIFY_NAME_AUX(n) ITT_JOIN(INTEL_ITTNOTIFY_PREFIX,n)
+#define ITTNOTIFY_NAME(n)     ITT_VERSIONIZE(ITTNOTIFY_NAME_AUX(ITT_JOIN(n,INTEL_ITTNOTIFY_POSTFIX)))
+
+#define ITTNOTIFY_VOID(n) (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)
+#define ITTNOTIFY_DATA(n) (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)
+
+#define ITTNOTIFY_VOID_D0(n,d)       (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d)
+#define ITTNOTIFY_VOID_D1(n,d,x)     (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x)
+#define ITTNOTIFY_VOID_D2(n,d,x,y)   (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y)
+#define ITTNOTIFY_VOID_D3(n,d,x,y,z) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z)
+#define ITTNOTIFY_VOID_D4(n,d,x,y,z,a)     (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a)
+#define ITTNOTIFY_VOID_D5(n,d,x,y,z,a,b)   (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b)
+#define ITTNOTIFY_VOID_D6(n,d,x,y,z,a,b,c) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c)
+#define ITTNOTIFY_DATA_D0(n,d)       (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d)
+#define ITTNOTIFY_DATA_D1(n,d,x)     (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x)
+#define ITTNOTIFY_DATA_D2(n,d,x,y)   (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y)
+#define ITTNOTIFY_DATA_D3(n,d,x,y,z) (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z)
+#define ITTNOTIFY_DATA_D4(n,d,x,y,z,a)     (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z,a)
+#define ITTNOTIFY_DATA_D5(n,d,x,y,z,a,b)   (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b)
+#define ITTNOTIFY_DATA_D6(n,d,x,y,z,a,b,c) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c)
+
+#ifdef ITT_STUB
+#undef ITT_STUB
+#endif
+#ifdef ITT_STUBV
+#undef ITT_STUBV
+#endif
+#define ITT_STUBV(api,type,name,args)                             \
+    typedef type (api* ITT_JOIN(ITTNOTIFY_NAME(name),_t)) args;   \
+    extern ITT_JOIN(ITTNOTIFY_NAME(name),_t) ITTNOTIFY_NAME(name);
+#define ITT_STUB ITT_STUBV
+/** @endcond */
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/** @cond exclude_from_gpa_documentation */
+/**
+ * @defgroup public Public API
+ * @{
+ * @}
+ */
+
+/**
+ * @defgroup control Collection Control
+ * @ingroup public
+ * General behavior: application continues to run, but no profiling information is being collected
+ *
+ * Pausing affects not only the current thread but the whole process, as well as spawned processes
+ * - Intel(R) Parallel Inspector and Intel(R) Inspector XE:
+ *   - Does not analyze or report errors that involve memory access.
+ *   - Other errors are reported as usual. Pausing data collection in
+ *     Intel(R) Parallel Inspector and Intel(R) Inspector XE
+ *     only pauses tracing and analyzing memory access.
+ *     It does not pause tracing or analyzing threading APIs.
+ *   .
+ * - Intel(R) Parallel Amplifier and Intel(R) VTune(TM) Amplifier XE:
+ *   - Does continue to record when new threads are started.
+ *   .
+ * - Other effects:
+ *   - Possible reduction of runtime overhead.
+ *   .
+ * @{
+ */
+/** @brief Pause collection */
+void ITTAPI __itt_pause(void);
+/** @brief Resume collection */
+void ITTAPI __itt_resume(void);
+/** @brief Detach collection */
+void ITTAPI __itt_detach(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, pause,  (void))
+ITT_STUBV(ITTAPI, void, resume, (void))
+ITT_STUBV(ITTAPI, void, detach, (void))
+#define __itt_pause      ITTNOTIFY_VOID(pause)
+#define __itt_pause_ptr  ITTNOTIFY_NAME(pause)
+#define __itt_resume     ITTNOTIFY_VOID(resume)
+#define __itt_resume_ptr ITTNOTIFY_NAME(resume)
+#define __itt_detach     ITTNOTIFY_VOID(detach)
+#define __itt_detach_ptr ITTNOTIFY_NAME(detach)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_pause()
+#define __itt_pause_ptr  0
+#define __itt_resume()
+#define __itt_resume_ptr 0
+#define __itt_detach()
+#define __itt_detach_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_pause_ptr  0
+#define __itt_resume_ptr 0
+#define __itt_detach_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} control group */
+/** @endcond */
+
+/**
+ * @defgroup threads Threads
+ * @ingroup public
+ * Give names to threads
+ * @{
+ */
+/**
+ * @brief Sets thread name of calling thread
+ * @param[in] name - name of thread
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_thread_set_nameA(const char    *name);
+void ITTAPI __itt_thread_set_nameW(const wchar_t *name);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_thread_set_name     __itt_thread_set_nameW
+#  define __itt_thread_set_name_ptr __itt_thread_set_nameW_ptr
+#else /* UNICODE */
+#  define __itt_thread_set_name     __itt_thread_set_nameA
+#  define __itt_thread_set_name_ptr __itt_thread_set_nameA_ptr
+#endif /* UNICODE */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+void ITTAPI __itt_thread_set_name(const char *name);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, thread_set_nameA, (const char    *name))
+ITT_STUBV(ITTAPI, void, thread_set_nameW, (const wchar_t *name))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, thread_set_name,  (const char    *name))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_thread_set_nameA     ITTNOTIFY_VOID(thread_set_nameA)
+#define __itt_thread_set_nameA_ptr ITTNOTIFY_NAME(thread_set_nameA)
+#define __itt_thread_set_nameW     ITTNOTIFY_VOID(thread_set_nameW)
+#define __itt_thread_set_nameW_ptr ITTNOTIFY_NAME(thread_set_nameW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_thread_set_name     ITTNOTIFY_VOID(thread_set_name)
+#define __itt_thread_set_name_ptr ITTNOTIFY_NAME(thread_set_name)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_thread_set_nameA(name)
+#define __itt_thread_set_nameA_ptr 0
+#define __itt_thread_set_nameW(name)
+#define __itt_thread_set_nameW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_thread_set_name(name)
+#define __itt_thread_set_name_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_thread_set_nameA_ptr 0
+#define __itt_thread_set_nameW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_thread_set_name_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @cond exclude_from_gpa_documentation */
+
+/**
+ * @brief Mark current thread as ignored from this point on, for the duration of its existence.
+ */
+void ITTAPI __itt_thread_ignore(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, thread_ignore, (void))
+#define __itt_thread_ignore     ITTNOTIFY_VOID(thread_ignore)
+#define __itt_thread_ignore_ptr ITTNOTIFY_NAME(thread_ignore)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_thread_ignore()
+#define __itt_thread_ignore_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_thread_ignore_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} threads group */
+
+/**
+ * @defgroup suppress Error suppression
+ * @ingroup public
+ * General behavior: application continues to run, but errors are suppressed
+ *
+ * @{
+ */
+
+/*****************************************************************//**
+ * @name group of functions used for error suppression in correctness tools
+ *********************************************************************/
+/** @{ */
+/**
+ * @hideinitializer
+ * @brief possible value for suppression mask
+ */
+#define __itt_suppress_all_errors 0x7fffffff
+
+/**
+ * @hideinitializer
+ * @brief possible value for suppression mask (suppresses errors from threading analysis)
+ */
+#define __itt_suppress_threading_errors 0x000000ff
+
+/**
+ * @hideinitializer
+ * @brief possible value for suppression mask (suppresses errors from memory analysis)
+ */
+#define __itt_suppress_memory_errors 0x0000ff00
+
+/**
+ * @brief Start suppressing errors identified in mask on this thread
+ */
+void ITTAPI __itt_suppress_push(unsigned int mask);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, suppress_push, (unsigned int mask))
+#define __itt_suppress_push     ITTNOTIFY_VOID(suppress_push)
+#define __itt_suppress_push_ptr ITTNOTIFY_NAME(suppress_push)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_suppress_push(mask)
+#define __itt_suppress_push_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_suppress_push_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Undo the effects of the matching call to __itt_suppress_push
+ */
+void ITTAPI __itt_suppress_pop(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, suppress_pop, (void))
+#define __itt_suppress_pop     ITTNOTIFY_VOID(suppress_pop)
+#define __itt_suppress_pop_ptr ITTNOTIFY_NAME(suppress_pop)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_suppress_pop()
+#define __itt_suppress_pop_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_suppress_pop_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @enum __itt_suppress_mode
+ * @brief Enumerator for the suppression modes accepted by the range-marking calls
+ */
+typedef enum __itt_suppress_mode {
+    __itt_unsuppress_range,
+    __itt_suppress_range
+} __itt_suppress_mode_t;
+
+/**
+ * @brief Mark a range of memory for error suppression or unsuppression (selected by mode) for error types included in mask
+ */
+void ITTAPI __itt_suppress_mark_range(__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, suppress_mark_range, (__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size))
+#define __itt_suppress_mark_range     ITTNOTIFY_VOID(suppress_mark_range)
+#define __itt_suppress_mark_range_ptr ITTNOTIFY_NAME(suppress_mark_range)
+#else  /* INTEL_NO_ITTNOTIFY_API: calls expand to nothing; the stub takes the same four arguments as the function */
+#define __itt_suppress_mark_range(mode, mask, address, size)
+#define __itt_suppress_mark_range_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_suppress_mark_range_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Undo the effect of a matching call to __itt_suppress_mark_range.   If no matching
+ *        call is found, nothing is changed.
+ */
+void ITTAPI __itt_suppress_clear_range(__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, suppress_clear_range, (__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size))
+#define __itt_suppress_clear_range     ITTNOTIFY_VOID(suppress_clear_range)
+#define __itt_suppress_clear_range_ptr ITTNOTIFY_NAME(suppress_clear_range)
+#else  /* INTEL_NO_ITTNOTIFY_API: calls expand to nothing; the stub takes the same four arguments as the function */
+#define __itt_suppress_clear_range(mode, mask, address, size)
+#define __itt_suppress_clear_range_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_suppress_clear_range_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} */
+/** @} suppress group */
+
+/**
+ * @defgroup sync Synchronization
+ * @ingroup public
+ * Indicate user-written synchronization code
+ * @{
+ */
+/**
+ * @hideinitializer
+ * @brief possible value of attribute argument for sync object type
+ */
+#define __itt_attr_barrier 1
+
+/**
+ * @hideinitializer
+ * @brief possible value of attribute argument for sync object type
+ */
+#define __itt_attr_mutex   2
+
+/**
+@brief Name a synchronization object
+@param[in] addr       Handle for the synchronization object. You should
+use a real address to uniquely identify the synchronization object.
+@param[in] objtype    null-terminated object type string. If NULL is
+passed, the name will be "User Synchronization".
+@param[in] objname    null-terminated object name string. If NULL,
+no name will be assigned to the object.
+@param[in] attribute  one of [#__itt_attr_barrier, #__itt_attr_mutex]
+ */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_sync_createA(void *addr, const char    *objtype, const char    *objname, int attribute);
+void ITTAPI __itt_sync_createW(void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_sync_create     __itt_sync_createW
+#  define __itt_sync_create_ptr __itt_sync_createW_ptr
+#else /* UNICODE */
+#  define __itt_sync_create     __itt_sync_createA
+#  define __itt_sync_create_ptr __itt_sync_createA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+void ITTAPI __itt_sync_create (void *addr, const char *objtype, const char *objname, int attribute);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, sync_createA, (void *addr, const char    *objtype, const char    *objname, int attribute))
+ITT_STUBV(ITTAPI, void, sync_createW, (void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, sync_create,  (void *addr, const char*    objtype, const char*    objname, int attribute))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_createA     ITTNOTIFY_VOID(sync_createA)
+#define __itt_sync_createA_ptr ITTNOTIFY_NAME(sync_createA)
+#define __itt_sync_createW     ITTNOTIFY_VOID(sync_createW)
+#define __itt_sync_createW_ptr ITTNOTIFY_NAME(sync_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_create     ITTNOTIFY_VOID(sync_create)
+#define __itt_sync_create_ptr ITTNOTIFY_NAME(sync_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_createA(addr, objtype, objname, attribute)
+#define __itt_sync_createA_ptr 0
+#define __itt_sync_createW(addr, objtype, objname, attribute)
+#define __itt_sync_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_create(addr, objtype, objname, attribute)
+#define __itt_sync_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_createA_ptr 0
+#define __itt_sync_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+@brief Rename a synchronization object
+
+You can use the rename call to assign or reassign a name to a given
+synchronization object.
+@param[in] addr  handle for the synchronization object.
+@param[in] name  null-terminated object name string.
+*/
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_sync_renameA(void *addr, const char    *name);
+void ITTAPI __itt_sync_renameW(void *addr, const wchar_t *name);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_sync_rename     __itt_sync_renameW
+#  define __itt_sync_rename_ptr __itt_sync_renameW_ptr
+#else /* UNICODE */
+#  define __itt_sync_rename     __itt_sync_renameA
+#  define __itt_sync_rename_ptr __itt_sync_renameA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+void ITTAPI __itt_sync_rename(void *addr, const char *name);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, sync_renameA, (void *addr, const char    *name))
+ITT_STUBV(ITTAPI, void, sync_renameW, (void *addr, const wchar_t *name))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, sync_rename,  (void *addr, const char    *name))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_renameA     ITTNOTIFY_VOID(sync_renameA)
+#define __itt_sync_renameA_ptr ITTNOTIFY_NAME(sync_renameA)
+#define __itt_sync_renameW     ITTNOTIFY_VOID(sync_renameW)
+#define __itt_sync_renameW_ptr ITTNOTIFY_NAME(sync_renameW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_rename     ITTNOTIFY_VOID(sync_rename)
+#define __itt_sync_rename_ptr ITTNOTIFY_NAME(sync_rename)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_renameA(addr, name)
+#define __itt_sync_renameA_ptr 0
+#define __itt_sync_renameW(addr, name)
+#define __itt_sync_renameW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_rename(addr, name)
+#define __itt_sync_rename_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_renameA_ptr 0
+#define __itt_sync_renameW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_rename_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ @brief Destroy a synchronization object.
+ @param addr Handle for the synchronization object.
+ */
+void ITTAPI __itt_sync_destroy(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, sync_destroy, (void *addr))
+#define __itt_sync_destroy     ITTNOTIFY_VOID(sync_destroy)
+#define __itt_sync_destroy_ptr ITTNOTIFY_NAME(sync_destroy)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_sync_destroy(addr)
+#define __itt_sync_destroy_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_sync_destroy_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/*****************************************************************//**
+ * @name group of functions is used for performance measurement tools
+ *********************************************************************/
+/** @{ */
+/**
+ * @brief Enter spin loop on user-defined sync object
+ */
+void ITTAPI __itt_sync_prepare(void* addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, sync_prepare, (void *addr))
+#define __itt_sync_prepare     ITTNOTIFY_VOID(sync_prepare)
+#define __itt_sync_prepare_ptr ITTNOTIFY_NAME(sync_prepare)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_sync_prepare(addr)
+#define __itt_sync_prepare_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_sync_prepare_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Quit spin loop without acquiring spin object
+ */
+void ITTAPI __itt_sync_cancel(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, sync_cancel, (void *addr))
+#define __itt_sync_cancel     ITTNOTIFY_VOID(sync_cancel)
+#define __itt_sync_cancel_ptr ITTNOTIFY_NAME(sync_cancel)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_sync_cancel(addr)
+#define __itt_sync_cancel_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_sync_cancel_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Successful spin loop completion (sync object acquired)
+ */
+void ITTAPI __itt_sync_acquired(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, sync_acquired, (void *addr))
+#define __itt_sync_acquired     ITTNOTIFY_VOID(sync_acquired)
+#define __itt_sync_acquired_ptr ITTNOTIFY_NAME(sync_acquired)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_sync_acquired(addr)
+#define __itt_sync_acquired_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_sync_acquired_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Start sync object releasing code. Is called before the lock release call.
+ */
+void ITTAPI __itt_sync_releasing(void* addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, sync_releasing, (void *addr))
+#define __itt_sync_releasing     ITTNOTIFY_VOID(sync_releasing)
+#define __itt_sync_releasing_ptr ITTNOTIFY_NAME(sync_releasing)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_sync_releasing(addr)
+#define __itt_sync_releasing_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_sync_releasing_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} */
+
+/** @} sync group */
+
+/**************************************************************//**
+ * @name group of functions is used for correctness checking tools
+ ******************************************************************/
+/** @{ */
+/**
+ * @ingroup legacy
+ * @deprecated Legacy API
+ * @brief Fast synchronization which does not require spinning.
+ * - This special function is to be used by TBB and OpenMP libraries only when they know
+ *   there is no spin but they need to suppress TC warnings about shared variable modifications.
+ * - It only has corresponding pointers in static library and does not have corresponding function
+ *   in dynamic library.
+ * @see void __itt_sync_prepare(void* addr);
+ */
+void ITTAPI __itt_fsync_prepare(void* addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, fsync_prepare, (void *addr))
+#define __itt_fsync_prepare     ITTNOTIFY_VOID(fsync_prepare)
+#define __itt_fsync_prepare_ptr ITTNOTIFY_NAME(fsync_prepare)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_fsync_prepare(addr)
+#define __itt_fsync_prepare_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_fsync_prepare_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup legacy
+ * @deprecated Legacy API
+ * @brief Fast synchronization which does not require spinning.
+ * - This special function is to be used by TBB and OpenMP libraries only when they know
+ *   there is no spin but they need to suppress TC warnings about shared variable modifications.
+ * - It only has corresponding pointers in static library and does not have corresponding function
+ *   in dynamic library.
+ * @see void __itt_sync_cancel(void *addr);
+ */
+void ITTAPI __itt_fsync_cancel(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, fsync_cancel, (void *addr))
+#define __itt_fsync_cancel     ITTNOTIFY_VOID(fsync_cancel)
+#define __itt_fsync_cancel_ptr ITTNOTIFY_NAME(fsync_cancel)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_fsync_cancel(addr)
+#define __itt_fsync_cancel_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_fsync_cancel_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup legacy
+ * @deprecated Legacy API
+ * @brief Fast synchronization which does not require spinning.
+ * - This special function is to be used by TBB and OpenMP libraries only when they know
+ *   there is no spin but they need to suppress TC warnings about shared variable modifications.
+ * - It only has corresponding pointers in the static library and does not have a corresponding function
+ *   in the dynamic library.
+ * @see void __itt_sync_acquired(void *addr);
+ */
+void ITTAPI __itt_fsync_acquired(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, fsync_acquired, (void *addr))
+#define __itt_fsync_acquired     ITTNOTIFY_VOID(fsync_acquired)
+#define __itt_fsync_acquired_ptr ITTNOTIFY_NAME(fsync_acquired)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_fsync_acquired(addr)
+#define __itt_fsync_acquired_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_fsync_acquired_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup legacy
+ * @deprecated Legacy API
+ * @brief Fast synchronization which does not require spinning.
+ * - This special function is to be used by TBB and OpenMP libraries only when they know
+ *   there is no spin but they need to suppress TC warnings about shared variable modifications.
+ * - It only has corresponding pointers in the static library and does not have a corresponding function
+ *   in the dynamic library.
+ * @see void __itt_sync_releasing(void* addr);
+ */
+void ITTAPI __itt_fsync_releasing(void* addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, fsync_releasing, (void *addr))
+#define __itt_fsync_releasing     ITTNOTIFY_VOID(fsync_releasing)
+#define __itt_fsync_releasing_ptr ITTNOTIFY_NAME(fsync_releasing)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_fsync_releasing(addr)
+#define __itt_fsync_releasing_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_fsync_releasing_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} */
+
+/**
+ * @defgroup model Modeling by Intel(R) Parallel Advisor
+ * @ingroup public
+ * This is the subset of itt used for modeling by Intel(R) Parallel Advisor.
+ * This API is called ONLY using annotate.h, by "Annotation" macros
+ * the user places in their sources during the parallelism modeling steps.
+ *
+ * site_begin/end and task_begin/end take the address of handle variables,
+ * which are writeable by the API.  Handles must be 0 initialized prior
+ * to the first call to begin, or may cause a run-time failure.
+ * The handles are initialized in a multi-thread safe way by the API if
+ * the handle is 0.  The commonly expected idiom is one static handle to
+ * identify a site or task.  If a site or task of the same name has already
+ * been started during this collection, the same handle MAY be returned,
+ * but is not required to be - it is unspecified if data merging is done
+ * based on name.  These routines also take an instance variable.  Like
+ * the lexical instance, these must be 0 initialized.  Unlike the lexical
+ * instance, this is used to track a single dynamic instance.
+ *
+ * API used by the Intel(R) Parallel Advisor to describe potential concurrency
+ * and related activities. User-added source annotations expand to calls
+ * to these procedures to enable modeling of a hypothetical concurrent
+ * execution serially.
+ * @{
+ */
+#if !defined(_ADVISOR_ANNOTATE_H_) || defined(ANNOTATE_EXPAND_NULL)
+
+typedef void* __itt_model_site;             /*!< @brief handle for lexical site     */
+typedef void* __itt_model_site_instance;    /*!< @brief handle for dynamic instance */
+typedef void* __itt_model_task;             /*!< @brief handle for lexical task     */
+typedef void* __itt_model_task_instance;    /*!< @brief handle for dynamic instance */
+
+/**
+ * @enum __itt_model_disable
+ * @brief Enumerator for the disable methods accepted by __itt_model_disable_push
+ */
+typedef enum {
+    __itt_model_disable_observation,
+    __itt_model_disable_collection
+} __itt_model_disable;
+
+#endif /* !_ADVISOR_ANNOTATE_H_ || ANNOTATE_EXPAND_NULL */
+
+/**
+ * @brief ANNOTATE_SITE_BEGIN/ANNOTATE_SITE_END support.
+ *
+ * site_begin/end model a potential concurrency site.
+ * Site instances may be recursively nested with themselves.
+ * site_end exits the most recently started but unended site for the current
+ * thread.  The handle passed to end may be used to validate structure.
+ * Instances of a site encountered on different threads concurrently
+ * are considered completely distinct. If the site names of two different
+ * lexical sites match, it is unspecified whether they are treated as the
+ * same or different for data presentation.
+ */
+void ITTAPI __itt_model_site_begin(__itt_model_site *site, __itt_model_site_instance *instance, const char *name);
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_model_site_beginW(const wchar_t *name);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+void ITTAPI __itt_model_site_beginA(const char *name);
+void ITTAPI __itt_model_site_beginAL(const char *name, size_t siteNameLen);
+void ITTAPI __itt_model_site_end  (__itt_model_site *site, __itt_model_site_instance *instance);
+void ITTAPI __itt_model_site_end_2(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_site_begin,  (__itt_model_site *site, __itt_model_site_instance *instance, const char *name))
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, model_site_beginW,  (const wchar_t *name))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, model_site_beginA,  (const char *name))
+ITT_STUBV(ITTAPI, void, model_site_beginAL,  (const char *name, size_t siteNameLen))
+ITT_STUBV(ITTAPI, void, model_site_end,    (__itt_model_site *site, __itt_model_site_instance *instance))
+ITT_STUBV(ITTAPI, void, model_site_end_2,  (void))
+#define __itt_model_site_begin      ITTNOTIFY_VOID(model_site_begin)
+#define __itt_model_site_begin_ptr  ITTNOTIFY_NAME(model_site_begin)
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_model_site_beginW      ITTNOTIFY_VOID(model_site_beginW)
+#define __itt_model_site_beginW_ptr  ITTNOTIFY_NAME(model_site_beginW)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_model_site_beginA      ITTNOTIFY_VOID(model_site_beginA)
+#define __itt_model_site_beginA_ptr  ITTNOTIFY_NAME(model_site_beginA)
+#define __itt_model_site_beginAL      ITTNOTIFY_VOID(model_site_beginAL)
+#define __itt_model_site_beginAL_ptr  ITTNOTIFY_NAME(model_site_beginAL)
+#define __itt_model_site_end        ITTNOTIFY_VOID(model_site_end)
+#define __itt_model_site_end_ptr    ITTNOTIFY_NAME(model_site_end)
+#define __itt_model_site_end_2        ITTNOTIFY_VOID(model_site_end_2)
+#define __itt_model_site_end_2_ptr    ITTNOTIFY_NAME(model_site_end_2)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_site_begin(site, instance, name)
+#define __itt_model_site_begin_ptr  0
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_model_site_beginW(name)
+#define __itt_model_site_beginW_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_model_site_beginA(name)
+#define __itt_model_site_beginA_ptr  0
+#define __itt_model_site_beginAL(name, siteNameLen)
+#define __itt_model_site_beginAL_ptr  0
+#define __itt_model_site_end(site, instance)
+#define __itt_model_site_end_ptr    0
+#define __itt_model_site_end_2()
+#define __itt_model_site_end_2_ptr    0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_model_site_begin_ptr  0
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_model_site_beginW_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_model_site_beginA_ptr  0
+#define __itt_model_site_beginAL_ptr  0
+#define __itt_model_site_end_ptr    0
+#define __itt_model_site_end_2_ptr    0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief ANNOTATE_TASK_BEGIN/ANNOTATE_TASK_END support
+ *
+ * task_begin/end model a potential task, which is contained within the most closely enclosing
+ * dynamic site.  task_end exits the most recently started but unended task.  The handle passed
+ * to end may be used to validate structure.  It is unspecified if bad dynamic nesting is
+ * detected.  If it is, it should be encoded in the resulting data collection.  The collector
+ * should not fail due to construct nesting issues, nor attempt to directly indicate the problem.
+ */
+void ITTAPI __itt_model_task_begin(__itt_model_task *task, __itt_model_task_instance *instance, const char *name);
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_model_task_beginW(const wchar_t *name);
+void ITTAPI __itt_model_iteration_taskW(const wchar_t *name);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+void ITTAPI __itt_model_task_beginA(const char *name);
+void ITTAPI __itt_model_task_beginAL(const char *name, size_t taskNameLen);
+void ITTAPI __itt_model_iteration_taskA(const char *name);
+void ITTAPI __itt_model_iteration_taskAL(const char *name, size_t taskNameLen);
+void ITTAPI __itt_model_task_end  (__itt_model_task *task, __itt_model_task_instance *instance);
+void ITTAPI __itt_model_task_end_2(void);
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_task_begin,  (__itt_model_task *task, __itt_model_task_instance *instance, const char *name))
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, model_task_beginW,  (const wchar_t *name))
+ITT_STUBV(ITTAPI, void, model_iteration_taskW, (const wchar_t *name))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, model_task_beginA,  (const char *name))
+ITT_STUBV(ITTAPI, void, model_task_beginAL,  (const char *name, size_t taskNameLen))
+ITT_STUBV(ITTAPI, void, model_iteration_taskA,  (const char *name))
+ITT_STUBV(ITTAPI, void, model_iteration_taskAL,  (const char *name, size_t taskNameLen))
+ITT_STUBV(ITTAPI, void, model_task_end,    (__itt_model_task *task, __itt_model_task_instance *instance))
+ITT_STUBV(ITTAPI, void, model_task_end_2,  (void))
+#define __itt_model_task_begin      ITTNOTIFY_VOID(model_task_begin)
+#define __itt_model_task_begin_ptr  ITTNOTIFY_NAME(model_task_begin)
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_model_task_beginW     ITTNOTIFY_VOID(model_task_beginW)
+#define __itt_model_task_beginW_ptr ITTNOTIFY_NAME(model_task_beginW)
+#define __itt_model_iteration_taskW     ITTNOTIFY_VOID(model_iteration_taskW)
+#define __itt_model_iteration_taskW_ptr ITTNOTIFY_NAME(model_iteration_taskW)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_model_task_beginA    ITTNOTIFY_VOID(model_task_beginA)
+#define __itt_model_task_beginA_ptr ITTNOTIFY_NAME(model_task_beginA)
+#define __itt_model_task_beginAL    ITTNOTIFY_VOID(model_task_beginAL)
+#define __itt_model_task_beginAL_ptr ITTNOTIFY_NAME(model_task_beginAL)
+#define __itt_model_iteration_taskA    ITTNOTIFY_VOID(model_iteration_taskA)
+#define __itt_model_iteration_taskA_ptr ITTNOTIFY_NAME(model_iteration_taskA)
+#define __itt_model_iteration_taskAL    ITTNOTIFY_VOID(model_iteration_taskAL)
+#define __itt_model_iteration_taskAL_ptr ITTNOTIFY_NAME(model_iteration_taskAL)
+#define __itt_model_task_end        ITTNOTIFY_VOID(model_task_end)
+#define __itt_model_task_end_ptr    ITTNOTIFY_NAME(model_task_end)
+#define __itt_model_task_end_2        ITTNOTIFY_VOID(model_task_end_2)
+#define __itt_model_task_end_2_ptr    ITTNOTIFY_NAME(model_task_end_2)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_task_begin(task, instance, name)
+#define __itt_model_task_begin_ptr  0
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_model_task_beginW(name)
+#define __itt_model_task_beginW_ptr  0
+#define __itt_model_iteration_taskW(name) /* no-op, like every other entry point in this branch */
+#define __itt_model_iteration_taskW_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_model_task_beginA(name)
+#define __itt_model_task_beginA_ptr  0
+#define __itt_model_task_beginAL(name, taskNameLen)
+#define __itt_model_task_beginAL_ptr  0
+#define __itt_model_iteration_taskA(name)
+#define __itt_model_iteration_taskA_ptr  0
+#define __itt_model_iteration_taskAL(name, taskNameLen)
+#define __itt_model_iteration_taskAL_ptr  0
+#define __itt_model_task_end(task, instance)
+#define __itt_model_task_end_ptr    0
+#define __itt_model_task_end_2()
+#define __itt_model_task_end_2_ptr    0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_model_task_begin_ptr  0
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_model_task_beginW_ptr 0
+#define __itt_model_iteration_taskW_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_model_task_beginA_ptr  0
+#define __itt_model_task_beginAL_ptr  0
+#define __itt_model_iteration_taskA_ptr    0
+#define __itt_model_iteration_taskAL_ptr    0
+#define __itt_model_task_end_ptr    0
+#define __itt_model_task_end_2_ptr    0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief ANNOTATE_LOCK_ACQUIRE/ANNOTATE_LOCK_RELEASE support
+ *
+ * lock_acquire/release model a potential lock for both lockset and
+ * performance modeling.  Each unique address is modeled as a separate
+ * lock; even invalid addresses are valid lock IDs.  Specifically:
+ * no storage is accessed by the API at the specified address - it is only
+ * used for lock identification.  Lock acquires may be self-nested and are
+ * unlocked by a corresponding number of releases.
+ * (These closely correspond to __itt_sync_acquired/__itt_sync_releasing,
+ * but may not have identical semantics.)
+ */
+void ITTAPI __itt_model_lock_acquire(void *lock);
+void ITTAPI __itt_model_lock_acquire_2(void *lock);
+void ITTAPI __itt_model_lock_release(void *lock);
+void ITTAPI __itt_model_lock_release_2(void *lock);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_lock_acquire, (void *lock))
+ITT_STUBV(ITTAPI, void, model_lock_acquire_2, (void *lock))
+ITT_STUBV(ITTAPI, void, model_lock_release, (void *lock))
+ITT_STUBV(ITTAPI, void, model_lock_release_2, (void *lock))
+#define __itt_model_lock_acquire     ITTNOTIFY_VOID(model_lock_acquire)
+#define __itt_model_lock_acquire_ptr ITTNOTIFY_NAME(model_lock_acquire)
+#define __itt_model_lock_acquire_2     ITTNOTIFY_VOID(model_lock_acquire_2)
+#define __itt_model_lock_acquire_2_ptr ITTNOTIFY_NAME(model_lock_acquire_2)
+#define __itt_model_lock_release     ITTNOTIFY_VOID(model_lock_release)
+#define __itt_model_lock_release_ptr ITTNOTIFY_NAME(model_lock_release)
+#define __itt_model_lock_release_2     ITTNOTIFY_VOID(model_lock_release_2)
+#define __itt_model_lock_release_2_ptr ITTNOTIFY_NAME(model_lock_release_2)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_lock_acquire(lock)
+#define __itt_model_lock_acquire_ptr 0
+#define __itt_model_lock_acquire_2(lock)
+#define __itt_model_lock_acquire_2_ptr 0
+#define __itt_model_lock_release(lock)
+#define __itt_model_lock_release_ptr 0
+#define __itt_model_lock_release_2(lock)
+#define __itt_model_lock_release_2_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_model_lock_acquire_ptr 0
+#define __itt_model_lock_acquire_2_ptr 0
+#define __itt_model_lock_release_ptr 0
+#define __itt_model_lock_release_2_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief ANNOTATE_RECORD_ALLOCATION/ANNOTATE_RECORD_DEALLOCATION support
+ *
+ * record_allocation(addr, size)/record_deallocation(addr) describe user-defined memory
+ * allocator behavior, which may be required for correctness modeling to understand
+ * when storage is not expected to be actually reused across threads.
+ */
+void ITTAPI __itt_model_record_allocation  (void *addr, size_t size);
+void ITTAPI __itt_model_record_deallocation(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_record_allocation,   (void *addr, size_t size))
+ITT_STUBV(ITTAPI, void, model_record_deallocation, (void *addr))
+#define __itt_model_record_allocation       ITTNOTIFY_VOID(model_record_allocation)
+#define __itt_model_record_allocation_ptr   ITTNOTIFY_NAME(model_record_allocation)
+#define __itt_model_record_deallocation     ITTNOTIFY_VOID(model_record_deallocation)
+#define __itt_model_record_deallocation_ptr ITTNOTIFY_NAME(model_record_deallocation)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_record_allocation(addr, size)
+#define __itt_model_record_allocation_ptr   0
+#define __itt_model_record_deallocation(addr)
+#define __itt_model_record_deallocation_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_model_record_allocation_ptr   0
+#define __itt_model_record_deallocation_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief ANNOTATE_INDUCTION_USES support
+ *
+ * Note that particular storage (addr, size) is inductive through the end of the current site.
+ */
+void ITTAPI __itt_model_induction_uses(void* addr, size_t size);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_induction_uses, (void *addr, size_t size))
+#define __itt_model_induction_uses     ITTNOTIFY_VOID(model_induction_uses)
+#define __itt_model_induction_uses_ptr ITTNOTIFY_NAME(model_induction_uses)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_induction_uses(addr, size)
+#define __itt_model_induction_uses_ptr   0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_model_induction_uses_ptr   0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief ANNOTATE_REDUCTION_USES support
+ *
+ * Note that particular storage (addr, size) is used for reduction through the end
+ * of the current site.
+ */
+void ITTAPI __itt_model_reduction_uses(void* addr, size_t size);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_reduction_uses, (void *addr, size_t size))
+#define __itt_model_reduction_uses     ITTNOTIFY_VOID(model_reduction_uses)
+#define __itt_model_reduction_uses_ptr ITTNOTIFY_NAME(model_reduction_uses)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_reduction_uses(addr, size)
+#define __itt_model_reduction_uses_ptr   0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_model_reduction_uses_ptr   0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief ANNOTATE_OBSERVE_USES support
+ *
+ * Have correctness modeling record observations about uses of storage (addr, size)
+ * through the end of the current site.
+ */
+void ITTAPI __itt_model_observe_uses(void* addr, size_t size);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_observe_uses, (void *addr, size_t size))
+#define __itt_model_observe_uses     ITTNOTIFY_VOID(model_observe_uses)
+#define __itt_model_observe_uses_ptr ITTNOTIFY_NAME(model_observe_uses)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_observe_uses(addr, size)
+#define __itt_model_observe_uses_ptr   0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_model_observe_uses_ptr   0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief ANNOTATE_CLEAR_USES support
+ *
+ * Clear the special handling of a piece of storage (addr) related to induction,
+ * reduction or observe_uses.
+ */
+void ITTAPI __itt_model_clear_uses(void* addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_clear_uses, (void *addr))
+#define __itt_model_clear_uses     ITTNOTIFY_VOID(model_clear_uses)
+#define __itt_model_clear_uses_ptr ITTNOTIFY_NAME(model_clear_uses)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_clear_uses(addr)
+#define __itt_model_clear_uses_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_model_clear_uses_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief ANNOTATE_DISABLE_*_PUSH/ANNOTATE_DISABLE_*_POP support
+ *
+ * disable_push/disable_pop push and pop disabling based on a parameter.
+ * Disabling observations stops processing of memory references during
+ * correctness modeling, and all annotations that occur in the disabled
+ * region.  This allows description of code that is expected to be handled
+ * specially during conversion to parallelism or that is not recognized
+ * by tools (e.g. some kinds of synchronization operations).
+ * This mechanism causes all annotations in the disabled region, other
+ * than disable_push and disable_pop, to be ignored.  (For example, this
+ * might validly be used to disable an entire parallel site and the contained
+ * tasks and locking in it for data collection purposes.)
+ * The disable for collection is a more expensive operation, but reduces
+ * collector overhead significantly.  This applies to BOTH correctness data
+ * collection and performance data collection.  For example, a site
+ * containing a task might only enable data collection for the first 10
+ * iterations.  Both performance and correctness data should reflect this,
+ * and the program should run as close to full speed as possible when
+ * collection is disabled.
+ */
+void ITTAPI __itt_model_disable_push(__itt_model_disable x);
+void ITTAPI __itt_model_disable_pop(void);
+void ITTAPI __itt_model_aggregate_task(size_t x); /* NOTE(review): not described by the comment above */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, model_disable_push, (__itt_model_disable x))
+ITT_STUBV(ITTAPI, void, model_disable_pop,  (void))
+ITT_STUBV(ITTAPI, void, model_aggregate_task, (size_t x))
+#define __itt_model_disable_push     ITTNOTIFY_VOID(model_disable_push)
+#define __itt_model_disable_push_ptr ITTNOTIFY_NAME(model_disable_push)
+#define __itt_model_disable_pop      ITTNOTIFY_VOID(model_disable_pop)
+#define __itt_model_disable_pop_ptr  ITTNOTIFY_NAME(model_disable_pop)
+#define __itt_model_aggregate_task      ITTNOTIFY_VOID(model_aggregate_task)
+#define __itt_model_aggregate_task_ptr  ITTNOTIFY_NAME(model_aggregate_task)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_model_disable_push(x)
+#define __itt_model_disable_push_ptr 0
+#define __itt_model_disable_pop()
+#define __itt_model_disable_pop_ptr 0
+#define __itt_model_aggregate_task(x)
+#define __itt_model_aggregate_task_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_model_disable_push_ptr 0
+#define __itt_model_disable_pop_ptr 0
+#define __itt_model_aggregate_task_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} model group */
+
+/**
+ * @defgroup heap Heap
+ * @ingroup public
+ * Heap group
+ * @{
+ */
+
+typedef void* __itt_heap_function; /*!< @brief handle identifying a heap function */
+
+/**
+ * @brief Create an identifier for a heap function, given its name and domain
+ * @return non-zero identifier or NULL
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_heap_function ITTAPI __itt_heap_function_createA(const char*    name, const char*    domain);
+__itt_heap_function ITTAPI __itt_heap_function_createW(const wchar_t* name, const wchar_t* domain);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_heap_function_create     __itt_heap_function_createW
+#  define __itt_heap_function_create_ptr __itt_heap_function_createW_ptr
+#else
+#  define __itt_heap_function_create     __itt_heap_function_createA
+#  define __itt_heap_function_create_ptr __itt_heap_function_createA_ptr
+#endif /* UNICODE */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_heap_function ITTAPI __itt_heap_function_create(const char* name, const char* domain);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createA, (const char*    name, const char*    domain))
+ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createW, (const wchar_t* name, const wchar_t* domain))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_heap_function, heap_function_create,  (const char*    name, const char*    domain))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_heap_function_createA     ITTNOTIFY_DATA(heap_function_createA)
+#define __itt_heap_function_createA_ptr ITTNOTIFY_NAME(heap_function_createA)
+#define __itt_heap_function_createW     ITTNOTIFY_DATA(heap_function_createW)
+#define __itt_heap_function_createW_ptr ITTNOTIFY_NAME(heap_function_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_heap_function_create      ITTNOTIFY_DATA(heap_function_create)
+#define __itt_heap_function_create_ptr  ITTNOTIFY_NAME(heap_function_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_heap_function_createA(name, domain) (__itt_heap_function)0
+#define __itt_heap_function_createA_ptr 0
+#define __itt_heap_function_createW(name, domain) (__itt_heap_function)0
+#define __itt_heap_function_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_heap_function_create(name, domain)  (__itt_heap_function)0
+#define __itt_heap_function_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_heap_function_createA_ptr 0
+#define __itt_heap_function_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_heap_function_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record an allocation begin occurrence (heap function h, size, initialized flag).
+ */
+void ITTAPI __itt_heap_allocate_begin(__itt_heap_function h, size_t size, int initialized);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_allocate_begin, (__itt_heap_function h, size_t size, int initialized))
+#define __itt_heap_allocate_begin     ITTNOTIFY_VOID(heap_allocate_begin)
+#define __itt_heap_allocate_begin_ptr ITTNOTIFY_NAME(heap_allocate_begin)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_allocate_begin(h, size, initialized)
+#define __itt_heap_allocate_begin_ptr   0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_allocate_begin_ptr   0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record an allocation end occurrence (heap function h, addr, size, initialized flag).
+ */
+void ITTAPI __itt_heap_allocate_end(__itt_heap_function h, void** addr, size_t size, int initialized);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_allocate_end, (__itt_heap_function h, void** addr, size_t size, int initialized))
+#define __itt_heap_allocate_end     ITTNOTIFY_VOID(heap_allocate_end)
+#define __itt_heap_allocate_end_ptr ITTNOTIFY_NAME(heap_allocate_end)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_allocate_end(h, addr, size, initialized)
+#define __itt_heap_allocate_end_ptr   0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_allocate_end_ptr   0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record a free begin occurrence.
+ */
+void ITTAPI __itt_heap_free_begin(__itt_heap_function h, void* addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_free_begin, (__itt_heap_function h, void* addr))
+#define __itt_heap_free_begin     ITTNOTIFY_VOID(heap_free_begin)
+#define __itt_heap_free_begin_ptr ITTNOTIFY_NAME(heap_free_begin)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_free_begin(h, addr)
+#define __itt_heap_free_begin_ptr   0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_free_begin_ptr   0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record a free end occurrence.
+ */
+void ITTAPI __itt_heap_free_end(__itt_heap_function h, void* addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_free_end, (__itt_heap_function h, void* addr))
+#define __itt_heap_free_end     ITTNOTIFY_VOID(heap_free_end)
+#define __itt_heap_free_end_ptr ITTNOTIFY_NAME(heap_free_end)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_free_end(h, addr)
+#define __itt_heap_free_end_ptr   0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_free_end_ptr   0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record a reallocation begin occurrence.
+ */
+void ITTAPI __itt_heap_reallocate_begin(__itt_heap_function h, void* addr, size_t new_size, int initialized);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_reallocate_begin, (__itt_heap_function h, void* addr, size_t new_size, int initialized))
+#define __itt_heap_reallocate_begin     ITTNOTIFY_VOID(heap_reallocate_begin)
+#define __itt_heap_reallocate_begin_ptr ITTNOTIFY_NAME(heap_reallocate_begin)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_reallocate_begin(h, addr, new_size, initialized)
+#define __itt_heap_reallocate_begin_ptr   0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_reallocate_begin_ptr   0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record a reallocation end occurrence.
+ */
+void ITTAPI __itt_heap_reallocate_end(__itt_heap_function h, void* addr, void** new_addr, size_t new_size, int initialized);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_reallocate_end, (__itt_heap_function h, void* addr, void** new_addr, size_t new_size, int initialized))
+#define __itt_heap_reallocate_end     ITTNOTIFY_VOID(heap_reallocate_end)
+#define __itt_heap_reallocate_end_ptr ITTNOTIFY_NAME(heap_reallocate_end)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_reallocate_end(h, addr, new_addr, new_size, initialized)
+#define __itt_heap_reallocate_end_ptr   0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_reallocate_end_ptr   0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @brief Internal access begin (see also __itt_heap_internal_access_end). */
+void ITTAPI __itt_heap_internal_access_begin(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_internal_access_begin,  (void))
+#define __itt_heap_internal_access_begin      ITTNOTIFY_VOID(heap_internal_access_begin)
+#define __itt_heap_internal_access_begin_ptr  ITTNOTIFY_NAME(heap_internal_access_begin)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_internal_access_begin()
+#define __itt_heap_internal_access_begin_ptr  0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_internal_access_begin_ptr  0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @brief Internal access end (see also __itt_heap_internal_access_begin). */
+void ITTAPI __itt_heap_internal_access_end(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_internal_access_end, (void))
+#define __itt_heap_internal_access_end     ITTNOTIFY_VOID(heap_internal_access_end)
+#define __itt_heap_internal_access_end_ptr ITTNOTIFY_NAME(heap_internal_access_end)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_heap_internal_access_end()
+#define __itt_heap_internal_access_end_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_heap_internal_access_end_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @brief Begin recording memory growth; paired with __itt_heap_record_memory_growth_end. Takes no arguments. */
+void ITTAPI __itt_heap_record_memory_growth_begin(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_record_memory_growth_begin,  (void))
+#define __itt_heap_record_memory_growth_begin      ITTNOTIFY_VOID(heap_record_memory_growth_begin)
+#define __itt_heap_record_memory_growth_begin_ptr  ITTNOTIFY_NAME(heap_record_memory_growth_begin)
+#else  /* INTEL_NO_ITTNOTIFY_API: the call expands to nothing and the _ptr macro to 0 */
+#define __itt_heap_record_memory_growth_begin()
+#define __itt_heap_record_memory_growth_begin_ptr  0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY: only the _ptr macro is defined (as 0) */
+#define __itt_heap_record_memory_growth_begin_ptr  0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @brief End recording memory growth; paired with __itt_heap_record_memory_growth_begin. Takes no arguments. */
+void ITTAPI __itt_heap_record_memory_growth_end(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_record_memory_growth_end, (void))
+#define __itt_heap_record_memory_growth_end     ITTNOTIFY_VOID(heap_record_memory_growth_end)
+#define __itt_heap_record_memory_growth_end_ptr ITTNOTIFY_NAME(heap_record_memory_growth_end)
+#else  /* INTEL_NO_ITTNOTIFY_API: the call expands to nothing and the _ptr macro to 0 */
+#define __itt_heap_record_memory_growth_end()
+#define __itt_heap_record_memory_growth_end_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY: only the _ptr macro is defined (as 0) */
+#define __itt_heap_record_memory_growth_end_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Specify the type of heap detection/reporting to modify.
+ */
+/**
+ * @hideinitializer
+ * @brief Report on memory leaks.
+ */
+#define __itt_heap_leaks 0x00000001
+
+/**
+ * @hideinitializer
+ * @brief Report on memory growth.
+ */
+#define __itt_heap_growth 0x00000002
+
+
+/** @brief Reset heap detection; reset_mask selects what to reset (see __itt_heap_leaks, __itt_heap_growth). */
+void ITTAPI __itt_heap_reset_detection(unsigned int reset_mask);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_reset_detection,  (unsigned int reset_mask))
+#define __itt_heap_reset_detection      ITTNOTIFY_VOID(heap_reset_detection)
+#define __itt_heap_reset_detection_ptr  ITTNOTIFY_NAME(heap_reset_detection)
+#else  /* INTEL_NO_ITTNOTIFY_API: the call expands to nothing; the macro must accept the same one argument as the function */
+#define __itt_heap_reset_detection(reset_mask)
+#define __itt_heap_reset_detection_ptr  0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY: only the _ptr macro is defined (as 0) */
+#define __itt_heap_reset_detection_ptr  0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @brief Request a heap report; record_mask selects what to report (see __itt_heap_leaks, __itt_heap_growth). */
+void ITTAPI __itt_heap_record(unsigned int record_mask);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, heap_record, (unsigned int record_mask))
+#define __itt_heap_record     ITTNOTIFY_VOID(heap_record)
+#define __itt_heap_record_ptr ITTNOTIFY_NAME(heap_record)
+#else  /* INTEL_NO_ITTNOTIFY_API: the call expands to nothing; the macro must accept the same one argument as the function */
+#define __itt_heap_record(record_mask)
+#define __itt_heap_record_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY: only the _ptr macro is defined (as 0) */
+#define __itt_heap_record_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @} heap group */
+/** @endcond */
+/* ========================================================================== */
+
+/**
+ * @defgroup domains Domains
+ * @ingroup public
+ * Domains group
+ * @{
+ */
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+
+typedef struct ___itt_domain
+{
+    volatile int flags; /*!< Zero if disabled, non-zero if enabled. The meaning of different non-zero values is reserved to the runtime */
+    const char* nameA;  /*!< Copy of original name in ASCII. */
+#if defined(UNICODE) || defined(_UNICODE)
+    const wchar_t* nameW; /*!< Copy of original name in UNICODE. */
+#else  /* UNICODE || _UNICODE */
+    void* nameW; /*!< Untyped stand-in for nameW when neither UNICODE nor _UNICODE is defined. */
+#endif /* UNICODE || _UNICODE */
+    int   extra1; /*!< Reserved to the runtime */
+    void* extra2; /*!< Reserved to the runtime */
+    struct ___itt_domain* next; /*!< Pointer to another domain record; presumably the next entry in the runtime's list -- confirm. */
+} __itt_domain;
+
+#pragma pack(pop)
+/** @endcond */
+
+/**
+ * @ingroup domains
+ * @brief Create a domain.
+ * Create domain using some domain name: the URI naming style is recommended.
+ * Because the set of domains is expected to be static over the application's
+ * execution time, there is no mechanism to destroy a domain.
+ * Any domain can be accessed by any thread in the process, regardless of
+ * which thread created the domain. This call is thread-safe.
+ * @param[in] name name of domain
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_domain* ITTAPI __itt_domain_createA(const char    *name);
+__itt_domain* ITTAPI __itt_domain_createW(const wchar_t *name);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_domain_create     __itt_domain_createW
+#  define __itt_domain_create_ptr __itt_domain_createW_ptr
+#else /* UNICODE */
+#  define __itt_domain_create     __itt_domain_createA
+#  define __itt_domain_create_ptr __itt_domain_createA_ptr
+#endif /* UNICODE */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_domain* ITTAPI __itt_domain_create(const char *name);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_domain*, domain_createA, (const char    *name))
+ITT_STUB(ITTAPI, __itt_domain*, domain_createW, (const wchar_t *name))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_domain*, domain_create,  (const char    *name))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_domain_createA     ITTNOTIFY_DATA(domain_createA)
+#define __itt_domain_createA_ptr ITTNOTIFY_NAME(domain_createA)
+#define __itt_domain_createW     ITTNOTIFY_DATA(domain_createW)
+#define __itt_domain_createW_ptr ITTNOTIFY_NAME(domain_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_domain_create     ITTNOTIFY_DATA(domain_create)
+#define __itt_domain_create_ptr ITTNOTIFY_NAME(domain_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API: calls evaluate to a null __itt_domain pointer; fully parenthesized so the cast cannot bind to neighbouring operators */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_domain_createA(name) ((__itt_domain*)0)
+#define __itt_domain_createA_ptr 0
+#define __itt_domain_createW(name) ((__itt_domain*)0)
+#define __itt_domain_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_domain_create(name)  ((__itt_domain*)0)
+#define __itt_domain_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY: only the _ptr macros are defined (as 0) */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_domain_createA_ptr 0
+#define __itt_domain_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_domain_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} domains group */
+
+/**
+ * @defgroup ids IDs
+ * @ingroup public
+ * IDs group
+ * @{
+ */
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+
+typedef struct ___itt_id
+{
+    unsigned long long d1, d2, d3; /*!< As filled by __itt_id_make(): d1 = object address, d2 = extra value, d3 = reserved (zero). */
+} __itt_id;
+
+#pragma pack(pop)
+/** @endcond */
+
+static const __itt_id __itt_null = { 0, 0, 0 }; /*!< All-zero ID; the API documentation uses it to mean "no ID" (for example, no parent). */
+
+/**
+ * @ingroup ids
+ * @brief Build an __itt_id value from an address and an extra discriminator.
+ * This helper only fills in an __itt_id structure; it does not affect the
+ * collector runtime in any way. An ID made here must still be created with the
+ * __itt_id_create function before it is used to identify a named entity.
+ * The result carries addr in d1, extra in d2, and zero in the reserved d3 field.
+ * @param[in] addr The address of the object; high QWORD of the ID value.
+ * @param[in] extra The extra data to uniquely identify the object; low QWORD of the ID value.
+ */
+
+ITT_INLINE __itt_id ITTAPI __itt_id_make(void* addr, unsigned long long extra) ITT_INLINE_ATTRIBUTE;
+ITT_INLINE __itt_id ITTAPI __itt_id_make(void* addr, unsigned long long extra)
+{
+    __itt_id made = __itt_null; /* start from the all-zero ID */
+    made.d3 = 0ULL;             /* Reserved. Must be zero */
+    made.d2 = extra;
+    made.d1 = (unsigned long long)(uintptr_t)addr;
+    return made;
+}
+
+/**
+ * @ingroup ids
+ * @brief Create an instance of identifier.
+ * This establishes the beginning of the lifetime of an instance of
+ * the given ID in the trace. Once this lifetime starts, the ID
+ * can be used to tag named entity instances in calls such as
+ * __itt_task_begin, and to specify relationships among
+ * identified named entity instances, using the \ref relations APIs.
+ * Instance IDs are not domain specific!
+ * @param[in] domain The domain controlling the execution of this call.
+ * @param[in] id The ID to create.
+ */
+void ITTAPI __itt_id_create(const __itt_domain *domain, __itt_id id);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, id_create, (const __itt_domain *domain, __itt_id id))
+#define __itt_id_create(d,x) ITTNOTIFY_VOID_D1(id_create,d,x)
+#define __itt_id_create_ptr  ITTNOTIFY_NAME(id_create)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_id_create(domain,id)
+#define __itt_id_create_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_id_create_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup ids
+ * @brief Destroy an instance of identifier.
+ * This ends the lifetime of the current instance of the given ID value in the trace.
+ * Any relationships that are established after this lifetime ends are invalid.
+ * This call must be performed before the given ID value can be reused for a different
+ * named entity instance.
+ * @param[in] domain The domain controlling the execution of this call.
+ * @param[in] id The ID to destroy.
+ */
+void ITTAPI __itt_id_destroy(const __itt_domain *domain, __itt_id id);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, id_destroy, (const __itt_domain *domain, __itt_id id))
+#define __itt_id_destroy(d,x) ITTNOTIFY_VOID_D1(id_destroy,d,x)
+#define __itt_id_destroy_ptr  ITTNOTIFY_NAME(id_destroy)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_id_destroy(domain,id)
+#define __itt_id_destroy_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_id_destroy_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} ids group */
+
+/**
+ * @defgroup handles String Handles
+ * @ingroup public
+ * String Handles group
+ * @{
+ */
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+
+typedef struct ___itt_string_handle
+{
+    const char* strA; /*!< Copy of original string in ASCII. */
+#if defined(UNICODE) || defined(_UNICODE)
+    const wchar_t* strW; /*!< Copy of original string in UNICODE. */
+#else  /* UNICODE || _UNICODE */
+    void* strW; /*!< Untyped stand-in for strW when neither UNICODE nor _UNICODE is defined. */
+#endif /* UNICODE || _UNICODE */
+    int   extra1; /*!< Reserved. Must be zero   */
+    void* extra2; /*!< Reserved. Must be zero   */
+    struct ___itt_string_handle* next; /*!< Pointer to another string-handle record; presumably the next entry in the runtime's list -- confirm. */
+} __itt_string_handle;
+
+#pragma pack(pop)
+/** @endcond */
+
+/**
+ * @ingroup handles
+ * @brief Create a string handle.
+ * Create and return handle value that can be associated with a string.
+ * Consecutive calls to __itt_string_handle_create with the same name
+ * return the same value. Because the set of string handles is expected to remain
+ * static during the application's execution time, there is no mechanism to destroy a string handle.
+ * Any string handle can be accessed by any thread in the process, regardless of which thread created
+ * the string handle. This call is thread-safe.
+ * @param[in] name The input string
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_string_handle* ITTAPI __itt_string_handle_createA(const char    *name);
+__itt_string_handle* ITTAPI __itt_string_handle_createW(const wchar_t *name);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_string_handle_create     __itt_string_handle_createW
+#  define __itt_string_handle_create_ptr __itt_string_handle_createW_ptr
+#else /* UNICODE */
+#  define __itt_string_handle_create     __itt_string_handle_createA
+#  define __itt_string_handle_create_ptr __itt_string_handle_createA_ptr
+#endif /* UNICODE */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_string_handle* ITTAPI __itt_string_handle_create(const char *name);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createA, (const char    *name))
+ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createW, (const wchar_t *name))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_create,  (const char    *name))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_string_handle_createA     ITTNOTIFY_DATA(string_handle_createA)
+#define __itt_string_handle_createA_ptr ITTNOTIFY_NAME(string_handle_createA)
+#define __itt_string_handle_createW     ITTNOTIFY_DATA(string_handle_createW)
+#define __itt_string_handle_createW_ptr ITTNOTIFY_NAME(string_handle_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_string_handle_create     ITTNOTIFY_DATA(string_handle_create)
+#define __itt_string_handle_create_ptr ITTNOTIFY_NAME(string_handle_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API: calls evaluate to a null __itt_string_handle pointer; fully parenthesized so the cast cannot bind to neighbouring operators */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_string_handle_createA(name) ((__itt_string_handle*)0)
+#define __itt_string_handle_createA_ptr 0
+#define __itt_string_handle_createW(name) ((__itt_string_handle*)0)
+#define __itt_string_handle_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_string_handle_create(name)  ((__itt_string_handle*)0)
+#define __itt_string_handle_create_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY: only the _ptr macros are defined (as 0) */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_string_handle_createA_ptr 0
+#define __itt_string_handle_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_string_handle_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} handles group */
+
+/** @cond exclude_from_documentation */
+typedef unsigned long long __itt_timestamp;
+/** @endcond */
+
+#define __itt_timestamp_none ((__itt_timestamp)-1LL)
+
+/** @cond exclude_from_gpa_documentation */
+
+/**
+ * @ingroup timestamps
+ * @brief Return timestamp corresponding to the current moment.
+ * This returns the timestamp in the format that is the most relevant for the current
+ * host or platform (RDTSC, QPC, and others). You can use the "<" operator to
+ * compare __itt_timestamp values. When INTEL_NO_ITTNOTIFY_API is defined the call evaluates to (__itt_timestamp)0.
+ */
+__itt_timestamp ITTAPI __itt_get_timestamp(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_timestamp, get_timestamp, (void))
+#define __itt_get_timestamp      ITTNOTIFY_DATA(get_timestamp)
+#define __itt_get_timestamp_ptr  ITTNOTIFY_NAME(get_timestamp)
+#else  /* INTEL_NO_ITTNOTIFY_API: the call must still yield a value, like the other value-returning stubs in this header */
+#define __itt_get_timestamp() ((__itt_timestamp)0)
+#define __itt_get_timestamp_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY: only the _ptr macro is defined (as 0) */
+#define __itt_get_timestamp_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} timestamps */
+/** @endcond */
+
+/** @cond exclude_from_gpa_documentation */
+
+/**
+ * @defgroup regions Regions
+ * @ingroup public
+ * Regions group
+ * @{
+ */
+/**
+ * @ingroup regions
+ * @brief Begin of region instance.
+ * Successive calls to __itt_region_begin with the same ID are ignored
+ * until a call to __itt_region_end with the same ID
+ * @param[in] domain The domain for this region instance
+ * @param[in] id The instance ID for this region instance. Must not be __itt_null
+ * @param[in] parentid The instance ID for the parent of this region instance, or __itt_null
+ * @param[in] name The name of this region
+ */
+void ITTAPI __itt_region_begin(const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name);
+
+/**
+ * @ingroup regions
+ * @brief End of region instance.
+ * The first call to __itt_region_end with a given ID ends the
+ * region. Successive calls with the same ID are ignored, as are
+ * calls that do not have a matching __itt_region_begin call.
+ * @param[in] domain The domain for this region instance
+ * @param[in] id The instance ID for this region instance
+ */
+void ITTAPI __itt_region_end(const __itt_domain *domain, __itt_id id);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, region_begin, (const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, region_end,   (const __itt_domain *domain, __itt_id id))
+#define __itt_region_begin(d,x,y,z) ITTNOTIFY_VOID_D3(region_begin,d,x,y,z)
+#define __itt_region_begin_ptr      ITTNOTIFY_NAME(region_begin)
+#define __itt_region_end(d,x)       ITTNOTIFY_VOID_D1(region_end,d,x)
+#define __itt_region_end_ptr        ITTNOTIFY_NAME(region_end)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_region_begin(d,x,y,z)
+#define __itt_region_begin_ptr 0
+#define __itt_region_end(d,x)
+#define __itt_region_end_ptr   0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_region_begin_ptr 0
+#define __itt_region_end_ptr   0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} regions group */
+
+/**
+ * @defgroup frames Frames
+ * @ingroup public
+ * Frames are similar to regions, but are intended to be easier to use and to implement.
+ * In particular:
+ * - Frames always represent periods of elapsed time
+ * - By default, frames have no nesting relationships
+ * @{
+ */
+
+/**
+ * @ingroup frames
+ * @brief Begin a frame instance.
+ * Successive calls to __itt_frame_begin with the
+ * same ID are ignored until a call to __itt_frame_end with the same ID.
+ * @param[in] domain The domain for this frame instance
+ * @param[in] id The instance ID for this frame instance or NULL
+ */
+void ITTAPI __itt_frame_begin_v3(const __itt_domain *domain, __itt_id *id);
+
+/**
+ * @ingroup frames
+ * @brief End a frame instance.
+ * The first call to __itt_frame_end with a given ID
+ * ends the frame. Successive calls with the same ID are ignored, as are
+ * calls that do not have a matching __itt_frame_begin call.
+ * @param[in] domain The domain for this frame instance
+ * @param[in] id The instance ID for this frame instance or NULL for current
+ */
+void ITTAPI __itt_frame_end_v3(const __itt_domain *domain, __itt_id *id);
+
+/**
+ * @ingroup frames
+ * @brief Submits a frame instance.
+ * Successive calls to __itt_frame_begin or __itt_frame_submit with the
+ * same ID are ignored until a call to __itt_frame_end or __itt_frame_submit
+ * with the same ID.
+ * Passing special __itt_timestamp_none value as "end" argument means
+ * take the current timestamp as the end timestamp.
+ * @param[in] domain The domain for this frame instance
+ * @param[in] id The instance ID for this frame instance or NULL
+ * @param[in] begin Timestamp of the beginning of the frame
+ * @param[in] end Timestamp of the end of the frame
+ */
+void ITTAPI __itt_frame_submit_v3(const __itt_domain *domain, __itt_id *id,
+    __itt_timestamp begin, __itt_timestamp end);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, frame_begin_v3,  (const __itt_domain *domain, __itt_id *id))
+ITT_STUBV(ITTAPI, void, frame_end_v3,    (const __itt_domain *domain, __itt_id *id))
+ITT_STUBV(ITTAPI, void, frame_submit_v3, (const __itt_domain *domain, __itt_id *id, __itt_timestamp begin, __itt_timestamp end))
+#define __itt_frame_begin_v3(d,x)      ITTNOTIFY_VOID_D1(frame_begin_v3,d,x)
+#define __itt_frame_begin_v3_ptr       ITTNOTIFY_NAME(frame_begin_v3)
+#define __itt_frame_end_v3(d,x)        ITTNOTIFY_VOID_D1(frame_end_v3,d,x)
+#define __itt_frame_end_v3_ptr         ITTNOTIFY_NAME(frame_end_v3)
+#define __itt_frame_submit_v3(d,x,b,e) ITTNOTIFY_VOID_D3(frame_submit_v3,d,x,b,e)
+#define __itt_frame_submit_v3_ptr      ITTNOTIFY_NAME(frame_submit_v3)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_frame_begin_v3(domain,id)
+#define __itt_frame_begin_v3_ptr 0
+#define __itt_frame_end_v3(domain,id)
+#define __itt_frame_end_v3_ptr   0
+#define __itt_frame_submit_v3(domain,id,begin,end)
+#define __itt_frame_submit_v3_ptr   0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_frame_begin_v3_ptr 0
+#define __itt_frame_end_v3_ptr   0
+#define __itt_frame_submit_v3_ptr   0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} frames group */
+/** @endcond */
+
+/**
+ * @defgroup taskgroup Task Group
+ * @ingroup public
+ * Task Group
+ * @{
+ */
+/**
+ * @ingroup taskgroup
+ * @brief Denotes a task_group instance.
+ * Successive calls to __itt_task_group with the same ID are ignored.
+ * @param[in] domain The domain for this task_group instance
+ * @param[in] id The instance ID for this task_group instance. Must not be __itt_null.
+ * @param[in] parentid The instance ID for the parent of this task_group instance, or __itt_null.
+ * @param[in] name The name of this task_group
+ */
+void ITTAPI __itt_task_group(const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, task_group, (const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name))
+#define __itt_task_group(d,x,y,z) ITTNOTIFY_VOID_D3(task_group,d,x,y,z)
+#define __itt_task_group_ptr      ITTNOTIFY_NAME(task_group)
+#else  /* INTEL_NO_ITTNOTIFY_API: the call expands to nothing and the _ptr macro to 0 */
+#define __itt_task_group(d,x,y,z)
+#define __itt_task_group_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY: only the _ptr macro is defined (as 0) */
+#define __itt_task_group_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} taskgroup group */
+
+/**
+ * @defgroup tasks Tasks
+ * @ingroup public
+ * A task instance represents a piece of work performed by a particular
+ * thread for a period of time. A call to __itt_task_begin creates a
+ * task instance. This becomes the current instance for that task on that
+ * thread. A following call to __itt_task_end on the same thread ends the
+ * instance. There may be multiple simultaneous instances of tasks with the
+ * same name on different threads. If an ID is specified, the task instance
+ * receives that ID. Nested tasks are allowed.
+ *
+ * Note: The task is defined by the bracketing of __itt_task_begin and
+ * __itt_task_end on the same thread. If some scheduling mechanism causes
+ * task switching (the thread executes a different user task) or task
+ * migration (the user task switches to a different thread) then this breaks
+ * the notion of the current instance. Additional API calls are required to
+ * deal with that possibility.
+ * @{
+ */
+
+/**
+ * @ingroup tasks
+ * @brief Begin a task instance.
+ * @param[in] domain The domain for this task
+ * @param[in] taskid The instance ID for this task instance, or __itt_null
+ * @param[in] parentid The parent instance to which this task instance belongs, or __itt_null
+ * @param[in] name The name of this task
+ */
+void ITTAPI __itt_task_begin(const __itt_domain *domain, __itt_id taskid, __itt_id parentid, __itt_string_handle *name);
+
+/**
+ * @ingroup tasks
+ * @brief Begin a task instance.
+ * @param[in] domain The domain for this task
+ * @param[in] taskid The identifier for this task instance (may be 0)
+ * @param[in] parentid The parent of this task (may be 0)
+ * @param[in] fn The pointer to the function you are tracing
+ */
+void ITTAPI __itt_task_begin_fn(const __itt_domain *domain, __itt_id taskid, __itt_id parentid, void* fn);
+
+/**
+ * @ingroup tasks
+ * @brief End the current task instance.
+ * @param[in] domain The domain for this task
+ */
+void ITTAPI __itt_task_end(const __itt_domain *domain);
+
+/**
+ * @ingroup tasks
+ * @brief Begin an overlapped task instance.
+ * @param[in] domain The domain for this task.
+ * @param[in] taskid The identifier for this task instance, *cannot* be __itt_null.
+ * @param[in] parentid The parent of this task, or __itt_null.
+ * @param[in] name The name of this task.
+ */
+void ITTAPI __itt_task_begin_overlapped(const __itt_domain* domain, __itt_id taskid, __itt_id parentid, __itt_string_handle* name);
+
+/**
+ * @ingroup tasks
+ * @brief End an overlapped task instance.
+ * @param[in] domain The domain for this task
+ * @param[in] taskid Explicit ID of finished task
+ */
+void ITTAPI __itt_task_end_overlapped(const __itt_domain *domain, __itt_id taskid);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, task_begin,    (const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, task_begin_fn, (const __itt_domain *domain, __itt_id id, __itt_id parentid, void* fn))
+ITT_STUBV(ITTAPI, void, task_end,      (const __itt_domain *domain))
+ITT_STUBV(ITTAPI, void, task_begin_overlapped, (const __itt_domain *domain, __itt_id taskid, __itt_id parentid, __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, task_end_overlapped,   (const __itt_domain *domain, __itt_id taskid))
+#define __itt_task_begin(d,x,y,z)    ITTNOTIFY_VOID_D3(task_begin,d,x,y,z)
+#define __itt_task_begin_ptr         ITTNOTIFY_NAME(task_begin)
+#define __itt_task_begin_fn(d,x,y,z) ITTNOTIFY_VOID_D3(task_begin_fn,d,x,y,z)
+#define __itt_task_begin_fn_ptr      ITTNOTIFY_NAME(task_begin_fn)
+#define __itt_task_end(d)            ITTNOTIFY_VOID_D0(task_end,d)
+#define __itt_task_end_ptr           ITTNOTIFY_NAME(task_end)
+#define __itt_task_begin_overlapped(d,x,y,z) ITTNOTIFY_VOID_D3(task_begin_overlapped,d,x,y,z)
+#define __itt_task_begin_overlapped_ptr      ITTNOTIFY_NAME(task_begin_overlapped)
+#define __itt_task_end_overlapped(d,x)       ITTNOTIFY_VOID_D1(task_end_overlapped,d,x)
+#define __itt_task_end_overlapped_ptr        ITTNOTIFY_NAME(task_end_overlapped)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_task_begin(domain,id,parentid,name)
+#define __itt_task_begin_ptr    0
+#define __itt_task_begin_fn(domain,id,parentid,fn)
+#define __itt_task_begin_fn_ptr 0
+#define __itt_task_end(domain)
+#define __itt_task_end_ptr      0
+#define __itt_task_begin_overlapped(domain,taskid,parentid,name)
+#define __itt_task_begin_overlapped_ptr         0
+#define __itt_task_end_overlapped(domain,taskid)
+#define __itt_task_end_overlapped_ptr           0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_task_begin_ptr    0
+#define __itt_task_begin_fn_ptr 0
+#define __itt_task_end_ptr      0
+#define __itt_task_begin_overlapped_ptr 0
+#define __itt_task_end_overlapped_ptr   0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} tasks group */
+
+
+/**
+ * @defgroup markers Markers
+ * Markers represent a single discrete event in time. Markers have a scope,
+ * described by an enumerated type __itt_scope. Markers are created by
+ * the API call __itt_marker. A marker instance can be given an ID for use in
+ * adding metadata.
+ * @{
+ */
+
+/**
+ * @brief Describes the scope of an event object in the trace.
+ */
+typedef enum
+{
+    __itt_scope_unknown = 0, /**< Aliased below as __itt_marker_scope_unknown */
+    __itt_scope_global,      /**< Aliased below as __itt_marker_scope_global */
+    __itt_scope_track_group, /**< Aliased below as __itt_marker_scope_process */
+    __itt_scope_track,       /**< Aliased below as __itt_marker_scope_thread */
+    __itt_scope_task,        /**< Aliased below as __itt_marker_scope_task */
+    __itt_scope_marker       /**< Has no __itt_marker_scope_* alias */
+} __itt_scope;
+
+/** @cond exclude_from_documentation */
+#define __itt_marker_scope_unknown  __itt_scope_unknown
+#define __itt_marker_scope_global   __itt_scope_global
+#define __itt_marker_scope_process  __itt_scope_track_group
+#define __itt_marker_scope_thread   __itt_scope_track
+#define __itt_marker_scope_task     __itt_scope_task
+/** @endcond */
+
+/**
+ * @ingroup markers
+ * @brief Create a marker instance
+ * @param[in] domain The domain for this marker
+ * @param[in] id The instance ID for this marker or __itt_null
+ * @param[in] name The name for this marker
+ * @param[in] scope The scope for this marker
+ */
+void ITTAPI __itt_marker(const __itt_domain *domain, __itt_id id, __itt_string_handle *name, __itt_scope scope);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, marker, (const __itt_domain *domain, __itt_id id, __itt_string_handle *name, __itt_scope scope))
+#define __itt_marker(d,x,y,z) ITTNOTIFY_VOID_D3(marker,d,x,y,z)
+#define __itt_marker_ptr      ITTNOTIFY_NAME(marker)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_marker(domain,id,name,scope)
+#define __itt_marker_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_marker_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} markers group */
+
+/**
+ * @defgroup metadata Metadata
+ * The metadata API is used to attach extra information to named
+ * entities. Metadata can be attached to an identified named entity by ID,
+ * or to the current entity (which is always a task).
+ *
+ * Conceptually metadata has a type (what kind of metadata), a key (the
+ * name of the metadata), and a value (the actual data). The encoding of
+ * the value depends on the type of the metadata.
+ *
+ * The type of metadata is specified by an enumerated type __itt_metadata_type.
+ * @{
+ */
+
+/**
+ * @ingroup parameters
+ * @brief Describes the type of metadata; passed as the type argument of __itt_metadata_add.
+ */
+typedef enum {
+    __itt_metadata_unknown = 0,
+    __itt_metadata_u64,     /**< Unsigned 64-bit integer */
+    __itt_metadata_s64,     /**< Signed 64-bit integer */
+    __itt_metadata_u32,     /**< Unsigned 32-bit integer */
+    __itt_metadata_s32,     /**< Signed 32-bit integer */
+    __itt_metadata_u16,     /**< Unsigned 16-bit integer */
+    __itt_metadata_s16,     /**< Signed 16-bit integer */
+    __itt_metadata_float,   /**< 32-bit floating-point */
+    __itt_metadata_double   /**< 64-bit floating-point */
+} __itt_metadata_type;
+
+/**
+ * @ingroup parameters
+ * @brief Add metadata to an instance of a named entity.
+ * @param[in] domain The domain controlling the call
+ * @param[in] id The identifier of the instance to which the metadata is to be added, or __itt_null to add to the current task
+ * @param[in] key The name of the metadata
+ * @param[in] type The type of the metadata (an __itt_metadata_type enumerator)
+ * @param[in] count The number of elements of the given type. If count == 0, no metadata will be added.
+ * @param[in] data The metadata itself
+*/
+void ITTAPI __itt_metadata_add(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, metadata_add, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data))
+#define __itt_metadata_add(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(metadata_add,d,x,y,z,a,b)
+#define __itt_metadata_add_ptr          ITTNOTIFY_NAME(metadata_add)
+#else  /* INTEL_NO_ITTNOTIFY_API: the call macro expands to nothing */
+#define __itt_metadata_add(d,x,y,z,a,b)
+#define __itt_metadata_add_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY: no call macro, only the _ptr name (defined as 0) */
+#define __itt_metadata_add_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup parameters
+ * @brief Add string metadata to an instance of a named entity.
+ * @param[in] domain The domain controlling the call
+ * @param[in] id The identifier of the instance to which the metadata is to be added, or __itt_null to add to the current task
+ * @param[in] key The name of the metadata
+ * @param[in] data The metadata itself (a char string; a wchar_t string for the W variant)
+ * @param[in] length The number of characters in the string, or -1 (i.e. (size_t)-1, since length is a size_t) if the length is unknown but the string is null-terminated
+*/
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_metadata_str_addA(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length);
+void ITTAPI __itt_metadata_str_addW(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const wchar_t *data, size_t length);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_metadata_str_add     __itt_metadata_str_addW
+#  define __itt_metadata_str_add_ptr __itt_metadata_str_addW_ptr
+#else /* UNICODE */
+#  define __itt_metadata_str_add     __itt_metadata_str_addA
+#  define __itt_metadata_str_add_ptr __itt_metadata_str_addA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+void ITTAPI __itt_metadata_str_add(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, metadata_str_addA, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length))
+ITT_STUBV(ITTAPI, void, metadata_str_addW, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const wchar_t *data, size_t length))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, metadata_str_add, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_metadata_str_addA(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_addA,d,x,y,z,a)
+#define __itt_metadata_str_addA_ptr        ITTNOTIFY_NAME(metadata_str_addA)
+#define __itt_metadata_str_addW(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_addW,d,x,y,z,a)
+#define __itt_metadata_str_addW_ptr        ITTNOTIFY_NAME(metadata_str_addW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_metadata_str_add(d,x,y,z,a)  ITTNOTIFY_VOID_D4(metadata_str_add,d,x,y,z,a)
+#define __itt_metadata_str_add_ptr         ITTNOTIFY_NAME(metadata_str_add)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API: the call macros expand to nothing */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_metadata_str_addA(d,x,y,z,a)
+#define __itt_metadata_str_addA_ptr 0
+#define __itt_metadata_str_addW(d,x,y,z,a)
+#define __itt_metadata_str_addW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_metadata_str_add(d,x,y,z,a)
+#define __itt_metadata_str_add_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY: no call macros, only the _ptr names (defined as 0) */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_metadata_str_addA_ptr 0
+#define __itt_metadata_str_addW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_metadata_str_add_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup parameters
+ * @brief Add metadata to an instance of a named entity, selecting the instance by scope.
+ * @param[in] domain The domain controlling the call
+ * @param[in] scope The scope of the instance to which the metadata is to be added
+ *
+ * @note Unlike __itt_metadata_add, this function has no instance @c id parameter.
+ *
+ * @param[in] key The name of the metadata
+ * @param[in] type The type of the metadata (an __itt_metadata_type enumerator)
+ * @param[in] count The number of elements of the given type. If count == 0, no metadata will be added.
+ * @param[in] data The metadata itself
+*/
+void ITTAPI __itt_metadata_add_with_scope(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, metadata_add_with_scope, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data))
+#define __itt_metadata_add_with_scope(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(metadata_add_with_scope,d,x,y,z,a,b)
+#define __itt_metadata_add_with_scope_ptr          ITTNOTIFY_NAME(metadata_add_with_scope)
+#else  /* INTEL_NO_ITTNOTIFY_API: the call macro expands to nothing */
+#define __itt_metadata_add_with_scope(d,x,y,z,a,b)
+#define __itt_metadata_add_with_scope_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY: no call macro, only the _ptr name (defined as 0) */
+#define __itt_metadata_add_with_scope_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup parameters
+ * @brief Add string metadata to an instance of a named entity, selecting the instance by scope.
+ * @param[in] domain The domain controlling the call
+ * @param[in] scope The scope of the instance to which the metadata is to be added
+ *
+ * @note Unlike __itt_metadata_str_add, this function has no instance @c id parameter.
+ *
+ * @param[in] key The name of the metadata
+ * @param[in] data The metadata itself (a char string; a wchar_t string for the W variant)
+ * @param[in] length The number of characters in the string, or -1 (i.e. (size_t)-1, since length is a size_t) if the length is unknown but the string is null-terminated
+*/
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_metadata_str_add_with_scopeA(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length);
+void ITTAPI __itt_metadata_str_add_with_scopeW(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const wchar_t *data, size_t length);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_metadata_str_add_with_scope     __itt_metadata_str_add_with_scopeW
+#  define __itt_metadata_str_add_with_scope_ptr __itt_metadata_str_add_with_scopeW_ptr
+#else /* UNICODE */
+#  define __itt_metadata_str_add_with_scope     __itt_metadata_str_add_with_scopeA
+#  define __itt_metadata_str_add_with_scope_ptr __itt_metadata_str_add_with_scopeA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+void ITTAPI __itt_metadata_str_add_with_scope(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeA, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length))
+ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeW, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const wchar_t *data, size_t length))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, metadata_str_add_with_scope, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_metadata_str_add_with_scopeA(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_add_with_scopeA,d,x,y,z,a)
+#define __itt_metadata_str_add_with_scopeA_ptr        ITTNOTIFY_NAME(metadata_str_add_with_scopeA)
+#define __itt_metadata_str_add_with_scopeW(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_add_with_scopeW,d,x,y,z,a)
+#define __itt_metadata_str_add_with_scopeW_ptr        ITTNOTIFY_NAME(metadata_str_add_with_scopeW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_metadata_str_add_with_scope(d,x,y,z,a)  ITTNOTIFY_VOID_D4(metadata_str_add_with_scope,d,x,y,z,a)
+#define __itt_metadata_str_add_with_scope_ptr         ITTNOTIFY_NAME(metadata_str_add_with_scope)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API: the call macros expand to nothing */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_metadata_str_add_with_scopeA(d,x,y,z,a)
+#define __itt_metadata_str_add_with_scopeA_ptr  0
+#define __itt_metadata_str_add_with_scopeW(d,x,y,z,a)
+#define __itt_metadata_str_add_with_scopeW_ptr  0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_metadata_str_add_with_scope(d,x,y,z,a)
+#define __itt_metadata_str_add_with_scope_ptr   0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY: no call macros, only the _ptr names (defined as 0) */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_metadata_str_add_with_scopeA_ptr  0
+#define __itt_metadata_str_add_with_scopeW_ptr  0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_metadata_str_add_with_scope_ptr   0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @} metadata group */
+
+/**
+ * @defgroup relations Relations
+ * Instances of named entities can be explicitly associated with other
+ * instances using instance IDs and the relationship API calls.
+ *
+ * @{
+ */
+
+/**
+ * @ingroup relations
+ * @brief The kind of relation between two instances is specified by the enumerated type __itt_relation.
+ * Relations between instances are added with __itt_relation_add() or
+ * __itt_relation_add_to_current(), which take instance IDs. Relations can be
+ * added before or after the actual instances are created and persist
+ * independently of the instances. This is the motivation for having
+ * different lifetimes for instance IDs and the actual instances.
+ */
+typedef enum
+{
+    __itt_relation_is_unknown = 0,
+    __itt_relation_is_dependent_on,         /**< "A is dependent on B" means that A cannot start until B completes */
+    __itt_relation_is_sibling_of,           /**< "A is sibling of B" means that A and B were created as a group */
+    __itt_relation_is_parent_of,            /**< "A is parent of B" means that A created B */
+    __itt_relation_is_continuation_of,      /**< "A is continuation of B" means that A assumes the dependencies of B */
+    __itt_relation_is_child_of,             /**< "A is child of B" means that A was created by B (inverse of is_parent_of) */
+    __itt_relation_is_continued_by,         /**< "A is continued by B" means that B assumes the dependencies of A (inverse of is_continuation_of) */
+    __itt_relation_is_predecessor_to        /**< "A is predecessor to B" means that B cannot start until A completes (inverse of is_dependent_on) */
+} __itt_relation;
+
+/**
+ * @ingroup relations
+ * @brief Add a relation to the current task instance.
+ * The current task instance is the head of the relation.
+ * @param[in] domain The domain controlling this call
+ * @param[in] relation The kind of relation (an __itt_relation enumerator)
+ * @param[in] tail The ID for the tail of the relation
+ */
+void ITTAPI __itt_relation_add_to_current(const __itt_domain *domain, __itt_relation relation, __itt_id tail);
+
+/**
+ * @ingroup relations
+ * @brief Add a relation between two instance identifiers.
+ * @param[in] domain The domain controlling this call
+ * @param[in] head The ID for the head of the relation
+ * @param[in] relation The kind of relation (an __itt_relation enumerator)
+ * @param[in] tail The ID for the tail of the relation
+ */
+void ITTAPI __itt_relation_add(const __itt_domain *domain, __itt_id head, __itt_relation relation, __itt_id tail);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, relation_add_to_current, (const __itt_domain *domain, __itt_relation relation, __itt_id tail))
+ITT_STUBV(ITTAPI, void, relation_add,            (const __itt_domain *domain, __itt_id head, __itt_relation relation, __itt_id tail))
+#define __itt_relation_add_to_current(d,x,y) ITTNOTIFY_VOID_D2(relation_add_to_current,d,x,y)
+#define __itt_relation_add_to_current_ptr    ITTNOTIFY_NAME(relation_add_to_current)
+#define __itt_relation_add(d,x,y,z)          ITTNOTIFY_VOID_D3(relation_add,d,x,y,z)
+#define __itt_relation_add_ptr               ITTNOTIFY_NAME(relation_add)
+#else  /* INTEL_NO_ITTNOTIFY_API: the call macros expand to nothing */
+#define __itt_relation_add_to_current(d,x,y)
+#define __itt_relation_add_to_current_ptr 0
+#define __itt_relation_add(d,x,y,z)
+#define __itt_relation_add_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY: no call macros, only the _ptr names (defined as 0) */
+#define __itt_relation_add_to_current_ptr 0
+#define __itt_relation_add_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} relations group */
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+/* Clock parameters passed to a __itt_get_clock_info_fn callback (presumably filled in by it; confirm). */
+typedef struct ___itt_clock_info
+{
+    unsigned long long clock_freq; /*!< Clock domain frequency */
+    unsigned long long clock_base; /*!< Clock domain base timestamp */
+} __itt_clock_info;
+
+#pragma pack(pop)
+/** @endcond */
+
+/** @cond exclude_from_documentation */
+typedef void (ITTAPI *__itt_get_clock_info_fn)(__itt_clock_info* clock_info, void* data);
+/** @endcond */
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+/* Clock domain record: the most recent clock info plus the callback (and its argument) associated with it. */
+typedef struct ___itt_clock_domain
+{
+    __itt_clock_info info;      /*!< Most recent clock domain info */
+    __itt_get_clock_info_fn fn; /*!< Callback function pointer */
+    void* fn_data;              /*!< Input argument for the callback function */
+    int   extra1;               /*!< Reserved. Must be zero */
+    void* extra2;               /*!< Reserved. Must be zero */
+    struct ___itt_clock_domain* next; /*!< Another clock domain record; presumably the next entry of a linked list (confirm) */
+} __itt_clock_domain;
+
+#pragma pack(pop)
+/** @endcond */
+
+/**
+ * @ingroup clockdomains
+ * @brief Create a clock domain.
+ * Certain applications require the capability to trace their application using a clock domain
+ * different than the CPU, for instance the instrumentation of events that occur on a GPU.
+ * Because the set of domains is expected to be static over the application's execution time,
+ * there is no mechanism to destroy a domain.
+ * Any domain can be accessed by any thread in the process, regardless of which thread created
+ * the domain. This call is thread-safe.
+ * @param[in] fn A pointer to a callback function which retrieves alternative CPU timestamps
+ * @param[in] fn_data Argument for a callback function; may be NULL
+ * @return Pointer to the created clock domain; the INTEL_NO_ITTNOTIFY_API stub macro yields a null pointer
+ */
+__itt_clock_domain* ITTAPI __itt_clock_domain_create(__itt_get_clock_info_fn fn, void* fn_data);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_clock_domain*, clock_domain_create, (__itt_get_clock_info_fn fn, void* fn_data))
+#define __itt_clock_domain_create     ITTNOTIFY_DATA(clock_domain_create)
+#define __itt_clock_domain_create_ptr ITTNOTIFY_NAME(clock_domain_create)
+#else  /* INTEL_NO_ITTNOTIFY_API: the call evaluates to a null clock-domain pointer */
+#define __itt_clock_domain_create(fn,fn_data) (__itt_clock_domain*)0
+#define __itt_clock_domain_create_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY: no call macro, only the _ptr name (defined as 0) */
+#define __itt_clock_domain_create_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup clockdomains
+ * @brief Recalculate clock domain frequencies and clock base timestamps.
+ */
+void ITTAPI __itt_clock_domain_reset(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, clock_domain_reset, (void))
+#define __itt_clock_domain_reset     ITTNOTIFY_VOID(clock_domain_reset)
+#define __itt_clock_domain_reset_ptr ITTNOTIFY_NAME(clock_domain_reset)
+#else  /* INTEL_NO_ITTNOTIFY_API: the call macro expands to nothing */
+#define __itt_clock_domain_reset()
+#define __itt_clock_domain_reset_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY: no call macro, only the _ptr name (defined as 0) */
+#define __itt_clock_domain_reset_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup clockdomain
+ * @brief Create an instance of an identifier. This establishes the beginning of the lifetime of
+ * an instance of the given ID in the trace. Once this lifetime starts, the ID can be used to
+ * tag named entity instances in calls such as __itt_task_begin, and to specify relationships among
+ * identified named entity instances, using the \ref relations APIs.
+ * @param[in] domain The domain controlling the execution of this call.
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] id The ID to create.
+ */
+void ITTAPI __itt_id_create_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id);
+
+/**
+ * @ingroup clockdomain
+ * @brief Destroy an instance of an identifier. This ends the lifetime of the current instance of the
+ * given ID value in the trace. Any relationships that are established after this lifetime ends are
+ * invalid. This call must be performed before the given ID value can be reused for a different
+ * named entity instance.
+ * @param[in] domain The domain controlling the execution of this call.
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] id The ID to destroy.
+ */
+void ITTAPI __itt_id_destroy_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, id_create_ex,  (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id))
+ITT_STUBV(ITTAPI, void, id_destroy_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id))
+#define __itt_id_create_ex(d,x,y,z)  ITTNOTIFY_VOID_D3(id_create_ex,d,x,y,z)
+#define __itt_id_create_ex_ptr       ITTNOTIFY_NAME(id_create_ex)
+#define __itt_id_destroy_ex(d,x,y,z) ITTNOTIFY_VOID_D3(id_destroy_ex,d,x,y,z)
+#define __itt_id_destroy_ex_ptr      ITTNOTIFY_NAME(id_destroy_ex)
+#else  /* INTEL_NO_ITTNOTIFY_API: the call macros expand to nothing */
+#define __itt_id_create_ex(domain,clock_domain,timestamp,id)
+#define __itt_id_create_ex_ptr    0
+#define __itt_id_destroy_ex(domain,clock_domain,timestamp,id)
+#define __itt_id_destroy_ex_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY: no call macros, only the _ptr names (defined as 0) */
+#define __itt_id_create_ex_ptr    0
+#define __itt_id_destroy_ex_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup clockdomain
+ * @brief Begin a task instance, identified by name, at a user defined timestamp.
+ * @param[in] domain The domain for this task
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] taskid The instance ID for this task instance, or __itt_null
+ * @param[in] parentid The parent instance to which this task instance belongs, or __itt_null
+ * @param[in] name The name of this task
+ */
+void ITTAPI __itt_task_begin_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, __itt_string_handle* name);
+
+/**
+ * @ingroup clockdomain
+ * @brief Begin a task instance, identified by a function pointer instead of a name, at a user defined timestamp.
+ * @param[in] domain The domain for this task
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] taskid The identifier for this task instance, or __itt_null
+ * @param[in] parentid The parent of this task, or __itt_null
+ * @param[in] fn The pointer to the function you are tracing
+ */
+void ITTAPI __itt_task_begin_fn_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, void* fn);
+
+/**
+ * @ingroup clockdomain
+ * @brief End the current task instance.
+ * @param[in] domain The domain for this task
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ */
+void ITTAPI __itt_task_end_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, task_begin_ex,        (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, task_begin_fn_ex,     (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, void* fn))
+ITT_STUBV(ITTAPI, void, task_end_ex,          (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp))
+#define __itt_task_begin_ex(d,x,y,z,a,b)      ITTNOTIFY_VOID_D5(task_begin_ex,d,x,y,z,a,b)
+#define __itt_task_begin_ex_ptr               ITTNOTIFY_NAME(task_begin_ex)
+#define __itt_task_begin_fn_ex(d,x,y,z,a,b)   ITTNOTIFY_VOID_D5(task_begin_fn_ex,d,x,y,z,a,b)
+#define __itt_task_begin_fn_ex_ptr            ITTNOTIFY_NAME(task_begin_fn_ex)
+#define __itt_task_end_ex(d,x,y)              ITTNOTIFY_VOID_D2(task_end_ex,d,x,y)
+#define __itt_task_end_ex_ptr                 ITTNOTIFY_NAME(task_end_ex)
+#else  /* INTEL_NO_ITTNOTIFY_API: the call macros expand to nothing */
+#define __itt_task_begin_ex(domain,clock_domain,timestamp,id,parentid,name)
+#define __itt_task_begin_ex_ptr          0
+#define __itt_task_begin_fn_ex(domain,clock_domain,timestamp,id,parentid,fn)
+#define __itt_task_begin_fn_ex_ptr       0
+#define __itt_task_end_ex(domain,clock_domain,timestamp)
+#define __itt_task_end_ex_ptr            0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY: no call macros, only the _ptr names (defined as 0) */
+#define __itt_task_begin_ex_ptr          0
+#define __itt_task_begin_fn_ex_ptr       0
+#define __itt_task_end_ex_ptr            0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @defgroup counters Counters
+ * @ingroup public
+ * Counters are user-defined objects with a monotonically increasing
+ * value. Counter values are 64-bit unsigned integers.
+ * Counters have names that can be displayed in
+ * the tools.
+ * @{
+ */
+
+/**
+ * @brief Opaque handle type used to identify a counter (a pointer to struct ___itt_counter)
+ */
+/** @cond exclude_from_documentation */
+
+typedef struct ___itt_counter* __itt_counter;
+
+/**
+ * @brief Create an unsigned 64-bit integer counter with given name/domain
+ *
+ * After __itt_counter_create() is called, __itt_counter_inc(id), __itt_counter_inc_delta(id, delta),
+ * __itt_counter_set_value(id, value_ptr) or __itt_counter_set_value_ex(id, clock_domain, timestamp, value_ptr)
+ * can be used to change the value of the counter, where value_ptr is a pointer to an unsigned 64-bit integer
+ * The call is equal to __itt_counter_create_typed(name, domain, __itt_metadata_u64)
+ * @return The counter handle; the INTEL_NO_ITTNOTIFY_API stub macros yield (__itt_counter)0
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_counter ITTAPI __itt_counter_createA(const char    *name, const char    *domain);
+__itt_counter ITTAPI __itt_counter_createW(const wchar_t *name, const wchar_t *domain);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_counter_create     __itt_counter_createW
+#  define __itt_counter_create_ptr __itt_counter_createW_ptr
+#else /* UNICODE */
+#  define __itt_counter_create     __itt_counter_createA
+#  define __itt_counter_create_ptr __itt_counter_createA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_counter ITTAPI __itt_counter_create(const char *name, const char *domain);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_counter, counter_createA, (const char    *name, const char    *domain))
+ITT_STUB(ITTAPI, __itt_counter, counter_createW, (const wchar_t *name, const wchar_t *domain))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_counter, counter_create,  (const char *name, const char *domain))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_counter_createA     ITTNOTIFY_DATA(counter_createA)
+#define __itt_counter_createA_ptr ITTNOTIFY_NAME(counter_createA)
+#define __itt_counter_createW     ITTNOTIFY_DATA(counter_createW)
+#define __itt_counter_createW_ptr ITTNOTIFY_NAME(counter_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_counter_create     ITTNOTIFY_DATA(counter_create)
+#define __itt_counter_create_ptr ITTNOTIFY_NAME(counter_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API: calls evaluate to a null counter handle so they remain valid in expressions and assignments */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_counter_createA(name, domain) (__itt_counter)0
+#define __itt_counter_createA_ptr 0
+#define __itt_counter_createW(name, domain) (__itt_counter)0
+#define __itt_counter_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_counter_create(name, domain) (__itt_counter)0
+#define __itt_counter_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY: no call macros, only the _ptr names (defined as 0) */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_counter_createA_ptr 0
+#define __itt_counter_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_counter_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Increment the unsigned 64-bit integer counter value
+ *
+ * Calling this function on a counter that is not an unsigned 64-bit integer counter has no effect
+ */
+void ITTAPI __itt_counter_inc(__itt_counter id);
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, counter_inc, (__itt_counter id))
+#define __itt_counter_inc     ITTNOTIFY_VOID(counter_inc)
+#define __itt_counter_inc_ptr ITTNOTIFY_NAME(counter_inc)
+#else  /* INTEL_NO_ITTNOTIFY_API: the call macro expands to nothing */
+#define __itt_counter_inc(id)
+#define __itt_counter_inc_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY: no call macro, only the _ptr name (defined as 0) */
+#define __itt_counter_inc_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/**
+ * @brief Increment the unsigned 64-bit integer counter value by the given value
+ *
+ * Calling this function on a counter that is not an unsigned 64-bit integer counter has no effect
+ */
+void ITTAPI __itt_counter_inc_delta(__itt_counter id, unsigned long long value);
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, counter_inc_delta, (__itt_counter id, unsigned long long value))
+#define __itt_counter_inc_delta     ITTNOTIFY_VOID(counter_inc_delta)
+#define __itt_counter_inc_delta_ptr ITTNOTIFY_NAME(counter_inc_delta)
+#else  /* INTEL_NO_ITTNOTIFY_API: the call macro expands to nothing */
+#define __itt_counter_inc_delta(id, value)
+#define __itt_counter_inc_delta_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY: no call macro, only the _ptr name (defined as 0) */
+#define __itt_counter_inc_delta_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Decrement the unsigned 64-bit integer counter value
+ *
+ * Calling this function on a counter that is not an unsigned 64-bit integer counter has no effect
+ */
+void ITTAPI __itt_counter_dec(__itt_counter id);
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, counter_dec, (__itt_counter id))
+#define __itt_counter_dec     ITTNOTIFY_VOID(counter_dec)
+#define __itt_counter_dec_ptr ITTNOTIFY_NAME(counter_dec)
+#else  /* INTEL_NO_ITTNOTIFY_API: the call macro expands to nothing */
+#define __itt_counter_dec(id)
+#define __itt_counter_dec_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY: no call macro, only the _ptr name (defined as 0) */
+#define __itt_counter_dec_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/**
+ * @brief Decrement the unsigned 64-bit integer counter value by the given value
+ *
+ * Calling this function on a counter that is not an unsigned 64-bit integer counter has no effect
+ */
+void ITTAPI __itt_counter_dec_delta(__itt_counter id, unsigned long long value);
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, counter_dec_delta, (__itt_counter id, unsigned long long value))
+#define __itt_counter_dec_delta     ITTNOTIFY_VOID(counter_dec_delta)
+#define __itt_counter_dec_delta_ptr ITTNOTIFY_NAME(counter_dec_delta)
+#else  /* INTEL_NO_ITTNOTIFY_API: the call macro expands to nothing */
+#define __itt_counter_dec_delta(id, value)
+#define __itt_counter_dec_delta_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY: no call macro, only the _ptr name (defined as 0) */
+#define __itt_counter_dec_delta_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup counters
+ * @brief Increment a counter by one.
+ * The first call with a given name creates a counter by that name and sets its
+ * value to zero. Successive calls increment the counter value.
+ * @param[in] domain The domain controlling the call. Counter names are not domain specific.
+ *            The domain argument is used only to enable or disable the API calls.
+ * @param[in] name The name of the counter
+ */
+void ITTAPI __itt_counter_inc_v3(const __itt_domain *domain, __itt_string_handle *name);
+
+/**
+ * @ingroup counters
+ * @brief Increment a counter by the value specified in delta.
+ * @param[in] domain The domain controlling the call. Counter names are not domain specific.
+ *            The domain argument is used only to enable or disable the API calls.
+ * @param[in] name The name of the counter
+ * @param[in] delta The amount by which to increment the counter
+ */
+void ITTAPI __itt_counter_inc_delta_v3(const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta);
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, counter_inc_v3,       (const __itt_domain *domain, __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, counter_inc_delta_v3, (const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta))
+#define __itt_counter_inc_v3(d,x)         ITTNOTIFY_VOID_D1(counter_inc_v3,d,x)
+#define __itt_counter_inc_v3_ptr          ITTNOTIFY_NAME(counter_inc_v3)
+#define __itt_counter_inc_delta_v3(d,x,y) ITTNOTIFY_VOID_D2(counter_inc_delta_v3,d,x,y)
+#define __itt_counter_inc_delta_v3_ptr    ITTNOTIFY_NAME(counter_inc_delta_v3)
+#else  /* INTEL_NO_ITTNOTIFY_API: the call macros expand to nothing */
+#define __itt_counter_inc_v3(domain,name)
+#define __itt_counter_inc_v3_ptr       0
+#define __itt_counter_inc_delta_v3(domain,name,delta)
+#define __itt_counter_inc_delta_v3_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY: no call macros, only the _ptr names (defined as 0) */
+#define __itt_counter_inc_v3_ptr       0
+#define __itt_counter_inc_delta_v3_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+
+/**
+ * @ingroup counters
+ * @brief Decrement a counter by one.
+ * The first call with a given name creates a counter by that name and sets its
+ * value to zero. Successive calls decrement the counter value.
+ * @param[in] domain The domain controlling the call. Counter names are not domain specific.
+ *            The domain argument is used only to enable or disable the API calls.
+ * @param[in] name The name of the counter
+ */
+void ITTAPI __itt_counter_dec_v3(const __itt_domain *domain, __itt_string_handle *name);
+
+/**
+ * @ingroup counters
+ * @brief Decrement a counter by the value specified in delta.
+ * @param[in] domain The domain controlling the call. Counter names are not domain specific.
+ *            The domain argument is used only to enable or disable the API calls.
+ * @param[in] name The name of the counter
+ * @param[in] delta The amount by which to decrement the counter
+ */
+void ITTAPI __itt_counter_dec_delta_v3(const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta);
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, counter_dec_v3,       (const __itt_domain *domain, __itt_string_handle *name))
+ITT_STUBV(ITTAPI, void, counter_dec_delta_v3, (const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta))
+#define __itt_counter_dec_v3(d,x)         ITTNOTIFY_VOID_D1(counter_dec_v3,d,x)
+#define __itt_counter_dec_v3_ptr          ITTNOTIFY_NAME(counter_dec_v3)
+#define __itt_counter_dec_delta_v3(d,x,y) ITTNOTIFY_VOID_D2(counter_dec_delta_v3,d,x,y)
+#define __itt_counter_dec_delta_v3_ptr    ITTNOTIFY_NAME(counter_dec_delta_v3)
+#else  /* INTEL_NO_ITTNOTIFY_API: the call macros expand to nothing */
+#define __itt_counter_dec_v3(domain,name)
+#define __itt_counter_dec_v3_ptr       0
+#define __itt_counter_dec_delta_v3(domain,name,delta)
+#define __itt_counter_dec_delta_v3_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY: no call macros, only the _ptr names (defined as 0) */
+#define __itt_counter_dec_v3_ptr       0
+#define __itt_counter_dec_delta_v3_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @} counters group */
+
+
+/**
+ * @brief Set the value of the counter identified by @p id (presumably @p value_ptr points to the new value - confirm)
+ */
+void ITTAPI __itt_counter_set_value(__itt_counter id, void *value_ptr);
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, counter_set_value, (__itt_counter id, void *value_ptr))
+#define __itt_counter_set_value     ITTNOTIFY_VOID(counter_set_value)
+#define __itt_counter_set_value_ptr ITTNOTIFY_NAME(counter_set_value)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_counter_set_value(id, value_ptr)
+#define __itt_counter_set_value_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_counter_set_value_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Set the counter value, passing an explicit clock domain and user-defined timestamp along with the value
+ */
+void ITTAPI __itt_counter_set_value_ex(__itt_counter id, __itt_clock_domain *clock_domain, unsigned long long timestamp, void *value_ptr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, counter_set_value_ex, (__itt_counter id, __itt_clock_domain *clock_domain, unsigned long long timestamp, void *value_ptr))
+#define __itt_counter_set_value_ex     ITTNOTIFY_VOID(counter_set_value_ex)
+#define __itt_counter_set_value_ex_ptr ITTNOTIFY_NAME(counter_set_value_ex)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_counter_set_value_ex(id, clock_domain, timestamp, value_ptr)
+#define __itt_counter_set_value_ex_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_counter_set_value_ex_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Create a typed counter with given name/domain
+ *
+ * After __itt_counter_create_typed() is called, __itt_counter_inc(id), __itt_counter_inc_delta(id, delta),
+ * __itt_counter_set_value(id, value_ptr) or __itt_counter_set_value_ex(id, clock_domain, timestamp, value_ptr)
+ * can be used to change the value of the counter. With INTEL_NO_ITTNOTIFY_API defined the call expands to (__itt_counter)0.
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_counter ITTAPI __itt_counter_create_typedA(const char    *name, const char    *domain, __itt_metadata_type type);
+__itt_counter ITTAPI __itt_counter_create_typedW(const wchar_t *name, const wchar_t *domain, __itt_metadata_type type);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_counter_create_typed     __itt_counter_create_typedW
+#  define __itt_counter_create_typed_ptr __itt_counter_create_typedW_ptr
+#else /* UNICODE */
+#  define __itt_counter_create_typed     __itt_counter_create_typedA
+#  define __itt_counter_create_typed_ptr __itt_counter_create_typedA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_counter ITTAPI __itt_counter_create_typed(const char *name, const char *domain, __itt_metadata_type type);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_counter, counter_create_typedA, (const char    *name, const char    *domain, __itt_metadata_type type))
+ITT_STUB(ITTAPI, __itt_counter, counter_create_typedW, (const wchar_t *name, const wchar_t *domain, __itt_metadata_type type))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_counter, counter_create_typed,  (const char *name, const char *domain, __itt_metadata_type type))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_counter_create_typedA     ITTNOTIFY_DATA(counter_create_typedA)
+#define __itt_counter_create_typedA_ptr ITTNOTIFY_NAME(counter_create_typedA)
+#define __itt_counter_create_typedW     ITTNOTIFY_DATA(counter_create_typedW)
+#define __itt_counter_create_typedW_ptr ITTNOTIFY_NAME(counter_create_typedW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_counter_create_typed     ITTNOTIFY_DATA(counter_create_typed)
+#define __itt_counter_create_typed_ptr ITTNOTIFY_NAME(counter_create_typed)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_counter_create_typedA(name, domain, type) (__itt_counter)0 /* keep the call usable as an expression */
+#define __itt_counter_create_typedA_ptr 0
+#define __itt_counter_create_typedW(name, domain, type) (__itt_counter)0 /* keep the call usable as an expression */
+#define __itt_counter_create_typedW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_counter_create_typed(name, domain, type)  (__itt_counter)0 /* keep the call usable as an expression */
+#define __itt_counter_create_typed_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_counter_create_typedA_ptr 0
+#define __itt_counter_create_typedW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_counter_create_typed_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Destroy the counter identified by the pointer previously returned by __itt_counter_create() or
+ * __itt_counter_create_typed(). With INTEL_NO_ITTNOTIFY_API defined the call expands to nothing.
+ */
+void ITTAPI __itt_counter_destroy(__itt_counter id);
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, counter_destroy, (__itt_counter id))
+#define __itt_counter_destroy     ITTNOTIFY_VOID(counter_destroy)
+#define __itt_counter_destroy_ptr ITTNOTIFY_NAME(counter_destroy)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_counter_destroy(id)
+#define __itt_counter_destroy_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_counter_destroy_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} counters group */
+
+/**
+ * @ingroup markers
+ * @brief Create a marker instance carrying a user-defined timestamp.
+ * @param[in] domain The domain for this marker
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] id The instance ID for this marker, or __itt_null
+ * @param[in] name The name for this marker
+ * @param[in] scope The scope for this marker
+ */
+void ITTAPI __itt_marker_ex(const __itt_domain *domain,  __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_string_handle *name, __itt_scope scope);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, marker_ex,    (const __itt_domain *domain,  __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_string_handle *name, __itt_scope scope))
+#define __itt_marker_ex(d,x,y,z,a,b)    ITTNOTIFY_VOID_D5(marker_ex,d,x,y,z,a,b)
+#define __itt_marker_ex_ptr             ITTNOTIFY_NAME(marker_ex)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_marker_ex(domain,clock_domain,timestamp,id,name,scope)
+#define __itt_marker_ex_ptr    0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_marker_ex_ptr    0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @ingroup clockdomain
+ * @brief Add a relation to the current task instance, with a user-defined timestamp.
+ * The current task instance is the head of the relation.
+ * @param[in] domain The domain controlling this call
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] relation The kind of relation
+ * @param[in] tail The ID for the tail of the relation
+ */
+void ITTAPI __itt_relation_add_to_current_ex(const __itt_domain *domain,  __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_relation relation, __itt_id tail);
+
+/**
+ * @ingroup clockdomain
+ * @brief Add a relation between two instance identifiers, with a user-defined timestamp.
+ * @param[in] domain The domain controlling this call
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] head The ID for the head of the relation
+ * @param[in] relation The kind of relation
+ * @param[in] tail The ID for the tail of the relation
+ */
+void ITTAPI __itt_relation_add_ex(const __itt_domain *domain,  __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id head, __itt_relation relation, __itt_id tail);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, relation_add_to_current_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_relation relation, __itt_id tail))
+ITT_STUBV(ITTAPI, void, relation_add_ex,            (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id head, __itt_relation relation, __itt_id tail))
+#define __itt_relation_add_to_current_ex(d,x,y,z,a) ITTNOTIFY_VOID_D4(relation_add_to_current_ex,d,x,y,z,a)
+#define __itt_relation_add_to_current_ex_ptr        ITTNOTIFY_NAME(relation_add_to_current_ex)
+#define __itt_relation_add_ex(d,x,y,z,a,b)          ITTNOTIFY_VOID_D5(relation_add_ex,d,x,y,z,a,b)
+#define __itt_relation_add_ex_ptr                   ITTNOTIFY_NAME(relation_add_ex)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_relation_add_to_current_ex(domain,clock_domain,timestamp,relation,tail)
+#define __itt_relation_add_to_current_ex_ptr 0
+#define __itt_relation_add_ex(domain,clock_domain,timestamp,head,relation,tail)
+#define __itt_relation_add_ex_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_relation_add_to_current_ex_ptr 0
+#define __itt_relation_add_ex_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @cond exclude_from_documentation */
+typedef enum ___itt_track_group_type
+{
+    __itt_track_group_type_normal = 0 /*!< The only track group type defined by this enum */
+} __itt_track_group_type;
+/** @endcond */
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+
+typedef struct ___itt_track_group
+{
+    __itt_string_handle* name;     /*!< Name of the track group */
+    struct ___itt_track* track;    /*!< List of child tracks    */
+    __itt_track_group_type tgtype; /*!< Type of the track group */
+    int   extra1;                  /*!< Reserved. Must be zero  */
+    void* extra2;                  /*!< Reserved. Must be zero  */
+    struct ___itt_track_group* next; /*!< Presumably the next group in a linked list - confirm */
+} __itt_track_group;
+
+#pragma pack(pop)
+/** @endcond */
+
+/**
+ * @brief Placeholder for custom track types. Currently, "normal" custom track
+ * is the only publicly available track type (a queue type is added when INTEL_ITTNOTIFY_API_PRIVATE is defined).
+ */
+typedef enum ___itt_track_type
+{
+    __itt_track_type_normal = 0
+#ifdef INTEL_ITTNOTIFY_API_PRIVATE
+    , __itt_track_type_queue
+#endif /* INTEL_ITTNOTIFY_API_PRIVATE */
+} __itt_track_type;
+
+/** @cond exclude_from_documentation */
+#pragma pack(push, 8)
+
+typedef struct ___itt_track
+{
+    __itt_string_handle* name; /*!< Name of the track       */
+    __itt_track_group* group;  /*!< Parent group to a track */
+    __itt_track_type ttype;    /*!< Type of the track       */
+    int   extra1;              /*!< Reserved. Must be zero  */
+    void* extra2;              /*!< Reserved. Must be zero  */
+    struct ___itt_track* next; /*!< Presumably the next track in a linked list - confirm */
+} __itt_track;
+
+#pragma pack(pop)
+/** @endcond */
+
+/**
+ * @brief Create logical track group. With INTEL_NO_ITTNOTIFY_API defined the call expands to a null (__itt_track_group*)0.
+ */
+__itt_track_group* ITTAPI __itt_track_group_create(__itt_string_handle* name, __itt_track_group_type track_group_type);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_track_group*, track_group_create, (__itt_string_handle* name, __itt_track_group_type track_group_type))
+#define __itt_track_group_create     ITTNOTIFY_DATA(track_group_create)
+#define __itt_track_group_create_ptr ITTNOTIFY_NAME(track_group_create)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_track_group_create(name,track_group_type)  (__itt_track_group*)0 /* arity must match the two-argument prototype */
+#define __itt_track_group_create_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_track_group_create_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Create logical track. With INTEL_NO_ITTNOTIFY_API defined the call expands to a null (__itt_track*)0.
+ */
+__itt_track* ITTAPI __itt_track_create(__itt_track_group* track_group, __itt_string_handle* name, __itt_track_type track_type);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_track*, track_create, (__itt_track_group* track_group,__itt_string_handle* name, __itt_track_type track_type))
+#define __itt_track_create     ITTNOTIFY_DATA(track_create)
+#define __itt_track_create_ptr ITTNOTIFY_NAME(track_create)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_track_create(track_group,name,track_type)  (__itt_track*)0
+#define __itt_track_create_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_track_create_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Set the logical track (presumably the track that subsequent notifications are attributed to - confirm).
+ */
+void ITTAPI __itt_set_track(__itt_track* track);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, set_track, (__itt_track *track))
+#define __itt_set_track     ITTNOTIFY_VOID(set_track)
+#define __itt_set_track_ptr ITTNOTIFY_NAME(set_track)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_set_track(track)
+#define __itt_set_track_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_set_track_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/* ========================================================================== */
+/** @cond exclude_from_gpa_documentation */
+/**
+ * @defgroup events Events
+ * @ingroup public
+ * Events group
+ * @{
+ */
+/** @brief user event type (a plain int identifier) */
+typedef int __itt_event;
+
+/**
+ * @brief Create an event notification
+ * @note Error conditions (presumably those yielding __itt_err): name or namelen is null, name and namelen do not match, or the user event feature is not enabled
+ * @return non-zero event identifier upon success and __itt_err otherwise (the INTEL_NO_ITTNOTIFY_API no-op macros yield 0)
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_event LIBITTAPI __itt_event_createA(const char    *name, int namelen);
+__itt_event LIBITTAPI __itt_event_createW(const wchar_t *name, int namelen);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_event_create     __itt_event_createW
+#  define __itt_event_create_ptr __itt_event_createW_ptr
+#else
+#  define __itt_event_create     __itt_event_createA
+#  define __itt_event_create_ptr __itt_event_createA_ptr
+#endif /* UNICODE */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_event LIBITTAPI __itt_event_create(const char *name, int namelen);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, __itt_event, event_createA, (const char    *name, int namelen))
+ITT_STUB(LIBITTAPI, __itt_event, event_createW, (const wchar_t *name, int namelen))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, __itt_event, event_create,  (const char    *name, int namelen))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_event_createA     ITTNOTIFY_DATA(event_createA)
+#define __itt_event_createA_ptr ITTNOTIFY_NAME(event_createA)
+#define __itt_event_createW     ITTNOTIFY_DATA(event_createW)
+#define __itt_event_createW_ptr ITTNOTIFY_NAME(event_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_event_create      ITTNOTIFY_DATA(event_create)
+#define __itt_event_create_ptr  ITTNOTIFY_NAME(event_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_event_createA(name, namelen) (__itt_event)0
+#define __itt_event_createA_ptr 0
+#define __itt_event_createW(name, namelen) (__itt_event)0
+#define __itt_event_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_event_create(name, namelen)  (__itt_event)0
+#define __itt_event_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_event_createA_ptr 0
+#define __itt_event_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_event_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record an occurrence of the event @p event.
+ * @return __itt_err upon failure (invalid event id/user event feature not enabled); the INTEL_NO_ITTNOTIFY_API no-op macro always yields 0
+ */
+int LIBITTAPI __itt_event_start(__itt_event event);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(LIBITTAPI, int, event_start, (__itt_event event))
+#define __itt_event_start     ITTNOTIFY_DATA(event_start)
+#define __itt_event_start_ptr ITTNOTIFY_NAME(event_start)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_event_start(event) (int)0
+#define __itt_event_start_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_event_start_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record the end of an occurrence of the event @p event.
+ * @note Calling it is optional for events that do not have durations.
+ * @return __itt_err upon failure (invalid event id/user event feature not enabled); the INTEL_NO_ITTNOTIFY_API no-op macro always yields 0
+ */
+int LIBITTAPI __itt_event_end(__itt_event event);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(LIBITTAPI, int, event_end, (__itt_event event))
+#define __itt_event_end     ITTNOTIFY_DATA(event_end)
+#define __itt_event_end_ptr ITTNOTIFY_NAME(event_end)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_event_end(event) (int)0
+#define __itt_event_end_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_event_end_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} events group */
+
+
+/**
+ * @defgroup arrays Arrays Visualizer
+ * @ingroup public
+ * Visualize arrays
+ * @{
+ */
+
+/**
+ * @enum __itt_av_data_type
+ * @brief Element types of array data (C/C++ intrinsic types); __itt_e_first/__itt_e_last alias the lowest/highest value
+ */
+typedef enum
+{
+    __itt_e_first = 0, /* alias: same value as __itt_e_char */
+    __itt_e_char = 0,  /* 1-byte integer */
+    __itt_e_uchar,     /* 1-byte unsigned integer */
+    __itt_e_int16,     /* 2-byte integer */
+    __itt_e_uint16,    /* 2-byte unsigned integer  */
+    __itt_e_int32,     /* 4-byte integer */
+    __itt_e_uint32,    /* 4-byte unsigned integer */
+    __itt_e_int64,     /* 8-byte integer */
+    __itt_e_uint64,    /* 8-byte unsigned integer */
+    __itt_e_float,     /* 4-byte floating point */
+    __itt_e_double,    /* 8-byte floating point */
+    __itt_e_last = __itt_e_double /* alias: same value as __itt_e_double */
+} __itt_av_data_type;
+
+/**
+ * @brief Save an array data to a file.
+ * Output format is defined by the file extension. The csv and bmp formats are supported (bmp - for 2-dimensional array only).
+ * @param[in] data - pointer to the array data
+ * @param[in] rank - the rank of the array
+ * @param[in] dimensions - pointer to an array of integers, which specifies the array dimensions.
+ * The size of dimensions must be equal to the rank
+ * @param[in] type - the type of the array, specified as one of the __itt_av_data_type values (for intrinsic types)
+ * @param[in] filePath - the file path; the output format is defined by the file extension
+ * @param[in] columnOrder - defines how the array is stored in the linear memory.
+ * It should be 1 for column-major order (e.g. in FORTRAN) or 0 - for row-major order (e.g. in C).
+ * With INTEL_NO_ITTNOTIFY_API defined the call expands to (int)0, like the __itt_event_start no-op macro.
+ */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+int ITTAPI __itt_av_saveA(void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder);
+int ITTAPI __itt_av_saveW(void *data, int rank, const int *dimensions, int type, const wchar_t *filePath, int columnOrder);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_av_save     __itt_av_saveW
+#  define __itt_av_save_ptr __itt_av_saveW_ptr
+#else /* UNICODE */
+#  define __itt_av_save     __itt_av_saveA
+#  define __itt_av_save_ptr __itt_av_saveA_ptr
+#endif /* UNICODE */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+int ITTAPI __itt_av_save(void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, int, av_saveA, (void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder))
+ITT_STUB(ITTAPI, int, av_saveW, (void *data, int rank, const int *dimensions, int type, const wchar_t *filePath, int columnOrder))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int, av_save,  (void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_av_saveA     ITTNOTIFY_DATA(av_saveA)
+#define __itt_av_saveA_ptr ITTNOTIFY_NAME(av_saveA)
+#define __itt_av_saveW     ITTNOTIFY_DATA(av_saveW)
+#define __itt_av_saveW_ptr ITTNOTIFY_NAME(av_saveW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_av_save     ITTNOTIFY_DATA(av_save)
+#define __itt_av_save_ptr ITTNOTIFY_NAME(av_save)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_av_saveA(data, rank, dimensions, type, filePath, columnOrder) (int)0 /* arity matches the prototype */
+#define __itt_av_saveA_ptr 0
+#define __itt_av_saveW(data, rank, dimensions, type, filePath, columnOrder) (int)0 /* arity matches the prototype */
+#define __itt_av_saveW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_av_save(data, rank, dimensions, type, filePath, columnOrder) (int)0 /* arity matches the prototype */
+#define __itt_av_save_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_av_saveA_ptr 0
+#define __itt_av_saveW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_av_save_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @brief Enable attach (no arguments, no return value). NOTE(review): purpose is not documented in this header - confirm. */
+void ITTAPI __itt_enable_attach(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, enable_attach, (void))
+#define __itt_enable_attach     ITTNOTIFY_VOID(enable_attach)
+#define __itt_enable_attach_ptr ITTNOTIFY_NAME(enable_attach)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_enable_attach()
+#define __itt_enable_attach_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_enable_attach_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @cond exclude_from_gpa_documentation */
+
+/** @} arrays group */
+
+/** @endcond */
+
+/**
+ * @brief Module load info
+ * This API is used to report necessary information in case of module relocation.
+ * With INTEL_NO_ITTNOTIFY_API defined the call expands to nothing.
+ * @param[in] start_addr - relocated module start address
+ * @param[in] end_addr - relocated module end address
+ * @param[in] path - file system path to the module
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_module_loadA(void *start_addr, void *end_addr, const char *path);
+void ITTAPI __itt_module_loadW(void *start_addr, void *end_addr, const wchar_t *path);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_module_load     __itt_module_loadW
+#  define __itt_module_load_ptr __itt_module_loadW_ptr
+#else /* UNICODE */
+#  define __itt_module_load     __itt_module_loadA
+#  define __itt_module_load_ptr __itt_module_loadA_ptr
+#endif /* UNICODE */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+void ITTAPI __itt_module_load(void *start_addr, void *end_addr, const char *path);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+/* The stubs below return void, so they use ITT_STUBV like the other void APIs in this header. */
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, module_loadA, (void *start_addr, void *end_addr, const char *path))
+ITT_STUBV(ITTAPI, void, module_loadW, (void *start_addr, void *end_addr, const wchar_t *path))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, module_load,  (void *start_addr, void *end_addr, const char *path))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_module_loadA     ITTNOTIFY_VOID(module_loadA)
+#define __itt_module_loadA_ptr ITTNOTIFY_NAME(module_loadA)
+#define __itt_module_loadW     ITTNOTIFY_VOID(module_loadW)
+#define __itt_module_loadW_ptr ITTNOTIFY_NAME(module_loadW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_module_load     ITTNOTIFY_VOID(module_load)
+#define __itt_module_load_ptr ITTNOTIFY_NAME(module_load)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_module_loadA(start_addr, end_addr, path)
+#define __itt_module_loadA_ptr 0
+#define __itt_module_loadW(start_addr, end_addr, path)
+#define __itt_module_loadW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_module_load(start_addr, end_addr, path)
+#define __itt_module_load_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_module_loadA_ptr 0
+#define __itt_module_loadW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_module_load_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* _ITTNOTIFY_H_ */
+
+#ifdef INTEL_ITTNOTIFY_API_PRIVATE
+
+#ifndef _ITTNOTIFY_PRIVATE_
+#define _ITTNOTIFY_PRIVATE_
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * @ingroup clockdomain
+ * @brief Begin an overlapped task instance, with a user-defined timestamp.
+ * @param[in] domain The domain for this task
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] taskid The identifier for this task instance, *cannot* be __itt_null.
+ * @param[in] parentid The parent of this task, or __itt_null.
+ * @param[in] name The name of this task.
+ */
+void ITTAPI __itt_task_begin_overlapped_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, __itt_string_handle* name);
+
+/**
+ * @ingroup clockdomain
+ * @brief End an overlapped task instance, with a user-defined timestamp.
+ * @param[in] domain The domain for this task
+ * @param[in] clock_domain The clock domain controlling the execution of this call.
+ * @param[in] timestamp The user defined timestamp.
+ * @param[in] taskid Explicit ID of finished task
+ */
+void ITTAPI __itt_task_end_overlapped_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, task_begin_overlapped_ex,       (const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, __itt_string_handle* name))
+ITT_STUBV(ITTAPI, void, task_end_overlapped_ex,         (const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid))
+#define __itt_task_begin_overlapped_ex(d,x,y,z,a,b)     ITTNOTIFY_VOID_D5(task_begin_overlapped_ex,d,x,y,z,a,b)
+#define __itt_task_begin_overlapped_ex_ptr              ITTNOTIFY_NAME(task_begin_overlapped_ex)
+#define __itt_task_end_overlapped_ex(d,x,y,z)           ITTNOTIFY_VOID_D3(task_end_overlapped_ex,d,x,y,z)
+#define __itt_task_end_overlapped_ex_ptr                ITTNOTIFY_NAME(task_end_overlapped_ex)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_task_begin_overlapped_ex(domain,clock_domain,timestamp,taskid,parentid,name)
+#define __itt_task_begin_overlapped_ex_ptr      0
+#define __itt_task_end_overlapped_ex(domain,clock_domain,timestamp,taskid)
+#define __itt_task_end_overlapped_ex_ptr        0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_task_begin_overlapped_ex_ptr      0
+#define __itt_task_end_overlapped_ptr           0 /* NOTE(review): non-_ex name, not declared in this section; looks like a stray duplicate - confirm */
+#define __itt_task_end_overlapped_ex_ptr        0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @defgroup makrs_internal Marks
+ * @ingroup internal
+ * Marks group
+ * @warning Internal API:
+ *   - It is not shipped to outside of Intel
+ *   - It is delivered to internal Intel teams using e-mail or SVN access only
+ * @{
+ */
+/** @brief user mark type (a plain int handle) */
+typedef int __itt_mark_type;
+
+/**
+ * @brief Creates a user mark type with the specified name using char or Unicode string.
+ * @param[in] name - name of mark to create
+ * @return Returns a handle to the mark type (the INTEL_NO_ITTNOTIFY_API no-op macros yield 0)
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_mark_type ITTAPI __itt_mark_createA(const char    *name);
+__itt_mark_type ITTAPI __itt_mark_createW(const wchar_t *name);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_mark_create     __itt_mark_createW
+#  define __itt_mark_create_ptr __itt_mark_createW_ptr
+#else /* UNICODE */
+#  define __itt_mark_create     __itt_mark_createA
+#  define __itt_mark_create_ptr __itt_mark_createA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_mark_type ITTAPI __itt_mark_create(const char *name);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_mark_type, mark_createA, (const char    *name))
+ITT_STUB(ITTAPI, __itt_mark_type, mark_createW, (const wchar_t *name))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_mark_type, mark_create,  (const char *name))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_mark_createA     ITTNOTIFY_DATA(mark_createA)
+#define __itt_mark_createA_ptr ITTNOTIFY_NAME(mark_createA)
+#define __itt_mark_createW     ITTNOTIFY_DATA(mark_createW)
+#define __itt_mark_createW_ptr ITTNOTIFY_NAME(mark_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark_create      ITTNOTIFY_DATA(mark_create)
+#define __itt_mark_create_ptr  ITTNOTIFY_NAME(mark_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_mark_createA(name) (__itt_mark_type)0
+#define __itt_mark_createA_ptr 0
+#define __itt_mark_createW(name) (__itt_mark_type)0
+#define __itt_mark_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark_create(name)  (__itt_mark_type)0
+#define __itt_mark_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_mark_createA_ptr 0
+#define __itt_mark_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Creates a "discrete" user mark type of the specified type and an optional parameter using char or Unicode string.
+ *
+ * - The mark of "discrete" type is placed to collection results in case of success. It appears in overtime view(s) as a special tick sign.
+ * - The call is "synchronous" - function returns after mark is actually added to results.
+ * - This function is useful, for example, to mark different phases of application
+ *   (beginning of the next mark automatically means end of current region).
+ * - Can be used together with "continuous" marks (see below) at the same collection session
+ * @param[in] mt - mark, created by __itt_mark_create(const char* name) function
+ * @param[in] parameter - string parameter of mark
+ * @return Returns zero value in case of success, non-zero value otherwise.
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+int ITTAPI __itt_markA(__itt_mark_type mt, const char    *parameter); /* char-string variant */
+int ITTAPI __itt_markW(__itt_mark_type mt, const wchar_t *parameter); /* wide-string variant */
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_mark     __itt_markW /* generic name selects the wide variant in UNICODE builds */
+#  define __itt_mark_ptr __itt_markW_ptr
+#else /* UNICODE  */
+#  define __itt_mark     __itt_markA /* generic name selects the char variant otherwise */
+#  define __itt_mark_ptr __itt_markA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+int ITTAPI __itt_mark(__itt_mark_type mt, const char *parameter); /* single char-string entry point off Windows */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY /* three configurations follow: full API, API compiled out, no macro bodies */
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, int, markA, (__itt_mark_type mt, const char    *parameter))
+ITT_STUB(ITTAPI, int, markW, (__itt_mark_type mt, const wchar_t *parameter))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int, mark,  (__itt_mark_type mt, const char *parameter))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_markA     ITTNOTIFY_DATA(markA) /* public names are routed through the ITTNOTIFY_* wrappers (defined elsewhere) */
+#define __itt_markA_ptr ITTNOTIFY_NAME(markA)
+#define __itt_markW     ITTNOTIFY_DATA(markW)
+#define __itt_markW_ptr ITTNOTIFY_NAME(markW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark      ITTNOTIFY_DATA(mark)
+#define __itt_mark_ptr  ITTNOTIFY_NAME(mark)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_markA(mt, parameter) (int)0 /* API compiled out: the call becomes the constant 0 (documented success value) */
+#define __itt_markA_ptr 0
+#define __itt_markW(mt, parameter) (int)0
+#define __itt_markW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark(mt, parameter)  (int)0 /* API compiled out: the call becomes the constant 0 (documented success value) */
+#define __itt_mark_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_markA_ptr 0 /* only the *_ptr names are defined in this configuration */
+#define __itt_markW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark_ptr  0 /* only the *_ptr name is defined in this configuration */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Use this if necessary to create a "discrete" user event type (mark) for process
+ * rather than for one thread
+ * @see int __itt_mark(__itt_mark_type mt, const char* parameter);
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+int ITTAPI __itt_mark_globalA(__itt_mark_type mt, const char    *parameter); /* char-string variant */
+int ITTAPI __itt_mark_globalW(__itt_mark_type mt, const wchar_t *parameter); /* wide-string variant */
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_mark_global     __itt_mark_globalW /* generic name selects the wide variant in UNICODE builds */
+#  define __itt_mark_global_ptr __itt_mark_globalW_ptr
+#else /* UNICODE  */
+#  define __itt_mark_global     __itt_mark_globalA /* generic name selects the char variant otherwise */
+#  define __itt_mark_global_ptr __itt_mark_globalA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+int ITTAPI __itt_mark_global(__itt_mark_type mt, const char *parameter); /* single char-string entry point off Windows */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY /* three configurations follow: full API, API compiled out, no macro bodies */
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, int, mark_globalA, (__itt_mark_type mt, const char    *parameter))
+ITT_STUB(ITTAPI, int, mark_globalW, (__itt_mark_type mt, const wchar_t *parameter))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int, mark_global,  (__itt_mark_type mt, const char *parameter))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_mark_globalA     ITTNOTIFY_DATA(mark_globalA) /* public names are routed through the ITTNOTIFY_* wrappers (defined elsewhere) */
+#define __itt_mark_globalA_ptr ITTNOTIFY_NAME(mark_globalA)
+#define __itt_mark_globalW     ITTNOTIFY_DATA(mark_globalW)
+#define __itt_mark_globalW_ptr ITTNOTIFY_NAME(mark_globalW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark_global      ITTNOTIFY_DATA(mark_global)
+#define __itt_mark_global_ptr  ITTNOTIFY_NAME(mark_global)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_mark_globalA(mt, parameter) (int)0 /* API compiled out: the call becomes the constant 0 */
+#define __itt_mark_globalA_ptr 0
+#define __itt_mark_globalW(mt, parameter) (int)0
+#define __itt_mark_globalW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark_global(mt, parameter)  (int)0 /* API compiled out: the call becomes the constant 0 */
+#define __itt_mark_global_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_mark_globalA_ptr 0 /* only the *_ptr names are defined in this configuration */
+#define __itt_mark_globalW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_mark_global_ptr  0 /* only the *_ptr name is defined in this configuration */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Creates an "end" point for "continuous" mark with specified name.
+ *
+ * - Returns zero value in case of success, non-zero value otherwise.
+ *   Also returns non-zero value when preceding "begin" point for the
+ *   mark with the same name failed to be created or not created.
+ * - The mark of "continuous" type is placed to collection results in
+ *   case of success. It appears in overtime view(s) as a special tick
+ *   sign (different from "discrete" mark) together with line from
+ *   corresponding "begin" mark to "end" mark.
+ * @note Continuous marks can overlap and be nested inside each other.
+ * Discrete mark can be nested inside marked region
+ * @param[in] mt - mark, created by __itt_mark_create(const char* name) function
+ * @return Returns zero value in case of success, non-zero value otherwise.
+ */
+int ITTAPI __itt_mark_off(__itt_mark_type mt); /* closes the "continuous" mark identified by mt */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY /* three configurations follow: full API, API compiled out, no macro bodies */
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, int, mark_off, (__itt_mark_type mt))
+#define __itt_mark_off     ITTNOTIFY_DATA(mark_off) /* routed through the ITTNOTIFY_* wrappers (defined elsewhere) */
+#define __itt_mark_off_ptr ITTNOTIFY_NAME(mark_off)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_mark_off(mt) (int)0 /* API compiled out: the call becomes the constant 0 */
+#define __itt_mark_off_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_mark_off_ptr 0 /* only the *_ptr name is defined in this configuration */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Use this if necessary to create an "end" point for mark of process
+ * @see int __itt_mark_off(__itt_mark_type mt);
+ */
+int ITTAPI __itt_mark_global_off(__itt_mark_type mt); /* process-wide counterpart of __itt_mark_off */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY /* three configurations follow: full API, API compiled out, no macro bodies */
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, int, mark_global_off, (__itt_mark_type mt))
+#define __itt_mark_global_off     ITTNOTIFY_DATA(mark_global_off) /* routed through the ITTNOTIFY_* wrappers (defined elsewhere) */
+#define __itt_mark_global_off_ptr ITTNOTIFY_NAME(mark_global_off)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_mark_global_off(mt) (int)0 /* API compiled out: the call becomes the constant 0 */
+#define __itt_mark_global_off_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_mark_global_off_ptr 0 /* only the *_ptr name is defined in this configuration */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} marks group */
+
+/**
+ * @defgroup counters_internal Counters
+ * @ingroup internal
+ * Counters group
+ * @{
+ */
+/** @} counters_internal group */
+
+/**
+ * @defgroup stitch Stack Stitching
+ * @ingroup internal
+ * Stack Stitching group
+ * @{
+ */
+/**
+ * @brief Opaque handle identifying a stitch point (see __itt_stack_caller_create())
+ */
+typedef struct ___itt_caller *__itt_caller;
+
+/**
+ * @brief Create the stitch point e.g. a point in call stack where other stacks should be stitched to.
+ * The function returns a unique identifier which is used to match the cut points with corresponding stitch points.
+ */
+__itt_caller ITTAPI __itt_stack_caller_create(void); /* returns the identifier of the new stitch point */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY /* three configurations follow: full API, API compiled out, no macro bodies */
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_caller, stack_caller_create, (void))
+#define __itt_stack_caller_create     ITTNOTIFY_DATA(stack_caller_create) /* routed through the ITTNOTIFY_* wrappers (defined elsewhere) */
+#define __itt_stack_caller_create_ptr ITTNOTIFY_NAME(stack_caller_create)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_stack_caller_create() (__itt_caller)0 /* API compiled out: the call yields a null handle */
+#define __itt_stack_caller_create_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_stack_caller_create_ptr 0 /* only the *_ptr name is defined in this configuration */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Destroy the information about the stitch point identified by the pointer previously returned by __itt_stack_caller_create()
+ */
+void ITTAPI __itt_stack_caller_destroy(__itt_caller id); /* id: handle obtained from __itt_stack_caller_create() */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY /* three configurations follow: full API, API compiled out, no macro bodies */
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, stack_caller_destroy, (__itt_caller id))
+#define __itt_stack_caller_destroy     ITTNOTIFY_VOID(stack_caller_destroy) /* void API: uses the ITTNOTIFY_VOID wrapper (defined elsewhere) */
+#define __itt_stack_caller_destroy_ptr ITTNOTIFY_NAME(stack_caller_destroy)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_stack_caller_destroy(id) /* API compiled out: the call expands to nothing */
+#define __itt_stack_caller_destroy_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_stack_caller_destroy_ptr 0 /* only the *_ptr name is defined in this configuration */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Sets the cut point. Stack from each event which occurs after this call will be cut
+ * at the same stack level the function was called and stitched to the corresponding stitch point.
+ */
+void ITTAPI __itt_stack_callee_enter(__itt_caller id); /* id: stitch point the cut stacks are attached to */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY /* three configurations follow: full API, API compiled out, no macro bodies */
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, stack_callee_enter, (__itt_caller id))
+#define __itt_stack_callee_enter     ITTNOTIFY_VOID(stack_callee_enter) /* void API: uses the ITTNOTIFY_VOID wrapper (defined elsewhere) */
+#define __itt_stack_callee_enter_ptr ITTNOTIFY_NAME(stack_callee_enter)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_stack_callee_enter(id) /* API compiled out: the call expands to nothing */
+#define __itt_stack_callee_enter_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_stack_callee_enter_ptr 0 /* only the *_ptr name is defined in this configuration */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief This function eliminates the cut point which was set by latest __itt_stack_callee_enter().
+ */
+void ITTAPI __itt_stack_callee_leave(__itt_caller id); /* undoes the latest __itt_stack_callee_enter() */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY /* three configurations follow: full API, API compiled out, no macro bodies */
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, stack_callee_leave, (__itt_caller id))
+#define __itt_stack_callee_leave     ITTNOTIFY_VOID(stack_callee_leave) /* void API: uses the ITTNOTIFY_VOID wrapper (defined elsewhere) */
+#define __itt_stack_callee_leave_ptr ITTNOTIFY_NAME(stack_callee_leave)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_stack_callee_leave(id) /* API compiled out: the call expands to nothing */
+#define __itt_stack_callee_leave_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_stack_callee_leave_ptr 0 /* only the *_ptr name is defined in this configuration */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @} stitch group */
+
+/* ***************************************************************************************************************************** */
+
+#include <stdarg.h>
+
+/** @cond exclude_from_documentation */
+typedef enum __itt_error_code /* codes passed to the error handler; each is followed by its variadic argument layout */
+{
+    __itt_error_success       = 0, /*!< no error */
+    __itt_error_no_module     = 1, /*!< module can't be loaded */
+    /* args: %1$s -- library name; win: %2$d -- system error code; unx: %2$s -- system error message. */
+    __itt_error_no_symbol     = 2, /*!< symbol not found */
+    /* args: %1$s -- library name, %2$s -- symbol name. */
+    __itt_error_unknown_group = 3, /*!< unknown group specified */
+    /* args: %1$s -- env var name, %2$s -- group name. */
+    __itt_error_cant_read_env = 4, /*!< GetEnvironmentVariable() failed */
+    /* args: %1$s -- env var name, %2$d -- system error. */
+    __itt_error_env_too_long  = 5, /*!< variable value too long */
+    /* args: %1$s -- env var name, %2$d -- actual length of the var, %3$d -- max allowed length. */
+    __itt_error_system        = 6  /*!< pthread_mutexattr_init or pthread_mutex_init failed */
+    /* args: %1$s -- function name, %2$d -- errno. */
+} __itt_error_code;
+
+typedef void (__itt_error_handler_t)(__itt_error_code code, va_list); /* handler signature: error code plus its variadic details */
+__itt_error_handler_t* __itt_set_error_handler(__itt_error_handler_t*); /* presumably returns the previously installed handler -- TODO confirm */
+
+const char* ITTAPI __itt_api_version(void); /* version string accessor; yields a null pointer when the API is compiled out */
+/** @endcond */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY /* three configurations follow: full API, API compiled out, no macro bodies */
+#ifndef INTEL_NO_ITTNOTIFY_API
+#define __itt_error_handler ITT_JOIN(INTEL_ITTNOTIFY_PREFIX, error_handler) /* symbol name is built from the configured prefix */
+void __itt_error_handler(__itt_error_code code, va_list args);
+extern const int ITTNOTIFY_NAME(err); /* declared here only; the definition lives elsewhere */
+#define __itt_err ITTNOTIFY_NAME(err)
+ITT_STUB(ITTAPI, const char*, api_version, (void))
+#define __itt_api_version     ITTNOTIFY_DATA(api_version) /* routed through the ITTNOTIFY_* wrappers (defined elsewhere) */
+#define __itt_api_version_ptr ITTNOTIFY_NAME(api_version)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_api_version()   (const char*)0 /* API compiled out: the call yields a null pointer */
+#define __itt_api_version_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_api_version_ptr 0 /* only the *_ptr name is defined in this configuration */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* _ITTNOTIFY_PRIVATE_ */
+
+#endif /* INTEL_ITTNOTIFY_API_PRIVATE */
diff --git a/final/runtime/src/thirdparty/ittnotify/ittnotify_config.h b/final/runtime/src/thirdparty/ittnotify/ittnotify_config.h
new file mode 100644
index 0000000..cc494cb
--- /dev/null
+++ b/final/runtime/src/thirdparty/ittnotify/ittnotify_config.h
@@ -0,0 +1,587 @@
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _ITTNOTIFY_CONFIG_H_
+#define _ITTNOTIFY_CONFIG_H_
+
+/** @cond exclude_from_documentation */
+#ifndef ITT_OS_WIN /* numeric OS identifiers; each may be pre-defined by the build */
+#  define ITT_OS_WIN   1
+#endif /* ITT_OS_WIN */
+
+#ifndef ITT_OS_LINUX
+#  define ITT_OS_LINUX 2
+#endif /* ITT_OS_LINUX */
+
+#ifndef ITT_OS_MAC
+#  define ITT_OS_MAC   3
+#endif /* ITT_OS_MAC */
+
+#ifndef ITT_OS_FREEBSD
+#  define ITT_OS_FREEBSD   4
+#endif /* ITT_OS_FREEBSD */
+
+#ifndef ITT_OS /* auto-detect the OS from compiler-provided macros unless the build set it */
+#  if defined WIN32 || defined _WIN32
+#    define ITT_OS ITT_OS_WIN
+#  elif defined( __APPLE__ ) && defined( __MACH__ )
+#    define ITT_OS ITT_OS_MAC
+#  elif defined( __FreeBSD__ )
+#    define ITT_OS ITT_OS_FREEBSD
+#  else
+#    define ITT_OS ITT_OS_LINUX /* fallback: anything not matched above is treated as Linux */
+#  endif
+#endif /* ITT_OS */
+
+#ifndef ITT_PLATFORM_WIN /* numeric platform identifiers; each may be pre-defined by the build */
+#  define ITT_PLATFORM_WIN 1
+#endif /* ITT_PLATFORM_WIN */
+
+#ifndef ITT_PLATFORM_POSIX
+#  define ITT_PLATFORM_POSIX 2
+#endif /* ITT_PLATFORM_POSIX */
+
+#ifndef ITT_PLATFORM_MAC
+#  define ITT_PLATFORM_MAC 3
+#endif /* ITT_PLATFORM_MAC */
+
+#ifndef ITT_PLATFORM_FREEBSD
+#  define ITT_PLATFORM_FREEBSD 4
+#endif /* ITT_PLATFORM_FREEBSD */
+
+#ifndef ITT_PLATFORM /* derive the platform from ITT_OS unless the build set it */
+#  if ITT_OS==ITT_OS_WIN
+#    define ITT_PLATFORM ITT_PLATFORM_WIN
+#  elif ITT_OS==ITT_OS_MAC
+#    define ITT_PLATFORM ITT_PLATFORM_MAC
+#  elif ITT_OS==ITT_OS_FREEBSD
+#    define ITT_PLATFORM ITT_PLATFORM_FREEBSD
+#  else
+#    define ITT_PLATFORM ITT_PLATFORM_POSIX /* Linux and any other ITT_OS value map to POSIX */
+#  endif
+#endif /* ITT_PLATFORM */
+
+#if defined(_UNICODE) && !defined(UNICODE)
+#define UNICODE
+#endif
+
+#include <stddef.h>
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#include <tchar.h>
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#include <stdint.h>
+#if defined(UNICODE) || defined(_UNICODE)
+#include <wchar.h>
+#endif /* UNICODE || _UNICODE */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#ifndef ITTAPI_CDECL /* cdecl calling-convention spelling for the current compiler/target */
+#  if ITT_PLATFORM==ITT_PLATFORM_WIN
+#    define ITTAPI_CDECL __cdecl
+#  else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#    if defined _M_IX86 || defined __i386__
+#      define ITTAPI_CDECL __attribute__ ((cdecl))
+#    else  /* _M_IX86 || __i386__ */
+#      define ITTAPI_CDECL /* actual only on x86 platform */
+#    endif /* _M_IX86 || __i386__ */
+#  endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* ITTAPI_CDECL */
+
+#ifndef STDCALL /* stdcall calling-convention spelling; expands to nothing off 32-bit x86 */
+#  if ITT_PLATFORM==ITT_PLATFORM_WIN
+#    define STDCALL __stdcall
+#  else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#    if defined _M_IX86 || defined __i386__
+#      define STDCALL __attribute__ ((stdcall))
+#    else  /* _M_IX86 || __i386__ */
+#      define STDCALL /* supported only on x86 platform */
+#    endif /* _M_IX86 || __i386__ */
+#  endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* STDCALL */
+
+#define ITTAPI    ITTAPI_CDECL /* every public ITT entry point uses cdecl */
+#define LIBITTAPI ITTAPI_CDECL
+
+/* TODO: Temporary for compatibility! */
+#define ITTAPI_CALL    ITTAPI_CDECL /* alternate spellings of the same convention */
+#define LIBITTAPI_CALL ITTAPI_CDECL
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+/* use __forceinline (VC++ specific) */
+#define ITT_INLINE           __forceinline
+#define ITT_INLINE_ATTRIBUTE /* nothing */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+/*
+ * Generally, functions are not inlined unless optimization is specified.
+ * For functions declared inline, the always_inline attribute inlines the
+ * function even if no optimization level was specified.
+ */
+#ifdef __STRICT_ANSI__
+#define ITT_INLINE           static /* strict ANSI mode: the 'inline' keyword is not used */
+#define ITT_INLINE_ATTRIBUTE __attribute__((unused))
+#else  /* __STRICT_ANSI__ */
+#define ITT_INLINE           static inline
+#define ITT_INLINE_ATTRIBUTE __attribute__((always_inline, unused))
+#endif /* __STRICT_ANSI__ */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+/** @endcond */
+
+#ifndef ITT_ARCH_IA32 /* numeric architecture identifiers; each may be pre-defined by the build */
+#  define ITT_ARCH_IA32  1
+#endif /* ITT_ARCH_IA32 */
+
+#ifndef ITT_ARCH_IA32E
+#  define ITT_ARCH_IA32E 2
+#endif /* ITT_ARCH_IA32E */
+
+/* NOTE(review): ITT_ARCH_IA64 is referenced below but never defined in this header (an undefined name is 0 in #if); was 3 its slot? */
+#ifndef ITT_ARCH_AARCH64
+#  define ITT_ARCH_AARCH64  3
+#endif /* ITT_ARCH_AARCH64 */
+
+#ifndef ITT_ARCH_ARM
+#  define ITT_ARCH_ARM  4
+#endif /* ITT_ARCH_ARM */
+
+#ifndef ITT_ARCH_PPC64
+#  define ITT_ARCH_PPC64  5
+#endif /* ITT_ARCH_PPC64 */
+
+#ifndef ITT_ARCH_MIPS
+#  define ITT_ARCH_MIPS  6
+#endif /* ITT_ARCH_MIPS */
+
+#ifndef ITT_ARCH_MIPS64
+#  define ITT_ARCH_MIPS64  6 /* NOTE(review): same value as ITT_ARCH_MIPS, so the two cannot be told apart -- confirm intended */
+#endif /* ITT_ARCH_MIPS64 */
+
+#ifndef ITT_ARCH /* auto-detect from compiler-provided macros; stays undefined when nothing matches */
+#  if defined _M_IX86 || defined __i386__
+#    define ITT_ARCH ITT_ARCH_IA32
+#  elif defined _M_X64 || defined _M_AMD64 || defined __x86_64__
+#    define ITT_ARCH ITT_ARCH_IA32E
+#  elif defined _M_IA64 || defined __ia64__
+#    define ITT_ARCH ITT_ARCH_IA64
+#  elif defined _M_ARM || defined __arm__
+#    define ITT_ARCH ITT_ARCH_ARM
+#  elif defined __powerpc64__
+#    define ITT_ARCH ITT_ARCH_PPC64
+#  elif defined __aarch64__
+#    define ITT_ARCH ITT_ARCH_AARCH64
+#  elif defined __mips__ && !defined __mips64
+#    define ITT_ARCH ITT_ARCH_MIPS
+#  elif defined __mips__ && defined __mips64
+#    define ITT_ARCH ITT_ARCH_MIPS64
+#  endif
+#endif
+
+#ifdef __cplusplus /* C linkage helpers: real extern "C" under C++, empty under C */
+#  define ITT_EXTERN_C extern "C"
+#  define ITT_EXTERN_C_BEGIN extern "C" {
+#  define ITT_EXTERN_C_END }
+#else
+#  define ITT_EXTERN_C /* nothing */
+#  define ITT_EXTERN_C_BEGIN /* nothing */
+#  define ITT_EXTERN_C_END /* nothing */
+#endif /* __cplusplus */
+
+#define ITT_TO_STR_AUX(x) #x /* stringizes the argument exactly as written */
+#define ITT_TO_STR(x)     ITT_TO_STR_AUX(x) /* extra level so a macro argument is expanded before being stringized */
+
+#define __ITT_BUILD_ASSERT(expr, suffix) do { /* compile-time check: array size -1 is ill-formed when expr is false */ \
+    static char __itt_build_check_##suffix[(expr) ? 1 : -1]; \
+    __itt_build_check_##suffix[0] = 0; /* dummy store; presumably silences unused-variable warnings */ \
+} while(0)
+#define _ITT_BUILD_ASSERT(expr, suffix)  __ITT_BUILD_ASSERT((expr), suffix) /* extra level so __LINE__ expands before ## pasting */
+#define ITT_BUILD_ASSERT(expr)           _ITT_BUILD_ASSERT((expr), __LINE__) /* statement-context assert, uniquified by line number */
+
+#define ITT_MAGIC { 0xED, 0xAB, 0xAB, 0xEC, 0x0D, 0xEE, 0xDA, 0x30 } /* 8-byte signature; same length as __itt_global::magic */
+
+/* Replace with snapshot date YYYYMMDD for promotion build. */
+#define API_VERSION_BUILD    20151119
+
+#ifndef API_VERSION_NUM /* the build may supply the real version; 0.0.0 is the placeholder default */
+#define API_VERSION_NUM 0.0.0
+#endif /* API_VERSION_NUM */
+
+#define API_VERSION "ITT-API-Version " ITT_TO_STR(API_VERSION_NUM) /* literals concatenate: "ITT-API-Version <num> (<build>)" */ \
+                                " (" ITT_TO_STR(API_VERSION_BUILD) ")"
+
+/* OS communication functions */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#include <windows.h>
+typedef HMODULE           lib_t;   /* handle of a loaded library */
+typedef DWORD             TIDT;    /* thread id type */
+typedef CRITICAL_SECTION  mutex_t; /* lock type used by the __itt_mutex_* wrappers */
+#define MUTEX_INITIALIZER { 0 }
+#define strong_alias(name, aliasname) /* empty for Windows */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#include <dlfcn.h>
+#if defined(UNICODE) || defined(_UNICODE)
+#include <wchar.h>
+#endif /* UNICODE */
+#ifndef _GNU_SOURCE /* NOTE(review): defined after system headers were already included above -- confirm it still takes effect */
+#define _GNU_SOURCE 1 /* need for PTHREAD_MUTEX_RECURSIVE */
+#endif /* _GNU_SOURCE */
+#ifndef __USE_UNIX98
+#define __USE_UNIX98 1 /* need for PTHREAD_MUTEX_RECURSIVE, on SLES11.1 with gcc 4.3.4 wherein pthread.h missing dependency on __USE_XOPEN2K8 */
+#endif /*__USE_UNIX98*/
+#include <pthread.h>
+typedef void*             lib_t;   /* handle of a loaded library (dlopen result) */
+typedef pthread_t         TIDT;    /* thread id type */
+typedef pthread_mutex_t   mutex_t; /* lock type used by the __itt_mutex_* wrappers */
+#define MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
+#define _strong_alias(name, aliasname) /* declares aliasname as an alias of the symbol name, with name's type */ \
+            extern __typeof (name) aliasname __attribute__ ((alias (#name)));
+#define strong_alias(name, aliasname) _strong_alias(name, aliasname) /* extra level so macro arguments are expanded first */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_get_proc(lib, name) GetProcAddress(lib, name) /* Windows flavour of the OS wrapper macros */
+#define __itt_mutex_init(mutex)   InitializeCriticalSection(mutex)
+#define __itt_mutex_lock(mutex)   EnterCriticalSection(mutex)
+#define __itt_mutex_unlock(mutex) LeaveCriticalSection(mutex)
+#define __itt_load_lib(name)      LoadLibraryA(name)
+#define __itt_unload_lib(handle)  FreeLibrary(handle)
+#define __itt_system_error()      (int)GetLastError()
+#define __itt_fstrcmp(s1, s2)     lstrcmpA(s1, s2)
+#define __itt_fstrnlen(s, l)      strnlen_s(s, l)
+#define __itt_fstrcpyn(s1, b, s2, l) strncpy_s(s1, b, s2, l) /* s1 = destination, b = destination size, l = max chars copied */
+#define __itt_fstrdup(s)          _strdup(s)
+#define __itt_thread_id()         GetCurrentThreadId()
+#define __itt_thread_yield()      SwitchToThread()
+#ifndef ITT_SIMPLE_INIT /* helper is compiled only when ITT_SIMPLE_INIT is not defined */
+ITT_INLINE long
+__itt_interlocked_increment(volatile long* ptr) ITT_INLINE_ATTRIBUTE;
+ITT_INLINE long __itt_interlocked_increment(volatile long* ptr)
+{
+    return InterlockedIncrement(ptr); /* atomic increment; returns the incremented value */
+}
+#endif /* ITT_SIMPLE_INIT */
+
+#define DL_SYMBOLS (1)      /* always true on Windows: no weak-symbol probing is done */
+#define PTHREAD_SYMBOLS (1) /* always true on Windows */
+
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+#define __itt_get_proc(lib, name) dlsym(lib, name) /* POSIX flavour of the OS wrapper macros */
+#define __itt_mutex_init(mutex)   { /* initialises *mutex as recursive; each failing step is reported and execution continues */ \
+    pthread_mutexattr_t mutex_attr;                                         \
+    int error_code = pthread_mutexattr_init(&mutex_attr);                   \
+    if (error_code) /* NOTE(review): __itt_report_error is not declared in this header; the expanding file must provide it */ \
+        __itt_report_error(__itt_error_system, "pthread_mutexattr_init",    \
+                           error_code);                                     \
+    error_code = pthread_mutexattr_settype(&mutex_attr,                     \
+                                           PTHREAD_MUTEX_RECURSIVE);        \
+    if (error_code)                                                         \
+        __itt_report_error(__itt_error_system, "pthread_mutexattr_settype", \
+                           error_code);                                     \
+    error_code = pthread_mutex_init(mutex, &mutex_attr);                    \
+    if (error_code)                                                         \
+        __itt_report_error(__itt_error_system, "pthread_mutex_init",        \
+                           error_code);                                     \
+    error_code = pthread_mutexattr_destroy(&mutex_attr); /* attribute object is no longer needed once the mutex exists */ \
+    if (error_code)                                                         \
+        __itt_report_error(__itt_error_system, "pthread_mutexattr_destroy", \
+                           error_code);                                     \
+}
+#define __itt_mutex_lock(mutex)   pthread_mutex_lock(mutex)
+#define __itt_mutex_unlock(mutex) pthread_mutex_unlock(mutex)
+#define __itt_load_lib(name)      dlopen(name, RTLD_LAZY)
+#define __itt_unload_lib(handle)  dlclose(handle)
+#define __itt_system_error()      errno
+#define __itt_fstrcmp(s1, s2)     strcmp(s1, s2)
+
+/* Lets customer code supply bounded ("safe") string routines by defining SDL_STRNLEN_S and SDL_STRNCPY_S. */
+#ifdef SDL_STRNLEN_S
+#define __itt_fstrnlen(s, l)      SDL_STRNLEN_S(s, l)
+#else
+#define __itt_fstrnlen(s, l)      strlen(s) /* NOTE(review): fallback ignores the bound l */
+#endif /* SDL_STRNLEN_S */
+#ifdef SDL_STRNCPY_S
+#define __itt_fstrcpyn(s1, b, s2, l) SDL_STRNCPY_S(s1, b, s2, l)
+#else
+#define __itt_fstrcpyn(s1, b, s2, l) strncpy(s1, s2, l) /* NOTE(review): ignores b; strncpy adds no NUL when s2 has >= l chars */
+#endif /* SDL_STRNCPY_S */
+
+#define __itt_fstrdup(s)          strdup(s)
+#define __itt_thread_id()         pthread_self()
+#define __itt_thread_yield()      sched_yield()
+#if ITT_ARCH==ITT_ARCH_IA64 /* NOTE(review): ITT_ARCH_IA64 (and ITT_ARCH on unmatched targets) is undefined here, i.e. 0 in #if -- confirm */
+#ifdef __INTEL_COMPILER
+#define __TBB_machine_fetchadd4(addr, val) __fetchadd4_acq((void *)addr, val)
+#else  /* __INTEL_COMPILER */
+/* TODO: Add Support for not Intel compilers for IA-64 architecture (no __TBB_machine_fetchadd4 is defined on this path) */
+#endif /* __INTEL_COMPILER */
+#elif ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_IA32E /* ITT_ARCH!=ITT_ARCH_IA64 */
+ITT_INLINE long
+__TBB_machine_fetchadd4(volatile void* ptr, long addend) ITT_INLINE_ATTRIBUTE;
+ITT_INLINE long __TBB_machine_fetchadd4(volatile void* ptr, long addend)
+{
+    long result; /* receives the value *ptr held before the addition */
+    __asm__ __volatile__("lock\nxadd %0,%1" /* locked exchange-and-add: memory += register, register = old memory */
+                          : "=r"(result),"=m"(*(volatile int*)ptr)
+                          : "0"(addend), "m"(*(volatile int*)ptr) /* "0": addend starts in the same register as result */
+                          : "memory");
+    return result;
+}
+#elif ITT_ARCH==ITT_ARCH_ARM || ITT_ARCH==ITT_ARCH_PPC64 || ITT_ARCH==ITT_ARCH_AARCH64 || ITT_ARCH==ITT_ARCH_MIPS ||  ITT_ARCH==ITT_ARCH_MIPS64
+#define __TBB_machine_fetchadd4(addr, val) __sync_fetch_and_add(addr, val) /* compiler builtin; also returns the old value */
+#endif /* ITT_ARCH==ITT_ARCH_IA64 */
+#ifndef ITT_SIMPLE_INIT /* helper is compiled only when ITT_SIMPLE_INIT is not defined */
+ITT_INLINE long __itt_interlocked_increment(volatile long* ptr) ITT_INLINE_ATTRIBUTE;
+ITT_INLINE long __itt_interlocked_increment(volatile long* ptr)
+{   /* Atomically adds one to *ptr and returns the value after the increment. */
+    const long before = __TBB_machine_fetchadd4(ptr, 1); /* fetch-add hands back the old value */
+    return before + 1L;
+}
+#endif /* ITT_SIMPLE_INIT */
+
+void* dlopen(const char*, int) __attribute__((weak)); /* weak: the symbol may be absent at link time, its address is then null */
+void* dlsym(void*, const char*) __attribute__((weak));
+int dlclose(void*) __attribute__((weak));
+#define DL_SYMBOLS (dlopen && dlsym && dlclose) /* true only when all three dl* functions are actually available */
+
+int pthread_mutex_init(pthread_mutex_t*, const pthread_mutexattr_t*) __attribute__((weak)); /* same weak-probe scheme for pthreads */
+int pthread_mutex_lock(pthread_mutex_t*) __attribute__((weak));
+int pthread_mutex_unlock(pthread_mutex_t*) __attribute__((weak));
+int pthread_mutex_destroy(pthread_mutex_t*) __attribute__((weak));
+int pthread_mutexattr_init(pthread_mutexattr_t*) __attribute__((weak));
+int pthread_mutexattr_settype(pthread_mutexattr_t*, int) __attribute__((weak));
+int pthread_mutexattr_destroy(pthread_mutexattr_t*) __attribute__((weak));
+pthread_t pthread_self(void) __attribute__((weak));
+#define PTHREAD_SYMBOLS (pthread_mutex_init && pthread_mutex_lock && pthread_mutex_unlock && pthread_mutex_destroy && pthread_mutexattr_init && pthread_mutexattr_settype && pthread_mutexattr_destroy && pthread_self) /* true only when every pthread entry point used here is available */
+
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+typedef enum { /* value type of __itt_global::state */
+    __itt_collection_normal = 0,
+    __itt_collection_paused = 1
+} __itt_collection_state;
+
+typedef enum { /* value type of __itt_thread_info::state */
+    __itt_thread_normal  = 0,
+    __itt_thread_ignored = 1
+} __itt_thread_state;
+
+#pragma pack(push, 8)
+
+typedef struct ___itt_thread_info /* node of the per-global thread list (see __itt_global::thread_list) */
+{
+    const char* nameA; /*!< Copy of original name in ASCII. */
+#if defined(UNICODE) || defined(_UNICODE)
+    const wchar_t* nameW; /*!< Copy of original name in UNICODE. */
+#else  /* UNICODE || _UNICODE */
+    void* nameW; /*!< Same slot kept as an untyped pointer in non-UNICODE builds. */
+#endif /* UNICODE || _UNICODE */
+    TIDT               tid;     /*!< Thread id (DWORD on Windows, pthread_t elsewhere) */
+    __itt_thread_state state;   /*!< Thread state (__itt_thread_normal or __itt_thread_ignored) */
+    int                extra1;  /*!< Reserved to the runtime */
+    void*              extra2;  /*!< Reserved to the runtime */
+    struct ___itt_thread_info* next; /*!< Next node; NULL on the last one */
+} __itt_thread_info;
+
+#include "ittnotify_types.h" /* For __itt_group_id definition */
+
+typedef struct ___itt_api_info_20101001 /* same as __itt_api_info minus null_func; presumably an older layout kept for compatibility */
+{
+    const char*    name;      /* presumably the API entry-point name -- TODO confirm */
+    void**         func_ptr;  /* address of the pointer variable for this entry */
+    void*          init_func;
+    __itt_group_id group;     /* type comes from ittnotify_types.h */
+}  __itt_api_info_20101001;
+
+typedef struct ___itt_api_info /* element type of __itt_global::api_list_ptr */
+{
+    const char*    name;      /* presumably the API entry-point name -- TODO confirm */
+    void**         func_ptr;  /* address of the pointer variable for this entry */
+    void*          init_func;
+    void*          null_func; /* field absent from the _20101001 layout above */
+    __itt_group_id group;     /* type comes from ittnotify_types.h */
+}  __itt_api_info;
+
+typedef struct __itt_counter_info /* node of the counter list (see __itt_global::counter_list) */
+{
+    const char* nameA;  /*!< Copy of original name in ASCII. */
+#if defined(UNICODE) || defined(_UNICODE)
+    const wchar_t* nameW; /*!< Copy of original name in UNICODE. */
+#else  /* UNICODE || _UNICODE */
+    void* nameW; /*!< Same slot kept as an untyped pointer in non-UNICODE builds. */
+#endif /* UNICODE || _UNICODE */
+    const char* domainA;  /*!< Copy of original domain in ASCII. */
+#if defined(UNICODE) || defined(_UNICODE)
+    const wchar_t* domainW; /*!< Copy of original domain in UNICODE. */
+#else  /* UNICODE || _UNICODE */
+    void* domainW; /*!< Same slot kept as an untyped pointer in non-UNICODE builds. */
+#endif /* UNICODE || _UNICODE */
+    int type;     /*!< Counter type as passed to NEW_COUNTER_A / NEW_COUNTER_W */
+    long index;   /*!< Set to 0 on creation */
+    int   extra1; /*!< Reserved to the runtime */
+    void* extra2; /*!< Reserved to the runtime */
+    struct __itt_counter_info* next; /*!< Next node; NULL on the last one */
+}  __itt_counter_info_t;
+
+struct ___itt_domain;
+struct ___itt_string_handle;
+
+typedef struct ___itt_global /* per-library global state; instances chain through 'next' */
+{
+    unsigned char          magic[8];          /* 8 bytes, the size of ITT_MAGIC */
+    unsigned long          version_major;
+    unsigned long          version_minor;
+    unsigned long          version_build;
+    volatile long          api_initialized;   /* presumably a boolean flag -- TODO confirm */
+    volatile long          mutex_initialized; /* presumably set once 'mutex' has been initialised -- TODO confirm */
+    volatile long          atomic_counter;    /* volatile long: the type __itt_interlocked_increment operates on */
+    mutex_t                mutex;
+    lib_t                  lib;               /* handle type returned by __itt_load_lib */
+    void*                  error_handler;
+    const char**           dll_path_ptr;
+    __itt_api_info*        api_list_ptr;
+    struct ___itt_global*  next;
+    /* Joinable structures below */
+    __itt_thread_info*     thread_list;       /* head set by NEW_THREAD_INFO_A / NEW_THREAD_INFO_W */
+    struct ___itt_domain*  domain_list;       /* head set by NEW_DOMAIN_A / NEW_DOMAIN_W */
+    struct ___itt_string_handle* string_list; /* head set by NEW_STRING_HANDLE_A / NEW_STRING_HANDLE_W */
+    __itt_collection_state state;
+    __itt_counter_info_t* counter_list;       /* head set by NEW_COUNTER_A / NEW_COUNTER_W */
+} __itt_global;
+
+#pragma pack(pop)
+
+#define NEW_THREAD_INFO_W(gptr,h,h_tail,t,s,n) { /* allocate a thread node with wide name n and append it after h_tail */ \
+    h = (__itt_thread_info*)malloc(sizeof(__itt_thread_info)); \
+    if (h != NULL) { /* on allocation failure h stays NULL and the list is untouched */ \
+        h->tid    = t; \
+        h->nameA  = NULL; \
+        h->nameW  = n ? _wcsdup(n) : NULL; /* private copy; the duplicate itself is not checked for NULL */ \
+        h->state  = s; \
+        h->extra1 = 0;    /* reserved */ \
+        h->extra2 = NULL; /* reserved */ \
+        h->next   = NULL; \
+        if (h_tail == NULL) /* empty list: the new node becomes the head */ \
+            (gptr)->thread_list = h; \
+        else \
+            h_tail->next = h; \
+    } \
+}
+
+#define NEW_THREAD_INFO_A(gptr,h,h_tail,t,s,n) { /* allocate a thread node with char name n and append it after h_tail */ \
+    h = (__itt_thread_info*)malloc(sizeof(__itt_thread_info)); \
+    if (h != NULL) { /* on allocation failure h stays NULL and the list is untouched */ \
+        h->tid    = t; \
+        h->nameA  = n ? __itt_fstrdup(n) : NULL; /* private copy; the duplicate itself is not checked for NULL */ \
+        h->nameW  = NULL; \
+        h->state  = s; \
+        h->extra1 = 0;    /* reserved */ \
+        h->extra2 = NULL; /* reserved */ \
+        h->next   = NULL; \
+        if (h_tail == NULL) /* empty list: the new node becomes the head */ \
+            (gptr)->thread_list = h; \
+        else \
+            h_tail->next = h; \
+    } \
+}
+
+#define NEW_DOMAIN_W(gptr,h,h_tail,name) { /* allocate a domain node with a wide name and append it after h_tail */ \
+    h = (__itt_domain*)malloc(sizeof(__itt_domain)); \
+    if (h != NULL) { /* on allocation failure h stays NULL and the list is untouched */ \
+        h->flags  = 1;    /* domain is enabled by default */ \
+        h->nameA  = NULL; \
+        h->nameW  = name ? _wcsdup(name) : NULL; /* private copy; the duplicate itself is not checked for NULL */ \
+        h->extra1 = 0;    /* reserved */ \
+        h->extra2 = NULL; /* reserved */ \
+        h->next   = NULL; \
+        if (h_tail == NULL) /* empty list: the new node becomes the head */ \
+            (gptr)->domain_list = h; \
+        else \
+            h_tail->next = h; \
+    } \
+}
+
+#define NEW_DOMAIN_A(gptr,h,h_tail,name) { /* allocate a domain node with a char name and append it after h_tail */ \
+    h = (__itt_domain*)malloc(sizeof(__itt_domain)); \
+    if (h != NULL) { /* on allocation failure h stays NULL and the list is untouched */ \
+        h->flags  = 1;    /* domain is enabled by default */ \
+        h->nameA  = name ? __itt_fstrdup(name) : NULL; /* private copy; the duplicate itself is not checked for NULL */ \
+        h->nameW  = NULL; \
+        h->extra1 = 0;    /* reserved */ \
+        h->extra2 = NULL; /* reserved */ \
+        h->next   = NULL; \
+        if (h_tail == NULL) /* empty list: the new node becomes the head */ \
+            (gptr)->domain_list = h; \
+        else \
+            h_tail->next = h; \
+    } \
+}
+
+#define NEW_STRING_HANDLE_W(gptr,h,h_tail,name) { /* allocate a string-handle node with a wide string and append it after h_tail */ \
+    h = (__itt_string_handle*)malloc(sizeof(__itt_string_handle)); \
+    if (h != NULL) { /* on allocation failure h stays NULL and the list is untouched */ \
+        h->strA   = NULL; \
+        h->strW   = name ? _wcsdup(name) : NULL; /* private copy; the duplicate itself is not checked for NULL */ \
+        h->extra1 = 0;    /* reserved */ \
+        h->extra2 = NULL; /* reserved */ \
+        h->next   = NULL; \
+        if (h_tail == NULL) /* empty list: the new node becomes the head */ \
+            (gptr)->string_list = h; \
+        else \
+            h_tail->next = h; \
+    } \
+}
+
+#define NEW_STRING_HANDLE_A(gptr,h,h_tail,name) { /* allocate a string-handle node with a char string and append it after h_tail */ \
+    h = (__itt_string_handle*)malloc(sizeof(__itt_string_handle)); \
+    if (h != NULL) { /* on allocation failure h stays NULL and the list is untouched */ \
+        h->strA   = name ? __itt_fstrdup(name) : NULL; /* private copy; the duplicate itself is not checked for NULL */ \
+        h->strW   = NULL; \
+        h->extra1 = 0;    /* reserved */ \
+        h->extra2 = NULL; /* reserved */ \
+        h->next   = NULL; \
+        if (h_tail == NULL) /* empty list: the new node becomes the head */ \
+            (gptr)->string_list = h; \
+        else \
+            h_tail->next = h; \
+    } \
+}
+
+#define NEW_COUNTER_W(gptr,h,h_tail,name,domain,ctype) { /* allocate a counter node with wide name/domain and append it after h_tail */ \
+    h = (__itt_counter_info_t*)malloc(sizeof(__itt_counter_info_t)); \
+    if (h != NULL) { /* on allocation failure h stays NULL and the list is untouched */ \
+        h->nameA   = NULL; \
+        h->nameW   = (name) ? _wcsdup(name) : NULL; \
+        h->domainA = NULL; \
+        h->domainW = (domain) ? _wcsdup(domain) : NULL; /* guard on domain itself: never _wcsdup(NULL), never drop a valid domain */ \
+        h->type    = (ctype); /* parameter is not spelled 'type', so the member name h->type is not macro-substituted */ \
+        h->index   = 0; h->extra1 = 0; h->extra2 = NULL; /* reserved fields zeroed, as in the other NEW_* macros */ \
+        h->next    = NULL; \
+        if (h_tail == NULL) /* empty list: the new node becomes the head */ \
+            (gptr)->counter_list = h; \
+        else \
+            h_tail->next = h; \
+    } \
+}
+
+#define NEW_COUNTER_A(gptr,h,h_tail,name,domain,ctype) { /* allocate a counter node with char name/domain and append it after h_tail */ \
+    h = (__itt_counter_info_t*)malloc(sizeof(__itt_counter_info_t)); \
+    if (h != NULL) { /* on allocation failure h stays NULL and the list is untouched */ \
+        h->nameA   = (name) ? __itt_fstrdup(name) : NULL; \
+        h->nameW   = NULL; \
+        h->domainA = (domain) ? __itt_fstrdup(domain) : NULL; \
+        h->domainW = NULL; \
+        h->type    = (ctype); /* parameter is not spelled 'type', so the member name h->type is not macro-substituted */ \
+        h->index   = 0; h->extra1 = 0; h->extra2 = NULL; /* reserved fields zeroed, as in the other NEW_* macros */ \
+        h->next    = NULL; \
+        if (h_tail == NULL) /* empty list: the new node becomes the head */ \
+            (gptr)->counter_list = h; \
+        else \
+            h_tail->next = h; \
+    } \
+}
+
+#endif /* _ITTNOTIFY_CONFIG_H_ */
diff --git a/final/runtime/src/thirdparty/ittnotify/ittnotify_static.c b/final/runtime/src/thirdparty/ittnotify/ittnotify_static.c
new file mode 100644
index 0000000..a2a73ad
--- /dev/null
+++ b/final/runtime/src/thirdparty/ittnotify/ittnotify_static.c
@@ -0,0 +1,1201 @@
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp_config.h"
+#include "ittnotify_config.h"
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#if defined(__MINGW32__)
+#include <limits.h>
+#else
+#define PATH_MAX 512
+#endif
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+#include <limits.h>
+#include <dlfcn.h>
+#include <errno.h>
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+
+#define INTEL_NO_MACRO_BODY
+#define INTEL_ITTNOTIFY_API_PRIVATE
+#include "ittnotify.h"
+#include "legacy/ittnotify.h"
+
+#if KMP_MSVC_COMPAT
+#include "disable_warnings.h"
+#endif
+
+static const char api_version[] = API_VERSION "\0\n@(#) $Revision: 481659 $\n"; /* version string; referenced from group_alias[] so it is not eliminated */
+
+#define _N_(n) ITT_JOIN(INTEL_ITTNOTIFY_PREFIX,n) /* joins INTEL_ITTNOTIFY_PREFIX with n */
+
+#if ITT_OS==ITT_OS_WIN /* per-OS file name of the ittnotify shared library */
+static const char* ittnotify_lib_name = "libittnotify.dll";
+#elif ITT_OS==ITT_OS_LINUX || ITT_OS==ITT_OS_FREEBSD
+static const char* ittnotify_lib_name = "libittnotify.so";
+#elif ITT_OS==ITT_OS_MAC
+static const char* ittnotify_lib_name = "libittnotify.dylib";
+#else /* any other OS is rejected at compile time */
+#error Unsupported or unknown OS.
+#endif
+
+#ifdef __ANDROID__
+#include <android/log.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <linux/limits.h>
+
+#ifdef ITT_ANDROID_LOG /* logging enabled: forward messages to __android_log_print under ITT_ANDROID_LOG_TAG */
+    #define ITT_ANDROID_LOG_TAG   "INTEL_VTUNE_USERAPI"
+    #define ITT_ANDROID_LOGI(...) ((void)__android_log_print(ANDROID_LOG_INFO, ITT_ANDROID_LOG_TAG, __VA_ARGS__))
+    #define ITT_ANDROID_LOGW(...) ((void)__android_log_print(ANDROID_LOG_WARN, ITT_ANDROID_LOG_TAG, __VA_ARGS__))
+    #define ITT_ANDROID_LOGE(...) ((void)__android_log_print(ANDROID_LOG_ERROR,ITT_ANDROID_LOG_TAG, __VA_ARGS__))
+    #define ITT_ANDROID_LOGD(...) ((void)__android_log_print(ANDROID_LOG_DEBUG,ITT_ANDROID_LOG_TAG, __VA_ARGS__))
+#else /* logging disabled: the four macros expand to nothing */
+    #define ITT_ANDROID_LOGI(...)
+    #define ITT_ANDROID_LOGW(...)
+    #define ITT_ANDROID_LOGE(...)
+    #define ITT_ANDROID_LOGD(...)
+#endif /* ITT_ANDROID_LOG */
+
+/* Default location of the userapi collector library on Android; x is stringized into the "lib32"/"lib64" directory name. */
+#define ANDROID_ITTNOTIFY_DEFAULT_PATH_MASK(x)  "/data/data/com.intel.vtune/perfrun/lib" \
+                                                #x "/runtime/libittnotify.so"
+
+#if ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_ARM /* these architectures use the lib32 path */
+#define ANDROID_ITTNOTIFY_DEFAULT_PATH  ANDROID_ITTNOTIFY_DEFAULT_PATH_MASK(32)
+#else /* every other architecture uses the lib64 path */
+#define ANDROID_ITTNOTIFY_DEFAULT_PATH  ANDROID_ITTNOTIFY_DEFAULT_PATH_MASK(64)
+#endif
+
+#endif
+
+#ifndef PATH_MAX /* fallback when neither the branch above nor a system header defined it */
+#define PATH_MAX 4096
+#endif
+
+
+#ifndef LIB_VAR_NAME /* name of the environment variable read by __itt_get_lib_name(); may be pre-defined by the build */
+#if ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_ARM || ITT_ARCH==ITT_ARCH_MIPS /* these architectures use the ...32 variable */
+#define LIB_VAR_NAME INTEL_LIBITTNOTIFY32
+#else /* every other architecture uses the ...64 variable */
+#define LIB_VAR_NAME INTEL_LIBITTNOTIFY64
+#endif
+#endif /* LIB_VAR_NAME */
+
+#define ITT_MUTEX_INIT_AND_LOCK(p) { /* initialize p.mutex on first use, then lock it */ \
+    if (PTHREAD_SYMBOLS)             /* when false, nothing is initialized or locked */ \
+    {                                                                \
+        if (!p.mutex_initialized)                                    \
+        {                                                            \
+            if (__itt_interlocked_increment(&p.atomic_counter) == 1) /* the caller that sees the counter reach 1 performs the init */ \
+            {                                                        \
+                __itt_mutex_init(&p.mutex);                          \
+                p.mutex_initialized = 1;                             \
+            }                                                        \
+            else                                                     \
+                while (!p.mutex_initialized) /* every other caller yields until the flag is set */ \
+                    __itt_thread_yield();                            \
+        }                                                            \
+        __itt_mutex_lock(&p.mutex);                                  \
+    }                                                                \
+}
+
+typedef int (__itt_init_ittlib_t)(const char*, __itt_group_id); /* function type of the library initialization routine */
+
+/* The name used to call the initialization function can be overridden by pre-defining __itt_init_ittlib_name; by default it is a static pointer to _N_(init_ittlib). */
+#ifndef __itt_init_ittlib_name
+ITT_EXTERN_C int _N_(init_ittlib)(const char*, __itt_group_id);
+static __itt_init_ittlib_t* __itt_init_ittlib_ptr = _N_(init_ittlib);
+#define __itt_init_ittlib_name __itt_init_ittlib_ptr
+#endif /* __itt_init_ittlib_name */
+
+typedef void (__itt_fini_ittlib_t)(void); /* function type of the library finalization routine */
+
+/* The name used to call the finalization function can be overridden by pre-defining __itt_fini_ittlib_name; by default it is a static pointer to _N_(fini_ittlib). */
+#ifndef __itt_fini_ittlib_name
+ITT_EXTERN_C void _N_(fini_ittlib)(void);
+static __itt_fini_ittlib_t* __itt_fini_ittlib_ptr = _N_(fini_ittlib);
+#define __itt_fini_ittlib_name __itt_fini_ittlib_ptr
+#endif /* __itt_fini_ittlib_name */
+
+/* Build the ITTNOTIFY_NAME(name) pointers to imported functions: each pointer starts out at a generated static _init stub that runs the library initializer and then re-dispatches through the pointer if it no longer targets the stub. */
+#undef ITT_STUBV
+#undef ITT_STUB
+#define ITT_STUB(api,type,name,args,params,ptr,group,format)   /* generator for value-returning functions */ \
+static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args; /* forward declaration of the stub */ \
+typedef type api ITT_JOIN(_N_(name),_t) args;                  \
+ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END \
+static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args \
+{                                                              \
+    __itt_init_ittlib_name(NULL, __itt_group_all);             \
+    if (ITTNOTIFY_NAME(name) && ITTNOTIFY_NAME(name) != ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init))) \
+        return ITTNOTIFY_NAME(name) params;                    \
+    else /* pointer is NULL or still targets this stub */      \
+        return (type)0;                                        \
+}
+
+#define ITT_STUBV(api,type,name,args,params,ptr,group,format)  /* generator for functions without a return value */ \
+static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args; /* forward declaration of the stub */ \
+typedef type api ITT_JOIN(_N_(name),_t) args;                  \
+ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END \
+static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args \
+{                                                              \
+    __itt_init_ittlib_name(NULL, __itt_group_all);             \
+    if (ITTNOTIFY_NAME(name) && ITTNOTIFY_NAME(name) != ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init))) \
+        ITTNOTIFY_NAME(name) params;                           \
+    else /* pointer is NULL or still targets this stub */      \
+        return;                                                \
+}
+
+#undef __ITT_INTERNAL_INIT /* first pass: expand the header with the body-generating ITT_STUB/ITT_STUBV defined above */
+#include "ittnotify_static.h"
+
+#undef ITT_STUB /* second pass: the macros below only declare the _init function, its type and the pointer */
+#undef ITT_STUBV
+#define ITT_STUB(api,type,name,args,params,ptr,group,format)   \
+static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\
+typedef type api ITT_JOIN(_N_(name),_t) args;                  \
+ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END
+
+#define ITT_STUBV(api,type,name,args,params,ptr,group,format)  \
+static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\
+typedef type api ITT_JOIN(_N_(name),_t) args;                  \
+ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END
+
+#define __ITT_INTERNAL_INIT /* NOTE(review): presumably selects the entries whose _init bodies are hand-written later in this file -- confirm in ittnotify_static.h */
+#include "ittnotify_static.h"
+#undef __ITT_INTERNAL_INIT
+
+ITT_GROUP_LIST(group_list); /* NOTE(review): presumably defines the group_list table -- the macro is declared outside this file */
+
+#pragma pack(push, 8)
+
+typedef struct ___itt_group_alias /* pairs an environment variable name with the API groups it stands for */
+{
+    const char*    env_var;
+    __itt_group_id groups;
+} __itt_group_alias;
+
+static __itt_group_alias group_alias[] = {
+    { "KMP_FOR_TPROFILE", (__itt_group_id)(__itt_group_control | __itt_group_thread | __itt_group_sync  | __itt_group_mark) },
+    { "KMP_FOR_TCHECK",   (__itt_group_id)(__itt_group_control | __itt_group_thread | __itt_group_sync  | __itt_group_fsync | __itt_group_mark | __itt_group_suppress) },
+    { NULL,               (__itt_group_none) }, /* NULL env_var entry placed ahead of the keep-alive entry below */
+    { api_version,        (__itt_group_none) } /* !!! Just to avoid unused code elimination !!! */
+};
+
+#pragma pack(pop)
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT
+#pragma warning(push)
+#pragma warning(disable: 4054) /* warning C4054: 'type cast' : from function pointer 'XXX' to data pointer 'void *' */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+static __itt_api_info api_list[] = { /* one entry per API function, generated by re-expanding ittnotify_static.h twice */
+/* Entries for functions with a static implementation: the fourth field also points at the _init function */
+#undef ITT_STUB
+#undef ITT_STUBV
+#define ITT_STUB(api,type,name,args,params,nameindll,group,format) { ITT_TO_STR(ITT_JOIN(__itt_,nameindll)), (void**)(void*)&ITTNOTIFY_NAME(name), (void*)(size_t)&ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)), (void*)(size_t)&ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)), (__itt_group_id)(group)},
+#define ITT_STUBV ITT_STUB
+#define __ITT_INTERNAL_INIT
+#include "ittnotify_static.h"
+#undef __ITT_INTERNAL_INIT
+/* Entries for functions without a static implementation: the fourth field is NULL */
+#undef ITT_STUB
+#undef ITT_STUBV
+#define ITT_STUB(api,type,name,args,params,nameindll,group,format) {ITT_TO_STR(ITT_JOIN(__itt_,nameindll)), (void**)(void*)&ITTNOTIFY_NAME(name), (void*)(size_t)&ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)), NULL, (__itt_group_id)(group)},
+#define ITT_STUBV ITT_STUB
+#include "ittnotify_static.h"
+    {NULL, NULL, NULL, NULL, __itt_group_none} /* all-NULL terminating entry */
+};
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT
+#pragma warning(pop)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+static const char dll_path[PATH_MAX] = { 0 }; /* zero-filled buffer whose address is published through dll_path_ptr below */
+
+/* Static descriptor that holds all notification API attributes. */
+__itt_global _N_(_ittapi_global) = {
+    ITT_MAGIC,                                     /* identification info */
+    ITT_MAJOR, ITT_MINOR, API_VERSION_BUILD,       /* version info */
+    0,                                             /* api_initialized */
+    0,                                             /* mutex_initialized */
+    0,                                             /* atomic_counter */
+    MUTEX_INITIALIZER,                             /* mutex */
+    NULL,                                          /* dynamic library handle */
+    NULL,                                          /* error_handler */
+    (const char**)&dll_path,                       /* dll_path_ptr */
+    (__itt_api_info*)&api_list,                    /* api_list_ptr */
+    NULL,                                          /* next __itt_global */
+    NULL,                                          /* thread_list */
+    NULL,                                          /* domain_list */
+    NULL,                                          /* string_list */
+    __itt_collection_normal,                       /* collection state */
+    NULL                                          /* counter_list */
+};
+
+typedef void (__itt_api_init_t)(__itt_global*, __itt_group_id); /* NOTE(review): presumably the collector's API-init entry point type -- not used in the visible code */
+typedef void (__itt_api_fini_t)(__itt_global*); /* NOTE(review): presumably the collector's API-fini entry point type -- not used in the visible code */
+
+/* ========================================================================= */
+
+#ifdef ITT_NOTIFY_EXT_REPORT
+ITT_EXTERN_C void _N_(error_handler)(__itt_error_code, va_list args);
+#endif /* ITT_NOTIFY_EXT_REPORT */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT
+#pragma warning(push)
+#pragma warning(disable: 4055) /* warning C4055: 'type cast' : from data pointer 'void *' to function pointer 'XXX' */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+static void __itt_report_error(unsigned code_arg, ...)
+{
+    /* Hand an error code and its variadic details to the handler registered
+     * in the global descriptor (if any) and, when ITT_NOTIFY_EXT_REPORT is
+     * defined, to the external _N_(error_handler) as well. */
+    va_list ap;
+    va_start(ap, code_arg);
+
+    /* The code is received as plain unsigned and converted back to the
+     * enumeration here, because variadic functions are not compatible with
+     * default promotions of the last named argument. */
+    __itt_error_code err = (__itt_error_code)code_arg;
+
+    if (_N_(_ittapi_global).error_handler != NULL)
+        ((__itt_error_handler_t*)(size_t)_N_(_ittapi_global).error_handler)(err, ap);
+#ifdef ITT_NOTIFY_EXT_REPORT
+    _N_(error_handler)(err, ap); /* NOTE(review): ap may already have been consumed by the handler above -- confirm */
+#endif /* ITT_NOTIFY_EXT_REPORT */
+    va_end(ap);
+}
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT
+#pragma warning(pop)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+/* Wide-char domain_create stub (Windows). Once the API is initialized and ITTNOTIFY_NAME(domain_createW) no longer
+ * targets this stub, the call is forwarded; otherwise the domain whose nameW equals name is looked up in the global
+ * domain_list and created through NEW_DOMAIN_W when missing. Returns NULL when name is NULL. */
+static __itt_domain* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createW),_init))(const wchar_t* name)
+{
+    __itt_domain *h_tail = NULL, *h = NULL;
+
+    if (name == NULL) return NULL;
+
+    ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+    if (_N_(_ittapi_global).api_initialized)
+    {
+        if (ITTNOTIFY_NAME(domain_createW) && ITTNOTIFY_NAME(domain_createW) != ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createW),_init)))
+        {
+            /* The mutex is only initialized and locked when PTHREAD_SYMBOLS holds, so release it under the same condition. */
+            if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+            return ITTNOTIFY_NAME(domain_createW)(name);
+        }
+    }
+    /* Not forwarded: search the local list, keeping the last visited node in h_tail for appending. */
+    for (h_tail = NULL, h = _N_(_ittapi_global).domain_list; h != NULL; h_tail = h, h = h->next)
+        if (h->nameW != NULL && !wcscmp(h->nameW, name)) break;
+    if (h == NULL)
+    {
+        NEW_DOMAIN_W(&_N_(_ittapi_global),h,h_tail,name);
+    }
+    if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+    return h;
+}
+
+/* Narrow-string domain_create stub (domain_createA on Windows, domain_create elsewhere). Once the API is initialized
+ * and the ITTNOTIFY_NAME pointer no longer targets this stub, the call is forwarded; otherwise the domain whose nameA
+ * equals name is looked up in the global domain_list and created through NEW_DOMAIN_A when missing. NULL name -> NULL. */
+static __itt_domain* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createA),_init))(const char* name)
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+static __itt_domain* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(domain_create),_init))(const char* name)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+{
+    __itt_domain *h_tail = NULL, *h = NULL;
+
+    if (name == NULL) return NULL;
+
+    ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+    if (_N_(_ittapi_global).api_initialized)
+    {
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+        if (ITTNOTIFY_NAME(domain_createA) && ITTNOTIFY_NAME(domain_createA) != ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createA),_init)))
+        {
+            /* The mutex is only initialized and locked when PTHREAD_SYMBOLS holds, so release it under the same condition. */
+            if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+            return ITTNOTIFY_NAME(domain_createA)(name);
+        }
+#else
+        if (ITTNOTIFY_NAME(domain_create) && ITTNOTIFY_NAME(domain_create) != ITT_VERSIONIZE(ITT_JOIN(_N_(domain_create),_init)))
+        {
+            if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+            return ITTNOTIFY_NAME(domain_create)(name);
+        }
+#endif
+    }
+    /* Not forwarded: search the local list, keeping the last visited node in h_tail for appending. */
+    for (h_tail = NULL, h = _N_(_ittapi_global).domain_list; h != NULL; h_tail = h, h = h->next)
+        if (h->nameA != NULL && !__itt_fstrcmp(h->nameA, name)) break;
+    if (h == NULL)
+    {
+        NEW_DOMAIN_A(&_N_(_ittapi_global),h,h_tail,name);
+    }
+    if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+    return h;
+}
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+/* Wide-char string_handle_create stub (Windows). Once the API is initialized and ITTNOTIFY_NAME(string_handle_createW)
+ * no longer targets this stub, the call is forwarded; otherwise the handle whose strW equals name is looked up in the
+ * global string_list and created through NEW_STRING_HANDLE_W when missing. Returns NULL when name is NULL. */
+static __itt_string_handle* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createW),_init))(const wchar_t* name)
+{
+    __itt_string_handle *h_tail = NULL, *h = NULL;
+
+    if (name == NULL) return NULL;
+
+    ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+    if (_N_(_ittapi_global).api_initialized)
+    {
+        if (ITTNOTIFY_NAME(string_handle_createW) && ITTNOTIFY_NAME(string_handle_createW) != ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createW),_init)))
+        {
+            /* The mutex is only initialized and locked when PTHREAD_SYMBOLS holds, so release it under the same condition. */
+            if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+            return ITTNOTIFY_NAME(string_handle_createW)(name);
+        }
+    }
+    /* Not forwarded: search the local list, keeping the last visited node in h_tail for appending. */
+    for (h_tail = NULL, h = _N_(_ittapi_global).string_list; h != NULL; h_tail = h, h = h->next)
+        if (h->strW != NULL && !wcscmp(h->strW, name)) break;
+    if (h == NULL)
+    {
+        NEW_STRING_HANDLE_W(&_N_(_ittapi_global),h,h_tail,name);
+    }
+    if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+    return h;
+}
+
+/* Narrow-string string_handle_create stub (string_handle_createA on Windows, string_handle_create elsewhere). Once the
+ * API is initialized and the ITTNOTIFY_NAME pointer no longer targets this stub, the call is forwarded; otherwise the
+ * handle whose strA equals name is looked up in string_list and created (NEW_STRING_HANDLE_A) when missing. NULL name -> NULL. */
+static __itt_string_handle* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createA),_init))(const char* name)
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+static __itt_string_handle* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_create),_init))(const char* name)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+{
+    __itt_string_handle *h_tail = NULL, *h = NULL;
+
+    if (name == NULL) return NULL;
+
+    ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+    if (_N_(_ittapi_global).api_initialized)
+    {
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+        if (ITTNOTIFY_NAME(string_handle_createA) && ITTNOTIFY_NAME(string_handle_createA) != ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createA),_init)))
+        {
+            /* The mutex is only initialized and locked when PTHREAD_SYMBOLS holds, so release it under the same condition. */
+            if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+            return ITTNOTIFY_NAME(string_handle_createA)(name);
+        }
+#else
+        if (ITTNOTIFY_NAME(string_handle_create) && ITTNOTIFY_NAME(string_handle_create) != ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_create),_init)))
+        {
+            if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+            return ITTNOTIFY_NAME(string_handle_create)(name);
+        }
+#endif
+    }
+    /* Not forwarded: search the local list, keeping the last visited node in h_tail for appending. */
+    for (h_tail = NULL, h = _N_(_ittapi_global).string_list; h != NULL; h_tail = h, h = h->next)
+        if (h->strA != NULL && !__itt_fstrcmp(h->strA, name)) break;
+    if (h == NULL)
+    {
+        NEW_STRING_HANDLE_A(&_N_(_ittapi_global),h,h_tail,name);
+    }
+    if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+    return h;
+}
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+/* Wide-char counter_create stub (Windows); the counter type is fixed to __itt_metadata_u64. Forwards to the bound
+ * implementation once the API is initialized, otherwise finds or creates the matching node in the global counter_list.
+ * A match requires equal type and nameW and either both domains NULL or equal domainW. Returns NULL when name is NULL. */
+static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createW),_init))(const wchar_t *name, const wchar_t *domain)
+{
+    __itt_counter_info_t *h_tail = NULL, *h = NULL;
+    __itt_metadata_type type = __itt_metadata_u64;
+
+    if (name == NULL) return NULL;
+
+    ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+    if (_N_(_ittapi_global).api_initialized)
+    {
+        if (ITTNOTIFY_NAME(counter_createW) && ITTNOTIFY_NAME(counter_createW) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createW),_init)))
+        {
+            /* The mutex is only initialized and locked when PTHREAD_SYMBOLS holds, so release it under the same condition. */
+            if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+            return ITTNOTIFY_NAME(counter_createW)(name, domain);
+        }
+    }
+    for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next)
+    {
+        if (h->nameW != NULL  && h->type == type && !wcscmp(h->nameW, name) && ((h->domainW == NULL && domain == NULL) ||
+            (h->domainW != NULL && domain != NULL && !wcscmp(h->domainW, domain)))) break;
+    }
+    if (h == NULL)
+    {
+        NEW_COUNTER_W(&_N_(_ittapi_global),h,h_tail,name,domain,type);
+    }
+    if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+    return (__itt_counter)h;
+}
+
+/* Narrow-string counter_create stub (counter_createA on Windows, counter_create elsewhere); the counter type is fixed
+ * to __itt_metadata_u64. Forwards to the bound implementation once the API is initialized, otherwise finds or creates
+ * the node in counter_list that matches type, nameA and domainA (both domains NULL also match). NULL name -> NULL. */
+static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createA),_init))(const char *name, const char *domain)
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create),_init))(const char *name, const char *domain)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+{
+    __itt_counter_info_t *h_tail = NULL, *h = NULL;
+    __itt_metadata_type type = __itt_metadata_u64;
+
+    if (name == NULL) return NULL;
+
+    ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+    if (_N_(_ittapi_global).api_initialized)
+    {
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+        if (ITTNOTIFY_NAME(counter_createA) && ITTNOTIFY_NAME(counter_createA) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createA),_init)))
+        {
+            if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); /* guarded like the lock, which is only taken under PTHREAD_SYMBOLS */
+            return ITTNOTIFY_NAME(counter_createA)(name, domain);
+        }
+#else
+        if (ITTNOTIFY_NAME(counter_create) && ITTNOTIFY_NAME(counter_create) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create),_init)))
+        {
+            if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+            return ITTNOTIFY_NAME(counter_create)(name, domain);
+        }
+#endif
+    }
+    for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next)
+    {
+        if (h->nameA != NULL  && h->type == type && !__itt_fstrcmp(h->nameA, name) && ((h->domainA == NULL && domain == NULL) ||
+            (h->domainA != NULL && domain != NULL && !__itt_fstrcmp(h->domainA, domain)))) break;
+    }
+    if (h == NULL)
+    {
+        NEW_COUNTER_A(&_N_(_ittapi_global),h,h_tail,name,domain,type);
+    }
+    if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+    return (__itt_counter)h;
+}
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+/* Wide-char counter_create_typed stub (Windows). Forwards to the bound implementation once the API is initialized,
+ * otherwise finds or creates the matching node in the global counter_list. A match requires equal type and nameW and
+ * either both domains NULL or equal domainW. Returns NULL when name is NULL. */
+static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedW),_init))(const wchar_t *name, const wchar_t *domain, __itt_metadata_type type)
+{
+    __itt_counter_info_t *h_tail = NULL, *h = NULL;
+
+    if (name == NULL) return NULL;
+
+    ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+    if (_N_(_ittapi_global).api_initialized)
+    {
+        if (ITTNOTIFY_NAME(counter_create_typedW) && ITTNOTIFY_NAME(counter_create_typedW) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedW),_init)))
+        {
+            /* The mutex is only initialized and locked when PTHREAD_SYMBOLS holds, so release it under the same condition. */
+            if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+            return ITTNOTIFY_NAME(counter_create_typedW)(name, domain, type);
+        }
+    }
+    for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next)
+    {
+        if (h->nameW != NULL  && h->type == type && !wcscmp(h->nameW, name) && ((h->domainW == NULL && domain == NULL) ||
+            (h->domainW != NULL && domain != NULL && !wcscmp(h->domainW, domain)))) break;
+    }
+    if (h == NULL)
+    {
+        NEW_COUNTER_W(&_N_(_ittapi_global),h,h_tail,name,domain,type);
+    }
+    if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+    return (__itt_counter)h;
+}
+
+/* Narrow-string counter_create_typed stub (counter_create_typedA on Windows, counter_create_typed elsewhere). Forwards
+ * to the bound implementation once the API is initialized, otherwise finds or creates the node in counter_list that
+ * matches type, nameA and domainA (both domains NULL also match). Returns NULL when name is NULL. */
+static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedA),_init))(const char *name, const char *domain, __itt_metadata_type type)
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typed),_init))(const char *name, const char *domain, __itt_metadata_type type)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+{
+    __itt_counter_info_t *h_tail = NULL, *h = NULL;
+
+    if (name == NULL) return NULL;
+
+    ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+    if (_N_(_ittapi_global).api_initialized)
+    {
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+        if (ITTNOTIFY_NAME(counter_create_typedA) && ITTNOTIFY_NAME(counter_create_typedA) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedA),_init)))
+        {
+            if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); /* guarded like the lock, which is only taken under PTHREAD_SYMBOLS */
+            return ITTNOTIFY_NAME(counter_create_typedA)(name, domain, type);
+        }
+#else
+        if (ITTNOTIFY_NAME(counter_create_typed) && ITTNOTIFY_NAME(counter_create_typed) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typed),_init)))
+        {
+            if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+            return ITTNOTIFY_NAME(counter_create_typed)(name, domain, type);
+        }
+#endif
+    }
+    for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next)
+    {
+        if (h->nameA != NULL  && h->type == type && !__itt_fstrcmp(h->nameA, name) && ((h->domainA == NULL && domain == NULL) ||
+            (h->domainA != NULL && domain != NULL && !__itt_fstrcmp(h->domainA, domain)))) break;
+    }
+    if (h == NULL)
+    {
+        NEW_COUNTER_A(&_N_(_ittapi_global),h,h_tail,name,domain,type);
+    }
+    if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+    return (__itt_counter)h;
+}
+
+/* -------------------------------------------------------------------------- */
+
+/* pause stub: initializes the library on first use, then forwards to the
+ * bound pause() if there is one; otherwise records the paused state locally. */
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(pause),_init))(void)
+{
+    if (!(_N_(_ittapi_global).api_initialized || _N_(_ittapi_global).thread_list != NULL))
+        __itt_init_ittlib_name(NULL, __itt_group_all);
+
+    if (!ITTNOTIFY_NAME(pause) || ITTNOTIFY_NAME(pause) == ITT_VERSIONIZE(ITT_JOIN(_N_(pause),_init)))
+    {
+        /* Pointer is NULL or still targets this stub: keep the state here. */
+        _N_(_ittapi_global).state = __itt_collection_paused;
+        return;
+    }
+    ITTNOTIFY_NAME(pause)();
+}
+
+/* resume stub: initializes the library on first use, then forwards to the
+ * bound resume() if there is one; otherwise records the normal state locally. */
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(resume),_init))(void)
+{
+    if (!(_N_(_ittapi_global).api_initialized || _N_(_ittapi_global).thread_list != NULL))
+        __itt_init_ittlib_name(NULL, __itt_group_all);
+
+    if (!ITTNOTIFY_NAME(resume) || ITTNOTIFY_NAME(resume) == ITT_VERSIONIZE(ITT_JOIN(_N_(resume),_init)))
+    {
+        /* Pointer is NULL or still targets this stub: keep the state here. */
+        _N_(_ittapi_global).state = __itt_collection_normal;
+        return;
+    }
+    ITTNOTIFY_NAME(resume)();
+}
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+/* Wide-char thread_set_name stub (Windows): initializes the library on first use and
+ * forwards the name only when a bound implementation exists; otherwise does nothing. */
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW),_init))(const wchar_t* name)
+{
+    if (!(_N_(_ittapi_global).api_initialized || _N_(_ittapi_global).thread_list != NULL))
+        __itt_init_ittlib_name(NULL, __itt_group_all);
+
+    if (!ITTNOTIFY_NAME(thread_set_nameW) || ITTNOTIFY_NAME(thread_set_nameW) == ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW),_init)))
+        return;
+    ITTNOTIFY_NAME(thread_set_nameW)(name);
+}
+
+static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_setW),_init))(const wchar_t* name, int namelen)
+{
+    /* Delegates to the thread_set_nameW stub; namelen is ignored and the result is always 0. */
+    ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW),_init))(name);
+    return (void)namelen, 0;
+}
+
+/* Narrow-string thread_set_name stub (thread_set_nameA on Windows, thread_set_name elsewhere): initializes the
+ * library on first use and forwards the name only when a bound implementation exists; otherwise does nothing. */
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameA),_init))(const char* name)
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_name),_init))(const char* name)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+{
+    if (!(_N_(_ittapi_global).api_initialized || _N_(_ittapi_global).thread_list != NULL))
+    {
+        __itt_init_ittlib_name(NULL, __itt_group_all);
+    }
+
+    /* Forward only through a pointer that is non-NULL and does not target this stub itself. */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+    if (ITTNOTIFY_NAME(thread_set_nameA) && ITTNOTIFY_NAME(thread_set_nameA) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameA),_init)))
+        ITTNOTIFY_NAME(thread_set_nameA)(name);
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+    if (ITTNOTIFY_NAME(thread_set_name) && ITTNOTIFY_NAME(thread_set_name) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_name),_init)))
+        ITTNOTIFY_NAME(thread_set_name)(name);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+}
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_setA),_init))(const char* name, int namelen)
+{
+    /* Delegates to the thread_set_nameA stub; namelen is ignored and the result is always 0. */
+    ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameA),_init))(name);
+    return (void)namelen, 0;
+}
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_set),_init))(const char* name, int namelen)
+{
+    /* Delegates to the thread_set_name stub; namelen is ignored and the result is always 0. */
+    ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_name),_init))(name);
+    return (void)namelen, 0;
+}
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/* thread_ignore stub: initializes the library on first use and forwards the call
+ * only when a bound implementation exists; otherwise does nothing. */
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_ignore),_init))(void)
+{
+    if (!(_N_(_ittapi_global).api_initialized || _N_(_ittapi_global).thread_list != NULL))
+        __itt_init_ittlib_name(NULL, __itt_group_all);
+
+    if (!ITTNOTIFY_NAME(thread_ignore) || ITTNOTIFY_NAME(thread_ignore) == ITT_VERSIONIZE(ITT_JOIN(_N_(thread_ignore),_init)))
+        return;
+    ITTNOTIFY_NAME(thread_ignore)();
+}
+
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_ignore),_init))(void)
+{   /* thr_ignore simply delegates to the thread_ignore stub. */
+    ITT_VERSIONIZE(ITT_JOIN(_N_(thread_ignore),_init))();
+}
+
+static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(enable_attach),_init))(void)
+{
+#ifdef __ANDROID__
+    /* On Android, set the LIB_VAR_NAME environment variable to the default
+     * collector path. The last argument (overwrite = 0) keeps a value that
+     * was already set. Without __ANDROID__ this function body is empty.
+    */
+    setenv(ITT_TO_STR(LIB_VAR_NAME), ANDROID_ITTNOTIFY_DEFAULT_PATH, 0);
+#endif
+}
+
+/* -------------------------------------------------------------------------- */
+
+/* Extract the next token from s; tokens are separated by runs of characters
+ * taken from sep.
+ *
+ *   s    NUL-terminated string to scan
+ *   sep  NUL-terminated set of separator characters
+ *   out  receives a pointer (into s) to the first character of the token
+ *   len  receives the number of characters in the token
+ *
+ * Returns the position at which the next scan should resume: just past the
+ * separators that follow the token, which may be the terminating NUL.
+ * Returns NULL, leaving *out and *len untouched, when any argument is NULL
+ * or when s contains nothing but separators (including the empty string).
+ * With an empty sep the whole of s is returned as a single token.
+ * A single separator character between tokens is not required: any run of
+ * separator characters counts as one boundary.
+ *
+ * Example: s = ",,ab,c" with sep = "," yields *out -> "ab,c", *len = 2 and
+ * a return value pointing at "c".
+ *
+ * Typical use:
+ *     const char *tok; int n;
+ *     while ((s = __itt_fsplit(s, ",; ", &tok, &n)) != NULL)
+ *         ... tok[0 .. n-1] is the current token ...
+ *
+ * The scanning uses strspn()/strcspn() from <string.h> in place of three
+ * copies of the same hand-written character-class loop.
+ */
+static const char* __itt_fsplit(const char* s, const char* sep, const char** out, int* len)
+{
+    const char* token;     /* first character of the token */
+    const char* rest;      /* first character after the token */
+    size_t      token_len; /* number of characters in the token */
+
+    /* Reject missing arguments before touching any of them. */
+    if (!s || !sep || !out || !len)
+        return NULL;
+
+    /* Skip the separators in front of the token; reaching the end of the
+     * string here means there is no token left. */
+    token = s + strspn(s, sep);
+    if (*token == '\0')
+        return NULL;
+
+    /* The token runs up to the next separator or to the end of the string. */
+    token_len = strcspn(token, sep);
+    rest      = token + token_len;
+
+    /* Outputs are written only once a token has been found. */
+    *len = (int)token_len; /* the interface reports the length as int */
+    *out = token;
+
+    /* Skip the separators behind the token so that the returned pointer is
+     * either the start of the next token or the terminating NUL. */
+    return rest + strspn(rest, sep);
+}
+
+/* Return the value of environment variable `name`, copied into a static buffer; NULL if name is
+ * NULL, the variable is not set or cannot be read, or the value does not fit in the remaining space.
+ * !!! All calls share one static buffer: values are packed one after another and never reclaimed. !!!
+ * This was done to avoid dynamic allocation for a few calls; the function is needed only four times.
+ */
+static const char* __itt_get_env_var(const char* name)
+{
+#define MAX_ENV_VALUE_SIZE 4086
+    static char  env_buff[MAX_ENV_VALUE_SIZE]; /* shared storage for every value handed out */
+    static char* env_value = (char*)env_buff;  /* next free position inside env_buff */
+
+    if (name != NULL)
+    {
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+        size_t max_len = MAX_ENV_VALUE_SIZE - (size_t)(env_value - env_buff); /* space still free in env_buff */
+        DWORD rc = GetEnvironmentVariableA(name, env_value, (DWORD)max_len);
+        if (rc >= max_len) /* value does not fit in the remaining space */
+            __itt_report_error(__itt_error_env_too_long, name, (size_t)rc - 1, (size_t)(max_len - 1));
+        else if (rc > 0)
+        {
+            const char* ret = (const char*)env_value;
+            env_value += rc + 1; /* reserve the value plus its terminating NUL */
+            return ret;
+        }
+        else
+        {
+            /* If the environment variable is empty, GetEnvironmentVariableA()
+             * returns zero (the number of characters, not including the terminating
+             * null), and GetLastError() returns ERROR_SUCCESS. */
+            DWORD err = GetLastError();
+            if (err == ERROR_SUCCESS)
+                return env_value; /* empty value: the free position is returned without being advanced */
+
+            if (err != ERROR_ENVVAR_NOT_FOUND) /* a variable that is simply not set is not reported */
+                __itt_report_error(__itt_error_cant_read_env, name, (int)err);
+        }
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+        char* env = getenv(name);
+        if (env != NULL)
+        {
+            size_t len = __itt_fstrnlen(env, MAX_ENV_VALUE_SIZE);
+            size_t max_len = MAX_ENV_VALUE_SIZE - (size_t)(env_value - env_buff); /* space still free in env_buff */
+            if (len < max_len) /* room for the value and its terminating NUL */
+            {
+                const char* ret = (const char*)env_value;
+                __itt_fstrcpyn(env_value, max_len, env, len + 1);
+                env_value += len + 1; /* reserve the value plus its terminating NUL */
+                return ret;
+            } else
+                __itt_report_error(__itt_error_env_too_long, name, (size_t)len, (size_t)(max_len - 1));
+        }
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+    }
+    return NULL; /* NULL name, variable not set, read error, or value too long */
+}
+
+/*
+ * Returns the collector library name taken from the environment variable
+ * named ITT_TO_STR(LIB_VAR_NAME), or NULL when no name can be determined.
+ *
+ * On Android, when the variable is not set, the name is read from a marker
+ * file instead: first "/data/local/tmp/<marker>", then
+ * "/data/data/<contents of /proc/<pid>/cmdline>/<marker>". The text read
+ * from the marker file is exported through setenv() (without overwriting an
+ * existing value) and the variable is then re-read via __itt_get_env_var().
+ * Every failure on that path is logged and the current lib_name is returned.
+ */
+static const char* __itt_get_lib_name(void)
+{
+    const char* lib_name = __itt_get_env_var(ITT_TO_STR(LIB_VAR_NAME));
+
+#ifdef __ANDROID__
+    if (lib_name == NULL)
+    {
+
+#if ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_ARM
+        const char* const marker_filename = "com.intel.itt.collector_lib_32";
+#else
+        const char* const marker_filename = "com.intel.itt.collector_lib_64";
+#endif
+
+        char system_wide_marker_filename[PATH_MAX] = {0};
+        int itt_marker_file_fd = -1;
+        ssize_t res = 0;
+
+        res = snprintf(system_wide_marker_filename, PATH_MAX - 1, "%s%s", "/data/local/tmp/", marker_filename);
+        if (res < 0)
+        {
+            ITT_ANDROID_LOGE("Unable to concatenate marker file string.");
+            return lib_name;
+        }
+        itt_marker_file_fd = open(system_wide_marker_filename, O_RDONLY);
+
+        if (itt_marker_file_fd == -1)
+        {
+            /* No system-wide marker: fall back to the marker inside the
+             * application directory, named after /proc/<pid>/cmdline. */
+            const pid_t my_pid = getpid();
+            char cmdline_path[PATH_MAX] = {0};
+            char package_name[PATH_MAX] = {0};
+            char app_sandbox_file[PATH_MAX] = {0};
+            int cmdline_fd = 0;
+
+            ITT_ANDROID_LOGI("Unable to open system-wide marker file.");
+            res = snprintf(cmdline_path, PATH_MAX - 1, "/proc/%d/cmdline", my_pid);
+            if (res < 0)
+            {
+                ITT_ANDROID_LOGE("Unable to get cmdline path string.");
+                return lib_name;
+            }
+
+            ITT_ANDROID_LOGI("CMD file: %s\n", cmdline_path);
+            cmdline_fd = open(cmdline_path, O_RDONLY);
+            if (cmdline_fd == -1)
+            {
+                ITT_ANDROID_LOGE("Unable to open %s file!", cmdline_path);
+                return lib_name;
+            }
+            /* package_name is zero-filled and at most PATH_MAX - 1 bytes are
+             * read, so it is always NUL-terminated. */
+            res = read(cmdline_fd, package_name, PATH_MAX - 1);
+            if (res == -1)
+            {
+                ITT_ANDROID_LOGE("Unable to read %s file!", cmdline_path);
+                res = close(cmdline_fd);
+                if (res == -1)
+                {
+                    ITT_ANDROID_LOGE("Unable to close %s file!", cmdline_path);
+                }
+                return lib_name;
+            }
+            res = close(cmdline_fd);
+            if (res == -1)
+            {
+                ITT_ANDROID_LOGE("Unable to close %s file!", cmdline_path);
+                return lib_name;
+            }
+            ITT_ANDROID_LOGI("Package name: %s\n", package_name);
+            res = snprintf(app_sandbox_file, PATH_MAX - 1, "/data/data/%s/%s", package_name, marker_filename);
+            if (res < 0)
+            {
+                ITT_ANDROID_LOGE("Unable to concatenate marker file string.");
+                return lib_name;
+            }
+
+            ITT_ANDROID_LOGI("Lib marker file name: %s\n", app_sandbox_file);
+            itt_marker_file_fd = open(app_sandbox_file, O_RDONLY);
+            if (itt_marker_file_fd == -1)
+            {
+                ITT_ANDROID_LOGE("Unable to open app marker file!");
+                return lib_name;
+            }
+        }
+
+        {
+            char itt_lib_name[PATH_MAX] = {0};
+
+            res = read(itt_marker_file_fd, itt_lib_name, PATH_MAX - 1);
+            if (res == -1)
+            {
+                /* The descriptor is an int, so it is logged with %d. */
+                ITT_ANDROID_LOGE("Unable to read marker file (fd %d)!", itt_marker_file_fd);
+                res = close(itt_marker_file_fd);
+                if (res == -1)
+                {
+                    ITT_ANDROID_LOGE("Unable to close marker file (fd %d)!", itt_marker_file_fd);
+                }
+                return lib_name;
+            }
+            ITT_ANDROID_LOGI("ITT Lib path: %s", itt_lib_name);
+            res = close(itt_marker_file_fd);
+            if (res == -1)
+            {
+                ITT_ANDROID_LOGE("Unable to close marker file (fd %d)!", itt_marker_file_fd);
+                return lib_name;
+            }
+            ITT_ANDROID_LOGI("Set env %s to %s", ITT_TO_STR(LIB_VAR_NAME), itt_lib_name);
+            /* Third argument 0: an already existing value is not overwritten. */
+            res = setenv(ITT_TO_STR(LIB_VAR_NAME), itt_lib_name, 0);
+            if (res == -1)
+            {
+                ITT_ANDROID_LOGE("Unable to set env var!");
+                return lib_name;
+            }
+            lib_name = __itt_get_env_var(ITT_TO_STR(LIB_VAR_NAME));
+            ITT_ANDROID_LOGI("ITT Lib path from env: %s", lib_name);
+        }
+    }
+#endif
+
+    return lib_name;
+}
+
+/* Avoid clashes with std::min, reported by tbb team.
+ * The whole expansion is parenthesized so that it remains a single operand
+ * when the macro is used inside a larger expression. */
+#define __itt_min(a,b) ((a) < (b) ? (a) : (b))
+
+/*
+ * Computes the group mask requested through the environment.
+ *
+ * When INTEL_ITTNOTIFY_GROUPS is set, it is split on ',', ';' and ' ' and
+ * each token is matched against group_list by name; matching ids are OR-ed
+ * together, and the ids lying strictly between __itt_group_splitter_min and
+ * __itt_group_splitter_max (other than __itt_group_all) are always added.
+ * Otherwise the groups of the first group_alias entry whose environment
+ * variable is set are returned, or __itt_group_none when none is set.
+ */
+static __itt_group_id __itt_get_groups(void)
+{
+    int idx;
+    __itt_group_id enabled = __itt_group_none;
+    const char* remaining = __itt_get_env_var("INTEL_ITTNOTIFY_GROUPS");
+
+    if (remaining == NULL)
+    {
+        /* No explicit group list: the first alias variable that is set wins. */
+        for (idx = 0; group_alias[idx].env_var != NULL; idx++)
+        {
+            if (__itt_get_env_var(group_alias[idx].env_var) != NULL)
+                return group_alias[idx].groups;
+        }
+        return enabled;
+    }
+    else
+    {
+        int token_len;
+        char token[255];
+        const char* token_start;
+
+        while ((remaining = __itt_fsplit(remaining, ",; ", &token_start, &token_len)) != NULL)
+        {
+            /* Clamp the token so that it fits into the local buffer. */
+            int copy_len = __itt_min(token_len, (int)(sizeof(token) - 1));
+            __itt_fstrcpyn(token, sizeof(token) - 1, token_start, copy_len);
+            token[copy_len] = 0;
+
+            for (idx = 0; group_list[idx].name != NULL; idx++)
+            {
+                if (__itt_fstrcmp(token, group_list[idx].name) == 0)
+                {
+                    enabled = (__itt_group_id)(enabled | group_list[idx].id);
+                    break;
+                }
+            }
+        }
+        /* TODO: !!! Workaround for bug with warning for unknown group !!!
+         * Should be fixed in new initialization scheme.
+         * Now the following groups should be set always. */
+        for (idx = 0; group_list[idx].id != __itt_group_none; idx++)
+        {
+            if (group_list[idx].id != __itt_group_all &&
+                group_list[idx].id > __itt_group_splitter_min &&
+                group_list[idx].id < __itt_group_splitter_max)
+            {
+                enabled = (__itt_group_id)(enabled | group_list[idx].id);
+            }
+        }
+        return enabled;
+    }
+}
+
+#undef __itt_min
+
+/*
+ * Classifies a loaded collector library by the entry points it exports:
+ * 2 when "__itt_api_init" is found, 1 when only "__itt_api_version" is
+ * found, and 0 otherwise (including a NULL handle).
+ */
+static int __itt_lib_version(lib_t lib)
+{
+    int version = 0;
+
+    if (lib != NULL)
+    {
+        if (__itt_get_proc(lib, "__itt_api_init"))
+            version = 2;
+        else if (__itt_get_proc(lib, "__itt_api_version"))
+            version = 1;
+    }
+    return version;
+}
+
+/* It's not used right now! Comment it out to avoid warnings.
+static void __itt_reinit_all_pointers(void)
+{
+    int i;
+    // Fill all pointers with initial stubs
+    for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++)
+        *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].init_func;
+}
+*/
+
+/*
+ * Points every entry of the global API list at its null_func stub.
+ * The list is walked until the entry whose name is NULL.
+ */
+static void __itt_nullify_all_pointers(void)
+{
+    int idx = 0;
+    /* Original note: nullify all pointers except domain_create,
+     * string_handle_create and counter_create. Presumably those entries stay
+     * usable through their own null_func -- TODO confirm. */
+    while (_N_(_ittapi_global).api_list_ptr[idx].name != NULL)
+    {
+        *_N_(_ittapi_global).api_list_ptr[idx].func_ptr = _N_(_ittapi_global).api_list_ptr[idx].null_func;
+        idx++;
+    }
+}
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT
+#pragma warning(push)
+#pragma warning(disable: 4054) /* warning C4054: 'type cast' : from function pointer 'XXX' to data pointer 'void *' */
+#pragma warning(disable: 4055) /* warning C4055: 'type cast' : from data pointer 'void *' to function pointer 'XXX' */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT */
+
+/*
+ * Finalizes the ITT API. When the API is marked initialized, the
+ * "__itt_api_fini" entry point of the loaded library (if any) is called with
+ * the global state, every API pointer is reset to its null stub and the
+ * api_initialized flag is cleared. Does nothing when the API is not
+ * initialized.
+ */
+ITT_EXTERN_C void _N_(fini_ittlib)(void)
+{
+    __itt_api_fini_t* __itt_api_fini_ptr = NULL;
+    /* Non-zero only while a finalization is running (set below when
+     * PTHREAD_SYMBOLS is true); presumably a re-entry guard -- TODO confirm. */
+    static volatile TIDT current_thread = 0;
+
+    if (_N_(_ittapi_global).api_initialized)
+    {
+        ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+        /* The flag is tested again after ITT_MUTEX_INIT_AND_LOCK has run. */
+        if (_N_(_ittapi_global).api_initialized)
+        {
+            if (current_thread == 0)
+            {
+                if (PTHREAD_SYMBOLS) current_thread = __itt_thread_id();
+                if (_N_(_ittapi_global).lib != NULL)
+                {
+                    /* Optional entry point: the pointer stays NULL when the library lacks it. */
+                    __itt_api_fini_ptr = (__itt_api_fini_t*)(size_t)__itt_get_proc(_N_(_ittapi_global).lib, "__itt_api_fini");
+                }
+                if (__itt_api_fini_ptr)
+                {
+                    __itt_api_fini_ptr(&_N_(_ittapi_global));
+                }
+
+                __itt_nullify_all_pointers();
+
+ /* TODO: !!! not safe !!! don't support unload so far.
+  *             if (_N_(_ittapi_global).lib != NULL)
+  *                 __itt_unload_lib(_N_(_ittapi_global).lib);
+  *             _N_(_ittapi_global).lib = NULL;
+  */
+                /* The library handle is kept (see the TODO above); only the flag is cleared. */
+                _N_(_ittapi_global).api_initialized = 0;
+                current_thread = 0;
+            }
+        }
+        if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+    }
+}
+
+/*
+ * Initializes the ITT API once and reports whether any API of init_groups is
+ * active.
+ *
+ * lib_name    - collector library to load; when NULL the name is taken from
+ *               __itt_get_lib_name(), and when that is NULL too but groups
+ *               are requested, the default ittnotify_lib_name is loaded.
+ * init_groups - groups the caller wants initialized.
+ *
+ * Depending on __itt_lib_version() of the loaded library, the API pointers
+ * are either resolved one by one through __itt_get_proc() (versions 0 and 1)
+ * or handed to the library's "__itt_api_init" entry point (version 2). When
+ * nothing is loaded, every pointer is set to its null stub.
+ *
+ * Returns 1 when at least one API entry belonging to init_groups has a
+ * pointer different from its null_func, 0 otherwise.
+ */
+ITT_EXTERN_C int _N_(init_ittlib)(const char* lib_name, __itt_group_id init_groups)
+{
+    int i;
+    __itt_group_id groups;
+#ifdef ITT_COMPLETE_GROUP
+    __itt_group_id zero_group = __itt_group_none;
+#endif /* ITT_COMPLETE_GROUP */
+    static volatile TIDT current_thread = 0;
+
+    if (!_N_(_ittapi_global).api_initialized)
+    {
+#ifndef ITT_SIMPLE_INIT
+        ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
+#endif /* ITT_SIMPLE_INIT */
+
+        if (!_N_(_ittapi_global).api_initialized)
+        {
+            if (current_thread == 0)
+            {
+                if (PTHREAD_SYMBOLS) current_thread = __itt_thread_id();
+                if (lib_name == NULL)
+                {
+                    lib_name = __itt_get_lib_name();
+                }
+                groups = __itt_get_groups();
+                if (DL_SYMBOLS && (groups != __itt_group_none || lib_name != NULL))
+                {
+                    /* Resolve the effective name once: lib_name may still be
+                     * NULL here (groups requested, no explicit library), and
+                     * the diagnostics below must name the library that was
+                     * actually tried instead of receiving a NULL string. */
+                    const char* lib_to_load = (lib_name == NULL) ? ittnotify_lib_name : lib_name;
+
+                    _N_(_ittapi_global).lib = __itt_load_lib(lib_to_load);
+
+                    if (_N_(_ittapi_global).lib != NULL)
+                    {
+                        __itt_api_init_t* __itt_api_init_ptr;
+                        int lib_version = __itt_lib_version(_N_(_ittapi_global).lib);
+
+                        switch (lib_version) {
+                        case 0:
+                            groups = __itt_group_legacy;
+                            /* fall through: legacy libraries are resolved like version 1 */
+                        case 1:
+                            /* Fill all pointers from dynamic library */
+                            for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++)
+                            {
+                                if (_N_(_ittapi_global).api_list_ptr[i].group & groups & init_groups)
+                                {
+                                    *_N_(_ittapi_global).api_list_ptr[i].func_ptr = (void*)__itt_get_proc(_N_(_ittapi_global).lib, _N_(_ittapi_global).api_list_ptr[i].name);
+                                    if (*_N_(_ittapi_global).api_list_ptr[i].func_ptr == NULL)
+                                    {
+                                        /* Restore pointers for function with static implementation */
+                                        *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func;
+                                        __itt_report_error(__itt_error_no_symbol, lib_to_load, _N_(_ittapi_global).api_list_ptr[i].name);
+#ifdef ITT_COMPLETE_GROUP
+                                        zero_group = (__itt_group_id)(zero_group | _N_(_ittapi_global).api_list_ptr[i].group);
+#endif /* ITT_COMPLETE_GROUP */
+                                    }
+                                }
+                                else
+                                    *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func;
+                            }
+
+                            if (groups == __itt_group_legacy)
+                            {
+                                /* Compatibility with legacy tools */
+                                ITTNOTIFY_NAME(thread_ignore)  = ITTNOTIFY_NAME(thr_ignore);
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+                                ITTNOTIFY_NAME(sync_createA)   = ITTNOTIFY_NAME(sync_set_nameA);
+                                ITTNOTIFY_NAME(sync_createW)   = ITTNOTIFY_NAME(sync_set_nameW);
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+                                ITTNOTIFY_NAME(sync_create)    = ITTNOTIFY_NAME(sync_set_name);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+                                ITTNOTIFY_NAME(sync_prepare)   = ITTNOTIFY_NAME(notify_sync_prepare);
+                                ITTNOTIFY_NAME(sync_cancel)    = ITTNOTIFY_NAME(notify_sync_cancel);
+                                ITTNOTIFY_NAME(sync_acquired)  = ITTNOTIFY_NAME(notify_sync_acquired);
+                                ITTNOTIFY_NAME(sync_releasing) = ITTNOTIFY_NAME(notify_sync_releasing);
+                            }
+
+#ifdef ITT_COMPLETE_GROUP
+                            /* A group with any unresolved symbol is disabled as a whole. */
+                            for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++)
+                                if (_N_(_ittapi_global).api_list_ptr[i].group & zero_group)
+                                    *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func;
+#endif /* ITT_COMPLETE_GROUP */
+                            break;
+                        case 2:
+                            __itt_api_init_ptr = (__itt_api_init_t*)(size_t)__itt_get_proc(_N_(_ittapi_global).lib, "__itt_api_init");
+                            if (__itt_api_init_ptr)
+                                __itt_api_init_ptr(&_N_(_ittapi_global), init_groups);
+                            break;
+                        }
+                    }
+                    else
+                    {
+                        __itt_nullify_all_pointers();
+
+                        __itt_report_error(__itt_error_no_module, lib_to_load,
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+                            __itt_system_error()
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+                            dlerror()
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+                        );
+                    }
+                }
+                else
+                {
+                    __itt_nullify_all_pointers();
+                }
+                _N_(_ittapi_global).api_initialized = 1;
+                current_thread = 0;
+                /* !!! Just to avoid unused code elimination !!! */
+                if (__itt_fini_ittlib_ptr == _N_(fini_ittlib)) current_thread = 0;
+            }
+        }
+
+#ifndef ITT_SIMPLE_INIT
+        if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
+#endif /* ITT_SIMPLE_INIT */
+    }
+
+    /* Evaluating if any function ptr is non empty and it's in init_groups */
+    for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++)
+    {
+        if (*_N_(_ittapi_global).api_list_ptr[i].func_ptr != _N_(_ittapi_global).api_list_ptr[i].null_func &&
+            _N_(_ittapi_global).api_list_ptr[i].group & init_groups)
+        {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Installs handler as the error handler stored in the global ITT state and
+ * returns the handler that was stored there before the call.
+ */
+ITT_EXTERN_C __itt_error_handler_t* _N_(set_error_handler)(__itt_error_handler_t* handler)
+{
+    /* The global slot is written as void*, hence the casts through size_t. */
+    __itt_error_handler_t* const old_handler =
+        (__itt_error_handler_t*)(size_t)_N_(_ittapi_global).error_handler;
+
+    _N_(_ittapi_global).error_handler = (void*)(size_t)handler;
+    return old_handler;
+}
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT
+#pragma warning(pop)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT */
diff --git a/final/runtime/src/thirdparty/ittnotify/ittnotify_static.h b/final/runtime/src/thirdparty/ittnotify/ittnotify_static.h
new file mode 100644
index 0000000..a202226
--- /dev/null
+++ b/final/runtime/src/thirdparty/ittnotify/ittnotify_static.h
@@ -0,0 +1,341 @@
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "ittnotify_config.h"
+
+/* Unless the including file defines ITT_FORMAT_DEFINED, give ITT_FORMAT and
+ * ITT_NO_PARAMS an empty expansion when they are not defined yet, so the
+ * stub entries below can use them unconditionally. */
+#ifndef ITT_FORMAT_DEFINED
+#  ifndef ITT_FORMAT
+#    define ITT_FORMAT
+#  endif /* ITT_FORMAT */
+#  ifndef ITT_NO_PARAMS
+#    define ITT_NO_PARAMS
+#  endif /* ITT_NO_PARAMS */
+#endif /* ITT_FORMAT_DEFINED */
+
+/*
+ * parameters for macro expected:
+ * ITT_STUB(api, type, func_name, arguments, params, func_name_in_dll, group, printf_fmt)
+ */
+#ifdef __ITT_INTERNAL_INIT
+
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_domain*, domain_createA, (const char    *name), (ITT_FORMAT name), domain_createA, __itt_group_structure, "\"%s\"")
+ITT_STUB(ITTAPI, __itt_domain*, domain_createW, (const wchar_t *name), (ITT_FORMAT name), domain_createW, __itt_group_structure, "\"%S\"")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_domain*, domain_create,  (const char    *name), (ITT_FORMAT name), domain_create,  __itt_group_structure, "\"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createA, (const char    *name), (ITT_FORMAT name), string_handle_createA, __itt_group_structure, "\"%s\"")
+ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createW, (const wchar_t *name), (ITT_FORMAT name), string_handle_createW, __itt_group_structure, "\"%S\"")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_create,  (const char    *name), (ITT_FORMAT name), string_handle_create,  __itt_group_structure, "\"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/* counter_create: A/W variants on Windows, one variant elsewhere. The W
+ * variant takes wchar_t strings, so it is described with %S like the other
+ * wide-character stubs in this list. */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_counter, counter_createA, (const char    *name, const char    *domain), (ITT_FORMAT name, domain), counter_createA, __itt_group_counter, "\"%s\", \"%s\"")
+ITT_STUB(ITTAPI, __itt_counter, counter_createW, (const wchar_t *name, const wchar_t *domain), (ITT_FORMAT name, domain), counter_createW, __itt_group_counter, "\"%S\", \"%S\"")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_counter, counter_create,  (const char    *name, const char    *domain), (ITT_FORMAT name, domain), counter_create,  __itt_group_counter, "\"%s\", \"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/* counter_create_typed: A/W variants on Windows, one variant elsewhere. The
+ * W variant takes wchar_t strings, so it is described with %S like the other
+ * wide-character stubs in this list. */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_counter, counter_create_typedA, (const char    *name, const char    *domain, __itt_metadata_type type), (ITT_FORMAT name, domain, type), counter_create_typedA, __itt_group_counter, "\"%s\", \"%s\", %d")
+ITT_STUB(ITTAPI, __itt_counter, counter_create_typedW, (const wchar_t *name, const wchar_t *domain, __itt_metadata_type type), (ITT_FORMAT name, domain, type), counter_create_typedW, __itt_group_counter, "\"%S\", \"%S\", %d")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_counter, counter_create_typed,  (const char    *name, const char    *domain, __itt_metadata_type type), (ITT_FORMAT name, domain, type), counter_create_typed,  __itt_group_counter, "\"%s\", \"%s\", %d")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+
+ITT_STUBV(ITTAPI, void, pause,  (void), (ITT_NO_PARAMS), pause,  __itt_group_control | __itt_group_legacy, "no args")
+ITT_STUBV(ITTAPI, void, resume, (void), (ITT_NO_PARAMS), resume, __itt_group_control | __itt_group_legacy, "no args")
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, thread_set_nameA, (const char    *name), (ITT_FORMAT name), thread_set_nameA, __itt_group_thread, "\"%s\"")
+ITT_STUBV(ITTAPI, void, thread_set_nameW, (const wchar_t *name), (ITT_FORMAT name), thread_set_nameW, __itt_group_thread, "\"%S\"")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, thread_set_name,  (const char    *name), (ITT_FORMAT name), thread_set_name,  __itt_group_thread, "\"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, thread_ignore, (void), (ITT_NO_PARAMS), thread_ignore, __itt_group_thread, "no args")
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, int,  thr_name_setA, (const char    *name, int namelen), (ITT_FORMAT name, namelen), thr_name_setA, __itt_group_thread | __itt_group_legacy, "\"%s\", %d")
+ITT_STUB(LIBITTAPI, int,  thr_name_setW, (const wchar_t *name, int namelen), (ITT_FORMAT name, namelen), thr_name_setW, __itt_group_thread | __itt_group_legacy, "\"%S\", %d")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, int,  thr_name_set,  (const char    *name, int namelen), (ITT_FORMAT name, namelen), thr_name_set,  __itt_group_thread | __itt_group_legacy, "\"%s\", %d")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(LIBITTAPI, void, thr_ignore,   (void),                             (ITT_NO_PARAMS),            thr_ignore,    __itt_group_thread | __itt_group_legacy, "no args")
+#endif /* __ITT_INTERNAL_BODY */
+
+ITT_STUBV(ITTAPI, void, enable_attach, (void), (ITT_NO_PARAMS), enable_attach, __itt_group_all, "no args")
+
+#else  /* __ITT_INTERNAL_INIT */
+
+ITT_STUBV(ITTAPI, void, detach, (void), (ITT_NO_PARAMS), detach, __itt_group_control | __itt_group_legacy, "no args")
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, sync_createA, (void *addr, const char    *objtype, const char    *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_createA, __itt_group_sync | __itt_group_fsync, "%p, \"%s\", \"%s\", %x")
+ITT_STUBV(ITTAPI, void, sync_createW, (void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_createW, __itt_group_sync | __itt_group_fsync, "%p, \"%S\", \"%S\", %x")
+ITT_STUBV(ITTAPI, void, sync_renameA, (void *addr, const char    *name), (ITT_FORMAT addr, name), sync_renameA, __itt_group_sync | __itt_group_fsync, "%p, \"%s\"")
+ITT_STUBV(ITTAPI, void, sync_renameW, (void *addr, const wchar_t *name), (ITT_FORMAT addr, name), sync_renameW, __itt_group_sync | __itt_group_fsync, "%p, \"%S\"")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, sync_create,  (void *addr, const char    *objtype, const char    *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_create,  __itt_group_sync | __itt_group_fsync, "%p, \"%s\", \"%s\", %x")
+ITT_STUBV(ITTAPI, void, sync_rename,  (void *addr, const char    *name), (ITT_FORMAT addr, name), sync_rename,  __itt_group_sync | __itt_group_fsync, "%p, \"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, sync_destroy,    (void *addr), (ITT_FORMAT addr), sync_destroy,   __itt_group_sync | __itt_group_fsync, "%p")
+
+ITT_STUBV(ITTAPI, void, sync_prepare,    (void* addr), (ITT_FORMAT addr), sync_prepare,   __itt_group_sync,  "%p")
+ITT_STUBV(ITTAPI, void, sync_cancel,     (void *addr), (ITT_FORMAT addr), sync_cancel,    __itt_group_sync,  "%p")
+ITT_STUBV(ITTAPI, void, sync_acquired,   (void *addr), (ITT_FORMAT addr), sync_acquired,  __itt_group_sync,  "%p")
+ITT_STUBV(ITTAPI, void, sync_releasing,  (void* addr), (ITT_FORMAT addr), sync_releasing, __itt_group_sync,  "%p")
+
+ITT_STUBV(ITTAPI, void, suppress_push,       (unsigned int mask),                             (ITT_FORMAT mask), suppress_push,  __itt_group_suppress,  "%p")
+ITT_STUBV(ITTAPI, void, suppress_pop,        (void),                                          (ITT_NO_PARAMS),   suppress_pop,   __itt_group_suppress,  "no args")
+ITT_STUBV(ITTAPI, void, suppress_mark_range, (__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size),(ITT_FORMAT mode, mask, address, size), suppress_mark_range, __itt_group_suppress, "%d, %p, %p, %d")
+ITT_STUBV(ITTAPI, void, suppress_clear_range,(__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size),(ITT_FORMAT mode, mask, address, size), suppress_clear_range,__itt_group_suppress, "%d, %p, %p, %d")
+
+ITT_STUBV(ITTAPI, void, fsync_prepare,   (void* addr), (ITT_FORMAT addr), sync_prepare,   __itt_group_fsync, "%p")
+ITT_STUBV(ITTAPI, void, fsync_cancel,    (void *addr), (ITT_FORMAT addr), sync_cancel,    __itt_group_fsync, "%p")
+ITT_STUBV(ITTAPI, void, fsync_acquired,  (void *addr), (ITT_FORMAT addr), sync_acquired,  __itt_group_fsync, "%p")
+ITT_STUBV(ITTAPI, void, fsync_releasing, (void* addr), (ITT_FORMAT addr), sync_releasing, __itt_group_fsync, "%p")
+
+ITT_STUBV(ITTAPI, void, model_site_begin,          (__itt_model_site *site, __itt_model_site_instance *instance, const char *name), (ITT_FORMAT site, instance, name), model_site_begin, __itt_group_model, "%p, %p, \"%s\"")
+ITT_STUBV(ITTAPI, void, model_site_end,            (__itt_model_site *site, __itt_model_site_instance *instance),                   (ITT_FORMAT site, instance),       model_site_end,   __itt_group_model, "%p, %p")
+ITT_STUBV(ITTAPI, void, model_task_begin,          (__itt_model_task *task, __itt_model_task_instance *instance, const char *name), (ITT_FORMAT task, instance, name), model_task_begin, __itt_group_model, "%p, %p, \"%s\"")
+ITT_STUBV(ITTAPI, void, model_task_end,            (__itt_model_task *task, __itt_model_task_instance *instance),                   (ITT_FORMAT task, instance),       model_task_end,   __itt_group_model, "%p, %p")
+ITT_STUBV(ITTAPI, void, model_lock_acquire,        (void *lock), (ITT_FORMAT lock), model_lock_acquire, __itt_group_model, "%p")
+ITT_STUBV(ITTAPI, void, model_lock_release,        (void *lock), (ITT_FORMAT lock), model_lock_release, __itt_group_model, "%p")
+ITT_STUBV(ITTAPI, void, model_record_allocation,   (void *addr, size_t size), (ITT_FORMAT addr, size), model_record_allocation,   __itt_group_model, "%p, %d")
+ITT_STUBV(ITTAPI, void, model_record_deallocation, (void *addr),              (ITT_FORMAT addr),       model_record_deallocation, __itt_group_model, "%p")
+ITT_STUBV(ITTAPI, void, model_induction_uses,      (void* addr, size_t size), (ITT_FORMAT addr, size), model_induction_uses,      __itt_group_model, "%p, %d")
+ITT_STUBV(ITTAPI, void, model_reduction_uses,      (void* addr, size_t size), (ITT_FORMAT addr, size), model_reduction_uses,      __itt_group_model, "%p, %d")
+ITT_STUBV(ITTAPI, void, model_observe_uses,        (void* addr, size_t size), (ITT_FORMAT addr, size), model_observe_uses,        __itt_group_model, "%p, %d")
+ITT_STUBV(ITTAPI, void, model_clear_uses,          (void* addr),              (ITT_FORMAT addr),       model_clear_uses,          __itt_group_model, "%p")
+
+#ifndef __ITT_INTERNAL_BODY
+/* Windows-only wide-character model entry points; the wchar_t name is
+ * described with %S like the other wide-character stubs in this list. */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, model_site_beginW,         (const wchar_t *name),     (ITT_FORMAT name),       model_site_beginW,         __itt_group_model, "\"%S\"")
+ITT_STUBV(ITTAPI, void, model_task_beginW,         (const wchar_t *name),     (ITT_FORMAT name),       model_task_beginW,         __itt_group_model, "\"%S\"")
+ITT_STUBV(ITTAPI, void, model_iteration_taskW,     (const wchar_t *name),     (ITT_FORMAT name),       model_iteration_taskW,     __itt_group_model, "\"%S\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, model_site_beginA,         (const char *name),        (ITT_FORMAT name),       model_site_beginA,         __itt_group_model, "\"%s\"")
+ITT_STUBV(ITTAPI, void, model_site_beginAL,        (const char *name, size_t len), (ITT_FORMAT name, len), model_site_beginAL,    __itt_group_model, "\"%s\", %d")
+ITT_STUBV(ITTAPI, void, model_task_beginA,         (const char *name),        (ITT_FORMAT name),       model_task_beginA,         __itt_group_model, "\"%s\"")
+ITT_STUBV(ITTAPI, void, model_task_beginAL,        (const char *name, size_t len), (ITT_FORMAT name, len), model_task_beginAL,    __itt_group_model, "\"%s\", %d")
+ITT_STUBV(ITTAPI, void, model_iteration_taskA,     (const char *name),        (ITT_FORMAT name),       model_iteration_taskA,     __itt_group_model, "\"%s\"")
+ITT_STUBV(ITTAPI, void, model_iteration_taskAL,    (const char *name, size_t len), (ITT_FORMAT name, len), model_iteration_taskAL, __itt_group_model, "\"%s\", %d")
+ITT_STUBV(ITTAPI, void, model_site_end_2,          (void),                    (ITT_NO_PARAMS),         model_site_end_2,          __itt_group_model, "no args")
+ITT_STUBV(ITTAPI, void, model_task_end_2,          (void),                    (ITT_NO_PARAMS),         model_task_end_2,          __itt_group_model, "no args")
+ITT_STUBV(ITTAPI, void, model_lock_acquire_2,      (void *lock),              (ITT_FORMAT lock),       model_lock_acquire_2,      __itt_group_model, "%p")
+ITT_STUBV(ITTAPI, void, model_lock_release_2,      (void *lock),              (ITT_FORMAT lock),       model_lock_release_2,      __itt_group_model, "%p")
+ITT_STUBV(ITTAPI, void, model_aggregate_task,      (size_t count),            (ITT_FORMAT count),      model_aggregate_task,      __itt_group_model, "%d")
+ITT_STUBV(ITTAPI, void, model_disable_push,        (__itt_model_disable x),   (ITT_FORMAT x),          model_disable_push,        __itt_group_model, "%p")
+ITT_STUBV(ITTAPI, void, model_disable_pop,         (void),                    (ITT_NO_PARAMS),         model_disable_pop,         __itt_group_model, "no args")
+#endif /* __ITT_INTERNAL_BODY */
+
+/* heap_function_create: A/W variants on Windows, one variant elsewhere;
+ * omitted when __ITT_INTERNAL_BODY is defined. The W variant takes wchar_t
+ * strings, so it is described with %S like the other wide-character stubs. */
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createA, (const char    *name, const char    *domain), (ITT_FORMAT name, domain), heap_function_createA, __itt_group_heap, "\"%s\", \"%s\"")
+ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createW, (const wchar_t *name, const wchar_t *domain), (ITT_FORMAT name, domain), heap_function_createW, __itt_group_heap, "\"%S\", \"%S\"")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_heap_function, heap_function_create,  (const char    *name, const char    *domain), (ITT_FORMAT name, domain), heap_function_create,  __itt_group_heap, "\"%s\", \"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* __ITT_INTERNAL_BODY */
+ITT_STUBV(ITTAPI, void, heap_allocate_begin,   (__itt_heap_function h, size_t size, int initialized),             (ITT_FORMAT h, size, initialized),       heap_allocate_begin, __itt_group_heap, "%p, %lu, %d")
+ITT_STUBV(ITTAPI, void, heap_allocate_end,     (__itt_heap_function h, void** addr, size_t size, int initialized), (ITT_FORMAT h, addr, size, initialized), heap_allocate_end,   __itt_group_heap, "%p, %p, %lu, %d")
+ITT_STUBV(ITTAPI, void, heap_free_begin,       (__itt_heap_function h, void*  addr), (ITT_FORMAT h, addr), heap_free_begin, __itt_group_heap, "%p, %p")
+ITT_STUBV(ITTAPI, void, heap_free_end,         (__itt_heap_function h, void*  addr), (ITT_FORMAT h, addr), heap_free_end,   __itt_group_heap, "%p, %p")
+ITT_STUBV(ITTAPI, void, heap_reallocate_begin, (__itt_heap_function h, void*  addr, size_t new_size, int initialized),                  (ITT_FORMAT h, addr, new_size, initialized),           heap_reallocate_begin, __itt_group_heap, "%p, %p, %lu, %d")
+ITT_STUBV(ITTAPI, void, heap_reallocate_end,   (__itt_heap_function h, void*  addr, void** new_addr, size_t new_size, int initialized), (ITT_FORMAT h, addr, new_addr, new_size, initialized), heap_reallocate_end,   __itt_group_heap, "%p, %p, %p, %lu, %d")
+ITT_STUBV(ITTAPI, void, heap_internal_access_begin, (void), (ITT_NO_PARAMS), heap_internal_access_begin, __itt_group_heap, "no args")
+ITT_STUBV(ITTAPI, void, heap_internal_access_end,   (void), (ITT_NO_PARAMS), heap_internal_access_end,   __itt_group_heap, "no args")
+ITT_STUBV(ITTAPI, void, heap_record_memory_growth_begin, (void), (ITT_NO_PARAMS), heap_record_memory_growth_begin, __itt_group_heap, "no args")
+ITT_STUBV(ITTAPI, void, heap_record_memory_growth_end,   (void), (ITT_NO_PARAMS), heap_record_memory_growth_end,   __itt_group_heap, "no args")
+ITT_STUBV(ITTAPI, void, heap_reset_detection, (unsigned int reset_mask),  (ITT_FORMAT reset_mask), heap_reset_detection, __itt_group_heap, "%u")
+ITT_STUBV(ITTAPI, void, heap_record,          (unsigned int record_mask), (ITT_FORMAT record_mask),  heap_record,        __itt_group_heap, "%u")
+
+ITT_STUBV(ITTAPI, void, id_create,  (const __itt_domain *domain, __itt_id id), (ITT_FORMAT domain, id), id_create,  __itt_group_structure, "%p, %lu")
+ITT_STUBV(ITTAPI, void, id_destroy, (const __itt_domain *domain, __itt_id id), (ITT_FORMAT domain, id), id_destroy, __itt_group_structure, "%p, %lu")
+
+ITT_STUB(ITTAPI, __itt_timestamp, get_timestamp, (void), (ITT_NO_PARAMS), get_timestamp,  __itt_group_structure, "no args")
+
+ITT_STUBV(ITTAPI, void, region_begin, (const __itt_domain *domain, __itt_id id, __itt_id parent, __itt_string_handle *name), (ITT_FORMAT domain, id, parent, name), region_begin, __itt_group_structure, "%p, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, region_end,   (const __itt_domain *domain, __itt_id id),                                             (ITT_FORMAT domain, id),               region_end,   __itt_group_structure, "%p, %lu")
+
+#ifndef __ITT_INTERNAL_BODY
+ITT_STUBV(ITTAPI, void, frame_begin_v3,  (const __itt_domain *domain, __itt_id *id),                                             (ITT_FORMAT domain, id),             frame_begin_v3,  __itt_group_structure, "%p, %p")
+ITT_STUBV(ITTAPI, void, frame_end_v3,    (const __itt_domain *domain, __itt_id *id),                                             (ITT_FORMAT domain, id),             frame_end_v3,    __itt_group_structure, "%p, %p")
+ITT_STUBV(ITTAPI, void, frame_submit_v3, (const __itt_domain *domain, __itt_id *id, __itt_timestamp begin, __itt_timestamp end), (ITT_FORMAT domain, id, begin, end), frame_submit_v3, __itt_group_structure, "%p, %p, %lu, %lu")
+#endif /* __ITT_INTERNAL_BODY */
+
+ITT_STUBV(ITTAPI, void, task_group,   (const __itt_domain *domain, __itt_id id, __itt_id parent, __itt_string_handle *name), (ITT_FORMAT domain, id, parent, name), task_group,  __itt_group_structure, "%p, %lu, %lu, %p")
+
+ITT_STUBV(ITTAPI, void, task_begin,    (const __itt_domain *domain, __itt_id id, __itt_id parent, __itt_string_handle *name), (ITT_FORMAT domain, id, parent, name), task_begin,    __itt_group_structure, "%p, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, task_begin_fn, (const __itt_domain *domain, __itt_id id, __itt_id parent, void* fn),                  (ITT_FORMAT domain, id, parent, fn),   task_begin_fn, __itt_group_structure, "%p, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, task_end,      (const __itt_domain *domain),                                                          (ITT_FORMAT domain),                   task_end,      __itt_group_structure, "%p")
+
+ITT_STUBV(ITTAPI, void, counter_inc_v3,       (const __itt_domain *domain, __itt_string_handle *name),                           (ITT_FORMAT domain, name),        counter_inc_v3,       __itt_group_structure, "%p, %p")
+ITT_STUBV(ITTAPI, void, counter_inc_delta_v3, (const __itt_domain *domain, __itt_string_handle *name, unsigned long long value), (ITT_FORMAT domain, name, value), counter_inc_delta_v3, __itt_group_structure, "%p, %p, %lu")
+ITT_STUBV(ITTAPI, void, counter_dec_v3,       (const __itt_domain *domain, __itt_string_handle *name),                           (ITT_FORMAT domain, name),        counter_dec_v3,       __itt_group_structure, "%p, %p")
+ITT_STUBV(ITTAPI, void, counter_dec_delta_v3, (const __itt_domain *domain, __itt_string_handle *name, unsigned long long value), (ITT_FORMAT domain, name, value), counter_dec_delta_v3, __itt_group_structure, "%p, %p, %lu")
+
+ITT_STUBV(ITTAPI, void, marker, (const __itt_domain *domain, __itt_id id, __itt_string_handle *name, __itt_scope scope), (ITT_FORMAT domain, id, name, scope), marker, __itt_group_structure, "%p, %lu, %p, %d")
+
+ITT_STUBV(ITTAPI, void, metadata_add,      (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data), (ITT_FORMAT domain, id, key, type, count, data), metadata_add, __itt_group_structure, "%p, %lu, %p, %d, %lu, %p")
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, metadata_str_addA, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char* data, size_t length),    (ITT_FORMAT domain, id, key, data, length), metadata_str_addA, __itt_group_structure, "%p, %lu, %p, %p, %lu")
+ITT_STUBV(ITTAPI, void, metadata_str_addW, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const wchar_t* data, size_t length), (ITT_FORMAT domain, id, key, data, length), metadata_str_addW, __itt_group_structure, "%p, %lu, %p, %p, %lu")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, metadata_str_add,  (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char* data, size_t length),    (ITT_FORMAT domain, id, key, data, length), metadata_str_add,  __itt_group_structure, "%p, %lu, %p, %p, %lu")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+ITT_STUBV(ITTAPI, void, relation_add_to_current, (const __itt_domain *domain, __itt_relation relation, __itt_id tail),                (ITT_FORMAT domain, relation, tail),       relation_add_to_current, __itt_group_structure, "%p, %d, %lu")
+ITT_STUBV(ITTAPI, void, relation_add,            (const __itt_domain *domain, __itt_id head, __itt_relation relation, __itt_id tail), (ITT_FORMAT domain, head, relation, tail), relation_add,            __itt_group_structure, "%p, %lu, %d, %lu")
+
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, __itt_event, event_createA, (const char    *name, int namelen), (ITT_FORMAT name, namelen), event_createA, __itt_group_mark | __itt_group_legacy, "\"%s\", %d")
+ITT_STUB(LIBITTAPI, __itt_event, event_createW, (const wchar_t *name, int namelen), (ITT_FORMAT name, namelen), event_createW, __itt_group_mark | __itt_group_legacy, "\"%S\", %d")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, __itt_event, event_create,  (const char    *name, int namelen), (ITT_FORMAT name, namelen), event_create,  __itt_group_mark | __itt_group_legacy, "\"%s\", %d")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, int,  event_start,          (__itt_event event),                (ITT_FORMAT event),         event_start,   __itt_group_mark | __itt_group_legacy, "%d")
+ITT_STUB(LIBITTAPI, int,  event_end,            (__itt_event event),                (ITT_FORMAT event),         event_end,     __itt_group_mark | __itt_group_legacy, "%d")
+#endif /* __ITT_INTERNAL_BODY */
+
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, sync_set_nameA, (void *addr, const char    *objtype, const char    *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_set_nameA, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%s\", \"%s\", %x")
+ITT_STUBV(ITTAPI, void, sync_set_nameW, (void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_set_nameW, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%S\", \"%S\", %x")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, sync_set_name,  (void *addr, const char    *objtype, const char    *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_set_name,  __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%s\", \"%s\", %x")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, int, notify_sync_nameA, (void *p, const char    *objtype, int typelen, const char    *objname, int namelen, int attribute), (ITT_FORMAT p, objtype, typelen, objname, namelen, attribute), notify_sync_nameA, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%s\", %d, \"%s\", %d, %x")
+ITT_STUB(LIBITTAPI, int, notify_sync_nameW, (void *p, const wchar_t *objtype, int typelen, const wchar_t *objname, int namelen, int attribute), (ITT_FORMAT p, objtype, typelen, objname, namelen, attribute), notify_sync_nameW, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%S\", %d, \"%S\", %d, %x")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, int, notify_sync_name,  (void *p, const char    *objtype, int typelen, const char    *objname, int namelen, int attribute), (ITT_FORMAT p, objtype, typelen, objname, namelen, attribute), notify_sync_name,  __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%s\", %d, \"%s\", %d, %x")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+ITT_STUBV(LIBITTAPI, void, notify_sync_prepare,   (void *p), (ITT_FORMAT p), notify_sync_prepare,   __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p")
+ITT_STUBV(LIBITTAPI, void, notify_sync_cancel,    (void *p), (ITT_FORMAT p), notify_sync_cancel,    __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p")
+ITT_STUBV(LIBITTAPI, void, notify_sync_acquired,  (void *p), (ITT_FORMAT p), notify_sync_acquired,  __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p")
+ITT_STUBV(LIBITTAPI, void, notify_sync_releasing, (void *p), (ITT_FORMAT p), notify_sync_releasing, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p")
+#endif /* __ITT_INTERNAL_BODY */
+
+ITT_STUBV(LIBITTAPI, void, memory_read,   (void *addr, size_t size), (ITT_FORMAT addr, size), memory_read,   __itt_group_legacy, "%p, %lu")
+ITT_STUBV(LIBITTAPI, void, memory_write,  (void *addr, size_t size), (ITT_FORMAT addr, size), memory_write,  __itt_group_legacy, "%p, %lu")
+ITT_STUBV(LIBITTAPI, void, memory_update, (void *addr, size_t size), (ITT_FORMAT addr, size), memory_update, __itt_group_legacy, "%p, %lu")
+
+ITT_STUB(LIBITTAPI, __itt_state_t,     state_get,    (void),                                    (ITT_NO_PARAMS),   state_get,    __itt_group_legacy, "no args")
+ITT_STUB(LIBITTAPI, __itt_state_t,     state_set,    (__itt_state_t s),                         (ITT_FORMAT s),    state_set,    __itt_group_legacy, "%d")
+ITT_STUB(LIBITTAPI, __itt_obj_state_t, obj_mode_set, (__itt_obj_prop_t p, __itt_obj_state_t s), (ITT_FORMAT p, s), obj_mode_set, __itt_group_legacy, "%d, %d")
+ITT_STUB(LIBITTAPI, __itt_thr_state_t, thr_mode_set, (__itt_thr_prop_t p, __itt_thr_state_t s), (ITT_FORMAT p, s), thr_mode_set, __itt_group_legacy, "%d, %d")
+
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_frame, frame_createA, (const char    *domain), (ITT_FORMAT domain), frame_createA, __itt_group_frame, "\"%s\"")
+ITT_STUB(ITTAPI, __itt_frame, frame_createW, (const wchar_t *domain), (ITT_FORMAT domain), frame_createW, __itt_group_frame, "\"%S\"")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_frame, frame_create,  (const char    *domain), (ITT_FORMAT domain), frame_create,  __itt_group_frame, "\"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* __ITT_INTERNAL_BODY */
+ITT_STUBV(ITTAPI, void, frame_begin,         (__itt_frame frame),     (ITT_FORMAT frame),  frame_begin,   __itt_group_frame, "%p")
+ITT_STUBV(ITTAPI, void, frame_end,           (__itt_frame frame),     (ITT_FORMAT frame),  frame_end,     __itt_group_frame, "%p")
+
+ITT_STUBV(ITTAPI, void, counter_destroy,      (__itt_counter id),                                                                                  (ITT_FORMAT id),        counter_destroy,   __itt_group_counter, "%p")
+ITT_STUBV(ITTAPI, void, counter_inc,          (__itt_counter id),                                                                                  (ITT_FORMAT id),        counter_inc,       __itt_group_counter, "%p")
+ITT_STUBV(ITTAPI, void, counter_inc_delta,    (__itt_counter id, unsigned long long value),                                                        (ITT_FORMAT id, value), counter_inc_delta, __itt_group_counter, "%p, %lu")
+ITT_STUBV(ITTAPI, void, counter_dec,          (__itt_counter id),                                                                                  (ITT_FORMAT id),        counter_dec,       __itt_group_counter, "%p")
+ITT_STUBV(ITTAPI, void, counter_dec_delta,    (__itt_counter id, unsigned long long value),                                                        (ITT_FORMAT id, value), counter_dec_delta, __itt_group_counter, "%p, %lu")
+ITT_STUBV(ITTAPI, void, counter_set_value,    (__itt_counter id, void *value_ptr),                                                                 (ITT_FORMAT id, value_ptr),                          counter_set_value,    __itt_group_counter, "%p, %p")
+ITT_STUBV(ITTAPI, void, counter_set_value_ex, (__itt_counter id, __itt_clock_domain *clock_domain, unsigned long long timestamp, void *value_ptr), (ITT_FORMAT id, clock_domain, timestamp, value_ptr), counter_set_value_ex, __itt_group_counter, "%p, %p, %llu, %p")
+
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_mark_type, mark_createA, (const char    *name), (ITT_FORMAT name), mark_createA, __itt_group_mark, "\"%s\"")
+ITT_STUB(ITTAPI, __itt_mark_type, mark_createW, (const wchar_t *name), (ITT_FORMAT name), mark_createW, __itt_group_mark, "\"%S\"")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_mark_type, mark_create,  (const char    *name), (ITT_FORMAT name), mark_create,  __itt_group_mark, "\"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* __ITT_INTERNAL_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, int,  markA,        (__itt_mark_type mt, const char    *parameter), (ITT_FORMAT mt, parameter), markA, __itt_group_mark, "%d, \"%s\"")
+ITT_STUB(ITTAPI, int,  markW,        (__itt_mark_type mt, const wchar_t *parameter), (ITT_FORMAT mt, parameter), markW, __itt_group_mark, "%d, \"%S\"")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int,  mark,         (__itt_mark_type mt, const char    *parameter), (ITT_FORMAT mt, parameter), mark,  __itt_group_mark, "%d, \"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int,  mark_off, (__itt_mark_type mt), (ITT_FORMAT mt), mark_off, __itt_group_mark, "%d")
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, int,  mark_globalA, (__itt_mark_type mt, const char    *parameter), (ITT_FORMAT mt, parameter), mark_globalA, __itt_group_mark, "%d, \"%s\"")
+ITT_STUB(ITTAPI, int,  mark_globalW, (__itt_mark_type mt, const wchar_t *parameter), (ITT_FORMAT mt, parameter), mark_globalW, __itt_group_mark, "%d, \"%S\"")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int,  mark_global,  (__itt_mark_type mt, const char    *parameter), (ITT_FORMAT mt, parameter), mark_global,  __itt_group_mark, "%d, \"%s\"")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int,  mark_global_off, (__itt_mark_type mt),                        (ITT_FORMAT mt),            mark_global_off, __itt_group_mark, "%d")
+
+#ifndef __ITT_INTERNAL_BODY
+ITT_STUB(ITTAPI, __itt_caller, stack_caller_create, (void), (ITT_NO_PARAMS), stack_caller_create,  __itt_group_stitch, "no args")
+#endif /* __ITT_INTERNAL_BODY */
+ITT_STUBV(ITTAPI, void, stack_caller_destroy, (__itt_caller id), (ITT_FORMAT id), stack_caller_destroy, __itt_group_stitch, "%p")
+ITT_STUBV(ITTAPI, void, stack_callee_enter,   (__itt_caller id), (ITT_FORMAT id), stack_callee_enter,   __itt_group_stitch, "%p")
+ITT_STUBV(ITTAPI, void, stack_callee_leave,   (__itt_caller id), (ITT_FORMAT id), stack_callee_leave,   __itt_group_stitch, "%p")
+
+ITT_STUB(ITTAPI,  __itt_clock_domain*, clock_domain_create, (__itt_get_clock_info_fn fn, void* fn_data), (ITT_FORMAT fn, fn_data), clock_domain_create, __itt_group_structure, "%p, %p")
+ITT_STUBV(ITTAPI, void,                clock_domain_reset,  (void),                                      (ITT_NO_PARAMS),          clock_domain_reset,  __itt_group_structure, "no args")
+ITT_STUBV(ITTAPI, void, id_create_ex,  (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id), (ITT_FORMAT domain, clock_domain, timestamp, id), id_create_ex,  __itt_group_structure, "%p, %p, %lu, %lu")
+ITT_STUBV(ITTAPI, void, id_destroy_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id), (ITT_FORMAT domain, clock_domain, timestamp, id), id_destroy_ex, __itt_group_structure, "%p, %p, %lu, %lu")
+ITT_STUBV(ITTAPI, void, task_begin_ex,    (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, __itt_string_handle *name), (ITT_FORMAT domain, clock_domain, timestamp, id, parentid, name), task_begin_ex, __itt_group_structure, "%p, %p, %lu, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, task_begin_fn_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, void* fn),                  (ITT_FORMAT domain, clock_domain, timestamp, id, parentid, fn), task_begin_fn_ex, __itt_group_structure, "%p, %p, %lu, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, task_end_ex,      (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp),                                                            (ITT_FORMAT domain, clock_domain, timestamp), task_end_ex, __itt_group_structure, "%p, %p, %lu")
+ITT_STUBV(ITTAPI, void, task_begin_overlapped,       (const __itt_domain *domain, __itt_id id, __itt_id parent, __itt_string_handle *name),                                                                   (ITT_FORMAT domain, id, parent, name), task_begin_overlapped, __itt_group_structure, "%p, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, task_begin_overlapped_ex,    (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, __itt_string_handle *name), (ITT_FORMAT domain, clock_domain, timestamp, id, parentid, name), task_begin_overlapped_ex, __itt_group_structure, "%p, %p, %lu, %lu, %lu, %p")
+ITT_STUBV(ITTAPI, void, task_end_overlapped, (const __itt_domain *domain, __itt_id id),                                                                                                                       (ITT_FORMAT domain, id), task_end_overlapped, __itt_group_structure, "%p, %lu")
+ITT_STUBV(ITTAPI, void, task_end_overlapped_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id),                                                    (ITT_FORMAT domain, clock_domain, timestamp, id), task_end_overlapped_ex, __itt_group_structure, "%p, %p, %lu, %lu")
+ITT_STUBV(ITTAPI, void, marker_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_string_handle *name, __itt_scope scope), (ITT_FORMAT domain, clock_domain, timestamp, id, name, scope), marker_ex, __itt_group_structure, "%p, %p, %lu, %lu, %p, %d")
+ITT_STUBV(ITTAPI, void, metadata_add_with_scope, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data), (ITT_FORMAT domain, scope, key, type, count, data), metadata_add_with_scope, __itt_group_structure, "%p, %d, %p, %d, %lu, %p")
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeA, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length),    (ITT_FORMAT domain, scope, key, data, length), metadata_str_add_with_scopeA, __itt_group_structure, "%p, %d, %p, %p, %lu")
+ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeW, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const wchar_t *data, size_t length), (ITT_FORMAT domain, scope, key, data, length), metadata_str_add_with_scopeW, __itt_group_structure, "%p, %d, %p, %p, %lu")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, metadata_str_add_with_scope,  (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length),    (ITT_FORMAT domain, scope, key, data, length), metadata_str_add_with_scope,  __itt_group_structure, "%p, %d, %p, %p, %lu")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, relation_add_to_current_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_relation relation, __itt_id tail),                (ITT_FORMAT domain, clock_domain, timestamp, relation, tail),       relation_add_to_current_ex, __itt_group_structure, "%p, %p, %lu, %d, %lu")
+ITT_STUBV(ITTAPI, void, relation_add_ex,            (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id head, __itt_relation relation, __itt_id tail), (ITT_FORMAT domain, clock_domain, timestamp, head, relation, tail), relation_add_ex,            __itt_group_structure, "%p, %p, %lu, %lu, %d, %lu")
+ITT_STUB(ITTAPI,  __itt_track_group*, track_group_create, (__itt_string_handle* name, __itt_track_group_type track_group_type),                    (ITT_FORMAT name, track_group_type),        track_group_create, __itt_group_structure, "%p, %d")
+ITT_STUB(ITTAPI,  __itt_track*,       track_create,       (__itt_track_group* track_group,__itt_string_handle* name, __itt_track_type track_type), (ITT_FORMAT track_group, name, track_type), track_create,       __itt_group_structure, "%p, %p, %d")
+ITT_STUBV(ITTAPI, void,               set_track,          (__itt_track *track),                                                                    (ITT_FORMAT track),                         set_track,          __itt_group_structure, "%p")
+
+#ifndef __ITT_INTERNAL_BODY
+ITT_STUB(ITTAPI, const char*, api_version, (void), (ITT_NO_PARAMS), api_version, __itt_group_all & ~__itt_group_legacy, "no args")
+#endif /* __ITT_INTERNAL_BODY */
+
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, int, av_saveA, (void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder), (ITT_FORMAT data, rank, dimensions, type, filePath, columnOrder), av_saveA, __itt_group_arrays, "%p, %d, %p, %d, \"%s\", %d")
+ITT_STUB(ITTAPI, int, av_saveW, (void *data, int rank, const int *dimensions, int type, const wchar_t *filePath, int columnOrder), (ITT_FORMAT data, rank, dimensions, type, filePath, columnOrder), av_saveW, __itt_group_arrays, "%p, %d, %p, %d, \"%S\", %d")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, int, av_save,  (void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder), (ITT_FORMAT data, rank, dimensions, type, filePath, columnOrder), av_save,  __itt_group_arrays, "%p, %d, %p, %d, \"%s\", %d")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* __ITT_INTERNAL_BODY */
+
+#ifndef __ITT_INTERNAL_BODY
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, module_loadA, (void *start_addr, void* end_addr, const char *path), (ITT_FORMAT start_addr, end_addr, path), module_loadA, __itt_group_none, "%p, %p, %p")
+ITT_STUBV(ITTAPI, void, module_loadW, (void *start_addr, void* end_addr, const wchar_t *path), (ITT_FORMAT start_addr, end_addr, path), module_loadW, __itt_group_none, "%p, %p, %p")
+#else  /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, module_load, (void *start_addr, void *end_addr, const char *path), (ITT_FORMAT start_addr, end_addr, path), module_load, __itt_group_none, "%p, %p, %p")
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* __ITT_INTERNAL_BODY */
+
+
+#endif /* __ITT_INTERNAL_INIT */
diff --git a/final/runtime/src/thirdparty/ittnotify/ittnotify_types.h b/final/runtime/src/thirdparty/ittnotify/ittnotify_types.h
new file mode 100644
index 0000000..8818161
--- /dev/null
+++ b/final/runtime/src/thirdparty/ittnotify/ittnotify_types.h
@@ -0,0 +1,67 @@
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef _ITTNOTIFY_TYPES_H_
+#define _ITTNOTIFY_TYPES_H_
+
+typedef enum ___itt_group_id
+{
+    __itt_group_none      = 0,
+    __itt_group_legacy    = 1<<0,
+    __itt_group_control   = 1<<1,
+    __itt_group_thread    = 1<<2,
+    __itt_group_mark      = 1<<3,
+    __itt_group_sync      = 1<<4,
+    __itt_group_fsync     = 1<<5,
+    __itt_group_jit       = 1<<6,
+    __itt_group_model     = 1<<7,
+    __itt_group_splitter_min = 1<<7,
+    __itt_group_counter   = 1<<8,
+    __itt_group_frame     = 1<<9,
+    __itt_group_stitch    = 1<<10,
+    __itt_group_heap      = 1<<11,
+    __itt_group_splitter_max = 1<<12,
+    __itt_group_structure = 1<<12,
+    __itt_group_suppress = 1<<13,
+    __itt_group_arrays    = 1<<14,
+    __itt_group_all       = -1
+} __itt_group_id;
+
+#pragma pack(push, 8)
+
+typedef struct ___itt_group_list
+{
+    __itt_group_id id;
+    const char*    name;
+} __itt_group_list;
+
+#pragma pack(pop)
+
+#define ITT_GROUP_LIST(varname) \
+    static __itt_group_list varname[] = {       \
+        { __itt_group_all,       "all"       }, \
+        { __itt_group_control,   "control"   }, \
+        { __itt_group_thread,    "thread"    }, \
+        { __itt_group_mark,      "mark"      }, \
+        { __itt_group_sync,      "sync"      }, \
+        { __itt_group_fsync,     "fsync"     }, \
+        { __itt_group_jit,       "jit"       }, \
+        { __itt_group_model,     "model"     }, \
+        { __itt_group_counter,   "counter"   }, \
+        { __itt_group_frame,     "frame"     }, \
+        { __itt_group_stitch,    "stitch"    }, \
+        { __itt_group_heap,      "heap"      }, \
+        { __itt_group_structure, "structure" }, \
+        { __itt_group_suppress,  "suppress"  }, \
+        { __itt_group_arrays,    "arrays"    }, \
+        { __itt_group_none,      NULL        }  \
+    }
+
+#endif /* _ITTNOTIFY_TYPES_H_ */
diff --git a/final/runtime/src/thirdparty/ittnotify/legacy/ittnotify.h b/final/runtime/src/thirdparty/ittnotify/legacy/ittnotify.h
new file mode 100644
index 0000000..eae33e0
--- /dev/null
+++ b/final/runtime/src/thirdparty/ittnotify/legacy/ittnotify.h
@@ -0,0 +1,991 @@
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LEGACY_ITTNOTIFY_H_
+#define _LEGACY_ITTNOTIFY_H_
+
+/**
+ * @file
+ * @brief Legacy User API functions and types
+ */
+
+/** @cond exclude_from_documentation */
+#ifndef ITT_OS_WIN
+#  define ITT_OS_WIN   1
+#endif /* ITT_OS_WIN */
+
+#ifndef ITT_OS_LINUX
+#  define ITT_OS_LINUX 2
+#endif /* ITT_OS_LINUX */
+
+#ifndef ITT_OS_MAC
+#  define ITT_OS_MAC   3
+#endif /* ITT_OS_MAC */
+
+#ifndef ITT_OS_FREEBSD
+#  define ITT_OS_FREEBSD   4
+#endif /* ITT_OS_FREEBSD */
+
+#ifndef ITT_OS
+#  if defined WIN32 || defined _WIN32
+#    define ITT_OS ITT_OS_WIN
+#  elif defined( __APPLE__ ) && defined( __MACH__ )
+#    define ITT_OS ITT_OS_MAC
+#  elif defined( __FreeBSD__ )
+#    define ITT_OS ITT_OS_FREEBSD
+#  else
+#    define ITT_OS ITT_OS_LINUX
+#  endif
+#endif /* ITT_OS */
+
+#ifndef ITT_PLATFORM_WIN
+#  define ITT_PLATFORM_WIN 1
+#endif /* ITT_PLATFORM_WIN */
+
+#ifndef ITT_PLATFORM_POSIX
+#  define ITT_PLATFORM_POSIX 2
+#endif /* ITT_PLATFORM_POSIX */
+
+#ifndef ITT_PLATFORM_MAC
+#  define ITT_PLATFORM_MAC 3
+#endif /* ITT_PLATFORM_MAC */
+
+#ifndef ITT_PLATFORM_FREEBSD
+#  define ITT_PLATFORM_FREEBSD 4
+#endif /* ITT_PLATFORM_FREEBSD */
+
+#ifndef ITT_PLATFORM
+#  if ITT_OS==ITT_OS_WIN
+#    define ITT_PLATFORM ITT_PLATFORM_WIN
+#  elif ITT_OS==ITT_OS_MAC
+#    define ITT_PLATFORM ITT_PLATFORM_MAC
+#  elif ITT_OS==ITT_OS_FREEBSD
+#    define ITT_PLATFORM ITT_PLATFORM_FREEBSD
+#  else
+#    define ITT_PLATFORM ITT_PLATFORM_POSIX
+#  endif
+#endif /* ITT_PLATFORM */
+
+#if defined(_UNICODE) && !defined(UNICODE)
+#define UNICODE
+#endif
+
+#include <stddef.h>
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#include <tchar.h>
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#include <stdint.h>
+#if defined(UNICODE) || defined(_UNICODE)
+#include <wchar.h>
+#endif /* UNICODE || _UNICODE */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#ifndef ITTAPI_CDECL
+#  if ITT_PLATFORM==ITT_PLATFORM_WIN
+#    define ITTAPI_CDECL __cdecl
+#  else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#    if defined _M_IX86 || defined __i386__
+#      define ITTAPI_CDECL __attribute__ ((cdecl))
+#    else  /* _M_IX86 || __i386__ */
+#      define ITTAPI_CDECL /* actual only on x86 platform */
+#    endif /* _M_IX86 || __i386__ */
+#  endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* ITTAPI_CDECL */
+
+#ifndef STDCALL
+#  if ITT_PLATFORM==ITT_PLATFORM_WIN
+#    define STDCALL __stdcall
+#  else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#    if defined _M_IX86 || defined __i386__
+#      define STDCALL __attribute__ ((stdcall))
+#    else  /* _M_IX86 || __i386__ */
+#      define STDCALL /* supported only on x86 platform */
+#    endif /* _M_IX86 || __i386__ */
+#  endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* STDCALL */
+
+#define ITTAPI    ITTAPI_CDECL
+#define LIBITTAPI ITTAPI_CDECL
+
+/* TODO: Temporary for compatibility! */
+#define ITTAPI_CALL    ITTAPI_CDECL
+#define LIBITTAPI_CALL ITTAPI_CDECL
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+/* use __forceinline (VC++ specific) */
+#define ITT_INLINE           __forceinline
+#define ITT_INLINE_ATTRIBUTE /* nothing */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+/*
+ * Generally, functions are not inlined unless optimization is specified.
+ * For functions declared inline, this attribute inlines the function even
+ * if no optimization level was specified.
+ */
+#ifdef __STRICT_ANSI__
+#define ITT_INLINE           static
+#define ITT_INLINE_ATTRIBUTE __attribute__((unused))
+#else  /* __STRICT_ANSI__ */
+#define ITT_INLINE           static inline
+#define ITT_INLINE_ATTRIBUTE __attribute__((always_inline, unused))
+#endif /* __STRICT_ANSI__ */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+/** @endcond */
+
+/** @cond exclude_from_documentation */
+/* Helper macro for joining tokens */
+#define ITT_JOIN_AUX(p,n) p##n
+#define ITT_JOIN(p,n)     ITT_JOIN_AUX(p,n)
+
+#ifdef ITT_MAJOR
+#undef ITT_MAJOR
+#endif
+#ifdef ITT_MINOR
+#undef ITT_MINOR
+#endif
+#define ITT_MAJOR     3
+#define ITT_MINOR     0
+
+/* Standard versioning of a token with major and minor version numbers */
+#define ITT_VERSIONIZE(x)    \
+    ITT_JOIN(x,              \
+    ITT_JOIN(_,              \
+    ITT_JOIN(ITT_MAJOR,      \
+    ITT_JOIN(_, ITT_MINOR))))
+
+#ifndef INTEL_ITTNOTIFY_PREFIX
+#  define INTEL_ITTNOTIFY_PREFIX __itt_
+#endif /* INTEL_ITTNOTIFY_PREFIX */
+#ifndef INTEL_ITTNOTIFY_POSTFIX
+#  define INTEL_ITTNOTIFY_POSTFIX _ptr_
+#endif /* INTEL_ITTNOTIFY_POSTFIX */
+
+#define ITTNOTIFY_NAME_AUX(n) ITT_JOIN(INTEL_ITTNOTIFY_PREFIX,n)
+#define ITTNOTIFY_NAME(n)     ITT_VERSIONIZE(ITTNOTIFY_NAME_AUX(ITT_JOIN(n,INTEL_ITTNOTIFY_POSTFIX)))
+
+#define ITTNOTIFY_VOID(n) (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)
+#define ITTNOTIFY_DATA(n) (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)
+
+#define ITTNOTIFY_VOID_D0(n,d)       (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d)
+#define ITTNOTIFY_VOID_D1(n,d,x)     (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x)
+#define ITTNOTIFY_VOID_D2(n,d,x,y)   (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y)
+#define ITTNOTIFY_VOID_D3(n,d,x,y,z) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z)
+#define ITTNOTIFY_VOID_D4(n,d,x,y,z,a)     (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a)
+#define ITTNOTIFY_VOID_D5(n,d,x,y,z,a,b)   (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b)
+#define ITTNOTIFY_VOID_D6(n,d,x,y,z,a,b,c) (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c)
+#define ITTNOTIFY_DATA_D0(n,d)       (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d)
+#define ITTNOTIFY_DATA_D1(n,d,x)     (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x)
+#define ITTNOTIFY_DATA_D2(n,d,x,y)   (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y)
+#define ITTNOTIFY_DATA_D3(n,d,x,y,z) (!(d)->flags) ?       0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z)
+#define ITTNOTIFY_DATA_D4(n,d,x,y,z,a)     (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z,a)
+#define ITTNOTIFY_DATA_D5(n,d,x,y,z,a,b)   (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b)
+#define ITTNOTIFY_DATA_D6(n,d,x,y,z,a,b,c) (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ?       0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c)
+
+#ifdef ITT_STUB
+#undef ITT_STUB
+#endif /* ITT_STUB */
+#ifdef ITT_STUBV
+#undef ITT_STUBV
+#endif /* ITT_STUBV */
+#define ITT_STUBV(api,type,name,args)                             \
+    typedef type (api* ITT_JOIN(ITTNOTIFY_NAME(name),_t)) args;   \
+    extern ITT_JOIN(ITTNOTIFY_NAME(name),_t) ITTNOTIFY_NAME(name);
+#define ITT_STUB ITT_STUBV /* each stub declares a function-pointer typedef <name>_t plus an extern pointer of that type */
+/** @endcond */
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * @defgroup legacy Legacy API
+ * @{
+ * @}
+ */
+
+/**
+ * @defgroup legacy_control Collection Control
+ * @ingroup legacy
+ * General behavior: application continues to run, but no profiling information is being collected
+ *
+ * Pausing occurs not only for the current thread but for the whole process as well as spawned processes
+ * - Intel(R) Parallel Inspector and Intel(R) Inspector XE:
+ *   - Does not analyze or report errors that involve memory access.
+ *   - Other errors are reported as usual. Pausing data collection in
+ *     Intel(R) Parallel Inspector and Intel(R) Inspector XE
+ *     only pauses tracing and analyzing memory access.
+ *     It does not pause tracing or analyzing threading APIs.
+ *   .
+ * - Intel(R) Parallel Amplifier and Intel(R) VTune(TM) Amplifier XE:
+ *   - Does continue to record when new threads are started.
+ *   .
+ * - Other effects:
+ *   - Possible reduction of runtime overhead.
+ *   .
+ * @{
+ */
+#ifndef _ITTNOTIFY_H_
+/** @brief Pause collection */
+void ITTAPI __itt_pause(void);
+/** @brief Resume collection */
+void ITTAPI __itt_resume(void);
+/** @brief Detach collection */
+void ITTAPI __itt_detach(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, pause,   (void))
+ITT_STUBV(ITTAPI, void, resume,  (void))
+ITT_STUBV(ITTAPI, void, detach,  (void))
+#define __itt_pause      ITTNOTIFY_VOID(pause)
+#define __itt_pause_ptr  ITTNOTIFY_NAME(pause)
+#define __itt_resume     ITTNOTIFY_VOID(resume)
+#define __itt_resume_ptr ITTNOTIFY_NAME(resume)
+#define __itt_detach     ITTNOTIFY_VOID(detach)
+#define __itt_detach_ptr ITTNOTIFY_NAME(detach)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_pause()
+#define __itt_pause_ptr  0
+#define __itt_resume()
+#define __itt_resume_ptr 0
+#define __itt_detach()
+#define __itt_detach_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_pause_ptr  0
+#define __itt_resume_ptr 0
+#define __itt_detach_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+#endif /* _ITTNOTIFY_H_ */
+/** @} legacy_control group */
+
+/**
+ * @defgroup legacy_threads Threads
+ * @ingroup legacy
+ * Threads group
+ * @warning Legacy API
+ * @{
+ */
+/**
+ * @deprecated Legacy API
+ * @brief Set name to be associated with thread in analysis GUI.
+ * @return __itt_err upon failure (name or namelen being null, or name and namelen mismatched)
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+int LIBITTAPI __itt_thr_name_setA(const char    *name, int namelen);
+int LIBITTAPI __itt_thr_name_setW(const wchar_t *name, int namelen);
+#if defined(UNICODE) || defined(_UNICODE) /* map the generic name to the wide or narrow variant */
+#  define __itt_thr_name_set     __itt_thr_name_setW
+#  define __itt_thr_name_set_ptr __itt_thr_name_setW_ptr
+#else /* UNICODE */
+#  define __itt_thr_name_set     __itt_thr_name_setA
+#  define __itt_thr_name_set_ptr __itt_thr_name_setA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+int LIBITTAPI __itt_thr_name_set(const char *name, int namelen);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, int, thr_name_setA, (const char    *name, int namelen))
+ITT_STUB(LIBITTAPI, int, thr_name_setW, (const wchar_t *name, int namelen))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, int, thr_name_set,  (const char    *name, int namelen))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_thr_name_setA     ITTNOTIFY_DATA(thr_name_setA)
+#define __itt_thr_name_setA_ptr ITTNOTIFY_NAME(thr_name_setA)
+#define __itt_thr_name_setW     ITTNOTIFY_DATA(thr_name_setW)
+#define __itt_thr_name_setW_ptr ITTNOTIFY_NAME(thr_name_setW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_thr_name_set     ITTNOTIFY_DATA(thr_name_set)
+#define __itt_thr_name_set_ptr ITTNOTIFY_NAME(thr_name_set)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API: calls compile away but still yield an int (0), so they stay usable as expressions */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_thr_name_setA(name, namelen) (int)0
+#define __itt_thr_name_setA_ptr 0
+#define __itt_thr_name_setW(name, namelen) (int)0
+#define __itt_thr_name_setW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_thr_name_set(name, namelen) (int)0
+#define __itt_thr_name_set_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_thr_name_setA_ptr 0
+#define __itt_thr_name_setW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_thr_name_set_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief Mark current thread as ignored from this point on, for the duration of its existence.
+ */
+void LIBITTAPI __itt_thr_ignore(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(LIBITTAPI, void, thr_ignore, (void))
+#define __itt_thr_ignore     ITTNOTIFY_VOID(thr_ignore)
+#define __itt_thr_ignore_ptr ITTNOTIFY_NAME(thr_ignore)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_thr_ignore()
+#define __itt_thr_ignore_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_thr_ignore_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} legacy_threads group */
+
+/**
+ * @defgroup legacy_sync Synchronization
+ * @ingroup legacy
+ * Synchronization group
+ * @warning Legacy API
+ * @{
+ */
+/**
+ * @hideinitializer
+ * @brief possible value of attribute argument for sync object type
+ */
+#define __itt_attr_barrier 1
+
+/**
+ * @hideinitializer
+ * @brief possible value of attribute argument for sync object type
+ */
+#define __itt_attr_mutex   2
+
+/**
+ * @deprecated Legacy API
+ * @brief Assign a name to a sync object using char or Unicode string
+ * @param[in] addr    - pointer to the sync object. You should use a real pointer to your object
+ *                      to make sure that the values don't clash with other object addresses
+ * @param[in] objtype - null-terminated object type string. If NULL is passed, the object will
+ *                      be assumed to be of generic "User Synchronization" type
+ * @param[in] objname - null-terminated object name string. If NULL, no name will be assigned
+ *                      to the object -- you can use the __itt_sync_rename call later to assign
+ *                      the name
+ * @param[in] attribute - one of [#__itt_attr_barrier, #__itt_attr_mutex] values which defines the
+ *                      exact semantics of how prepare/acquired/releasing calls work.
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+void ITTAPI __itt_sync_set_nameA(void *addr, const char    *objtype, const char    *objname, int attribute);
+void ITTAPI __itt_sync_set_nameW(void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_sync_set_name     __itt_sync_set_nameW
+#  define __itt_sync_set_name_ptr __itt_sync_set_nameW_ptr
+#else /* UNICODE */
+#  define __itt_sync_set_name     __itt_sync_set_nameA
+#  define __itt_sync_set_name_ptr __itt_sync_set_nameA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+void ITTAPI __itt_sync_set_name(void *addr, const char* objtype, const char* objname, int attribute);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUBV(ITTAPI, void, sync_set_nameA, (void *addr, const char    *objtype, const char    *objname, int attribute))
+ITT_STUBV(ITTAPI, void, sync_set_nameW, (void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUBV(ITTAPI, void, sync_set_name,  (void *addr, const char    *objtype, const char    *objname, int attribute))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_set_nameA     ITTNOTIFY_VOID(sync_set_nameA)
+#define __itt_sync_set_nameA_ptr ITTNOTIFY_NAME(sync_set_nameA)
+#define __itt_sync_set_nameW     ITTNOTIFY_VOID(sync_set_nameW)
+#define __itt_sync_set_nameW_ptr ITTNOTIFY_NAME(sync_set_nameW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_set_name     ITTNOTIFY_VOID(sync_set_name)
+#define __itt_sync_set_name_ptr ITTNOTIFY_NAME(sync_set_name)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_set_nameA(addr, objtype, objname, attribute)
+#define __itt_sync_set_nameA_ptr 0
+#define __itt_sync_set_nameW(addr, objtype, objname, attribute)
+#define __itt_sync_set_nameW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_set_name(addr, objtype, objname, attribute)
+#define __itt_sync_set_name_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_sync_set_nameA_ptr 0
+#define __itt_sync_set_nameW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_sync_set_name_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief Assign a name and type to a sync object using char or Unicode string
+ * @param[in] addr -      pointer to the sync object. You should use a real pointer to your object
+ *                        to make sure that the values don't clash with other object addresses
+ * @param[in] objtype -   null-terminated object type string. If NULL is passed, the object will
+ *                        be assumed to be of generic "User Synchronization" type
+ * @param[in] objname -   null-terminated object name string. If NULL, no name will be assigned
+ *                        to the object -- you can use the __itt_sync_rename call later to assign
+ *                        the name
+ * @param[in] typelen, namelen -   a length of string for appropriate objtype and objname parameter
+ * @param[in] attribute - one of [#__itt_attr_barrier, #__itt_attr_mutex] values which defines the
+ *                        exact semantics of how prepare/acquired/releasing calls work.
+ * @return __itt_err upon failure (name or namelen being null, or name and namelen mismatched)
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+int LIBITTAPI __itt_notify_sync_nameA(void *addr, const char    *objtype, int typelen, const char    *objname, int namelen, int attribute);
+int LIBITTAPI __itt_notify_sync_nameW(void *addr, const wchar_t *objtype, int typelen, const wchar_t *objname, int namelen, int attribute);
+#if defined(UNICODE) || defined(_UNICODE) /* map the generic name to the wide or narrow variant */
+#  define __itt_notify_sync_name __itt_notify_sync_nameW
+#else /* UNICODE */
+#  define __itt_notify_sync_name __itt_notify_sync_nameA
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+int LIBITTAPI __itt_notify_sync_name(void *addr, const char *objtype, int typelen, const char *objname, int namelen, int attribute);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, int, notify_sync_nameA, (void *addr, const char    *objtype, int typelen, const char    *objname, int namelen, int attribute))
+ITT_STUB(LIBITTAPI, int, notify_sync_nameW, (void *addr, const wchar_t *objtype, int typelen, const wchar_t *objname, int namelen, int attribute))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, int, notify_sync_name,  (void *addr, const char    *objtype, int typelen, const char    *objname, int namelen, int attribute))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_notify_sync_nameA     ITTNOTIFY_DATA(notify_sync_nameA)
+#define __itt_notify_sync_nameA_ptr ITTNOTIFY_NAME(notify_sync_nameA)
+#define __itt_notify_sync_nameW     ITTNOTIFY_DATA(notify_sync_nameW)
+#define __itt_notify_sync_nameW_ptr ITTNOTIFY_NAME(notify_sync_nameW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_notify_sync_name     ITTNOTIFY_DATA(notify_sync_name)
+#define __itt_notify_sync_name_ptr ITTNOTIFY_NAME(notify_sync_name)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API: calls compile away but still yield an int (0), so they stay usable as expressions */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_notify_sync_nameA(addr, objtype, typelen, objname, namelen, attribute) (int)0
+#define __itt_notify_sync_nameA_ptr 0
+#define __itt_notify_sync_nameW(addr, objtype, typelen, objname, namelen, attribute) (int)0
+#define __itt_notify_sync_nameW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_notify_sync_name(addr, objtype, typelen, objname, namelen, attribute) (int)0
+#define __itt_notify_sync_name_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_notify_sync_nameA_ptr 0
+#define __itt_notify_sync_nameW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_notify_sync_name_ptr 0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief Enter spin loop on user-defined sync object
+ */
+void LIBITTAPI __itt_notify_sync_prepare(void* addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(LIBITTAPI, void, notify_sync_prepare, (void *addr))
+#define __itt_notify_sync_prepare     ITTNOTIFY_VOID(notify_sync_prepare)
+#define __itt_notify_sync_prepare_ptr ITTNOTIFY_NAME(notify_sync_prepare)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_notify_sync_prepare(addr)
+#define __itt_notify_sync_prepare_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_notify_sync_prepare_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief Quit spin loop without acquiring spin object
+ */
+void LIBITTAPI __itt_notify_sync_cancel(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(LIBITTAPI, void, notify_sync_cancel, (void *addr))
+#define __itt_notify_sync_cancel     ITTNOTIFY_VOID(notify_sync_cancel)
+#define __itt_notify_sync_cancel_ptr ITTNOTIFY_NAME(notify_sync_cancel)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_notify_sync_cancel(addr)
+#define __itt_notify_sync_cancel_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_notify_sync_cancel_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief Successful spin loop completion (sync object acquired)
+ */
+void LIBITTAPI __itt_notify_sync_acquired(void *addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(LIBITTAPI, void, notify_sync_acquired, (void *addr))
+#define __itt_notify_sync_acquired     ITTNOTIFY_VOID(notify_sync_acquired)
+#define __itt_notify_sync_acquired_ptr ITTNOTIFY_NAME(notify_sync_acquired)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_notify_sync_acquired(addr)
+#define __itt_notify_sync_acquired_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_notify_sync_acquired_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief Start sync object releasing code. Is called before the lock release call.
+ */
+void LIBITTAPI __itt_notify_sync_releasing(void* addr);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(LIBITTAPI, void, notify_sync_releasing, (void *addr))
+#define __itt_notify_sync_releasing     ITTNOTIFY_VOID(notify_sync_releasing)
+#define __itt_notify_sync_releasing_ptr ITTNOTIFY_NAME(notify_sync_releasing)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_notify_sync_releasing(addr)
+#define __itt_notify_sync_releasing_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_notify_sync_releasing_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} legacy_sync group */
+
+#ifndef _ITTNOTIFY_H_
+/**
+ * @defgroup legacy_events Events
+ * @ingroup legacy
+ * Events group
+ * @{
+ */
+
+/** @brief user event type */
+typedef int __itt_event;
+
+/**
+ * @brief Create an event notification
+ * @note Fails if name or namelen is null, if name and namelen do not match, or if the user event feature is not enabled
+ * @return non-zero event identifier upon success and __itt_err otherwise
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_event LIBITTAPI __itt_event_createA(const char    *name, int namelen);
+__itt_event LIBITTAPI __itt_event_createW(const wchar_t *name, int namelen);
+#if defined(UNICODE) || defined(_UNICODE) /* map the generic name to the wide or narrow variant */
+#  define __itt_event_create     __itt_event_createW
+#  define __itt_event_create_ptr __itt_event_createW_ptr
+#else /* UNICODE */
+#  define __itt_event_create     __itt_event_createA
+#  define __itt_event_create_ptr __itt_event_createA_ptr
+#endif /* UNICODE */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_event LIBITTAPI __itt_event_create(const char *name, int namelen);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(LIBITTAPI, __itt_event, event_createA, (const char    *name, int namelen))
+ITT_STUB(LIBITTAPI, __itt_event, event_createW, (const wchar_t *name, int namelen))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(LIBITTAPI, __itt_event, event_create,  (const char *name, int namelen))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_event_createA     ITTNOTIFY_DATA(event_createA)
+#define __itt_event_createA_ptr ITTNOTIFY_NAME(event_createA)
+#define __itt_event_createW     ITTNOTIFY_DATA(event_createW)
+#define __itt_event_createW_ptr ITTNOTIFY_NAME(event_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_event_create      ITTNOTIFY_DATA(event_create)
+#define __itt_event_create_ptr  ITTNOTIFY_NAME(event_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_event_createA(name, namelen) (__itt_event)0
+#define __itt_event_createA_ptr 0
+#define __itt_event_createW(name, namelen) (__itt_event)0
+#define __itt_event_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_event_create(name, namelen)  (__itt_event)0
+#define __itt_event_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_event_createA_ptr 0
+#define __itt_event_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_event_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record an event occurrence.
+ * @return __itt_err upon failure (invalid event id/user event feature not enabled)
+ */
+int LIBITTAPI __itt_event_start(__itt_event event);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(LIBITTAPI, int, event_start, (__itt_event event))
+#define __itt_event_start     ITTNOTIFY_DATA(event_start)
+#define __itt_event_start_ptr ITTNOTIFY_NAME(event_start)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_event_start(event) (int)0
+#define __itt_event_start_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_event_start_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @brief Record an event end occurrence.
+ * @note It is optional if events do not have durations.
+ * @return __itt_err upon failure (invalid event id/user event feature not enabled)
+ */
+int LIBITTAPI __itt_event_end(__itt_event event);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(LIBITTAPI, int, event_end, (__itt_event event))
+#define __itt_event_end     ITTNOTIFY_DATA(event_end)
+#define __itt_event_end_ptr ITTNOTIFY_NAME(event_end)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_event_end(event) (int)0
+#define __itt_event_end_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_event_end_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} legacy_events group */
+#endif /* _ITTNOTIFY_H_ */
+
+/**
+ * @defgroup legacy_memory Memory Accesses
+ * @ingroup legacy
+ * @{
+ */
+/**
+ * @deprecated Legacy API
+ * @brief Inform the tool of memory accesses on reading
+ */
+void LIBITTAPI __itt_memory_read(void *addr, size_t size);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(LIBITTAPI, void, memory_read, (void *addr, size_t size))
+#define __itt_memory_read     ITTNOTIFY_VOID(memory_read)
+#define __itt_memory_read_ptr ITTNOTIFY_NAME(memory_read)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_memory_read(addr, size)
+#define __itt_memory_read_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_memory_read_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief Inform the tool of memory accesses on writing
+ */
+void LIBITTAPI __itt_memory_write(void *addr, size_t size);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(LIBITTAPI, void, memory_write, (void *addr, size_t size))
+#define __itt_memory_write     ITTNOTIFY_VOID(memory_write)
+#define __itt_memory_write_ptr ITTNOTIFY_NAME(memory_write)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_memory_write(addr, size)
+#define __itt_memory_write_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_memory_write_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief Inform the tool of memory accesses on updating
+ */
+void LIBITTAPI __itt_memory_update(void *addr, size_t size);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(LIBITTAPI, void, memory_update, (void *addr, size_t size))
+#define __itt_memory_update     ITTNOTIFY_VOID(memory_update)
+#define __itt_memory_update_ptr ITTNOTIFY_NAME(memory_update)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_memory_update(addr, size)
+#define __itt_memory_update_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_memory_update_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} legacy_memory group */
+
+/**
+ * @defgroup legacy_state Thread and Object States
+ * @ingroup legacy
+ * @{
+ */
+/** @brief state type */
+typedef int __itt_state_t;
+
+/** @cond exclude_from_documentation */
+typedef enum __itt_obj_state {
+    __itt_obj_state_err = 0,
+    __itt_obj_state_clr = 1,
+    __itt_obj_state_set = 2,
+    __itt_obj_state_use = 3
+} __itt_obj_state_t;
+
+typedef enum __itt_thr_state {
+    __itt_thr_state_err = 0,
+    __itt_thr_state_clr = 1,
+    __itt_thr_state_set = 2
+} __itt_thr_state_t;
+
+typedef enum __itt_obj_prop {
+    __itt_obj_prop_watch    = 1,
+    __itt_obj_prop_ignore   = 2,
+    __itt_obj_prop_sharable = 3
+} __itt_obj_prop_t;
+
+typedef enum __itt_thr_prop {
+    __itt_thr_prop_quiet = 1
+} __itt_thr_prop_t;
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief managing thread and object states
+ */
+__itt_state_t LIBITTAPI __itt_state_get(void);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_state_t, state_get, (void))
+#define __itt_state_get     ITTNOTIFY_DATA(state_get)
+#define __itt_state_get_ptr ITTNOTIFY_NAME(state_get)
+#else  /* INTEL_NO_ITTNOTIFY_API: zero-argument no-op, matching the other argument-less APIs */
+#define __itt_state_get() (__itt_state_t)0
+#define __itt_state_get_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_state_get_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief managing thread and object states
+ */
+__itt_state_t LIBITTAPI __itt_state_set(__itt_state_t s);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_state_t, state_set, (__itt_state_t s))
+#define __itt_state_set     ITTNOTIFY_DATA(state_set)
+#define __itt_state_set_ptr ITTNOTIFY_NAME(state_set)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_state_set(s) (__itt_state_t)0
+#define __itt_state_set_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_state_set_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief managing thread and object modes
+ */
+__itt_thr_state_t LIBITTAPI __itt_thr_mode_set(__itt_thr_prop_t p, __itt_thr_state_t s);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_thr_state_t, thr_mode_set, (__itt_thr_prop_t p, __itt_thr_state_t s))
+#define __itt_thr_mode_set     ITTNOTIFY_DATA(thr_mode_set)
+#define __itt_thr_mode_set_ptr ITTNOTIFY_NAME(thr_mode_set)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_thr_mode_set(p, s) (__itt_thr_state_t)0
+#define __itt_thr_mode_set_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_thr_mode_set_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/**
+ * @deprecated Legacy API
+ * @brief managing thread and object modes
+ */
+__itt_obj_state_t LIBITTAPI __itt_obj_mode_set(__itt_obj_prop_t p, __itt_obj_state_t s);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUB(ITTAPI, __itt_obj_state_t, obj_mode_set, (__itt_obj_prop_t p, __itt_obj_state_t s))
+#define __itt_obj_mode_set     ITTNOTIFY_DATA(obj_mode_set)
+#define __itt_obj_mode_set_ptr ITTNOTIFY_NAME(obj_mode_set)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_obj_mode_set(p, s) (__itt_obj_state_t)0
+#define __itt_obj_mode_set_ptr 0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_obj_mode_set_ptr 0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} legacy_state group */
+
+/**
+ * @defgroup frames Frames
+ * @ingroup legacy
+ * Frames group
+ * @{
+ */
+/**
+ * @brief opaque structure for frame identification
+ */
+typedef struct __itt_frame_t *__itt_frame;
+
+/**
+ * @brief Create a global frame with given domain
+ */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+__itt_frame ITTAPI __itt_frame_createA(const char    *domain);
+__itt_frame ITTAPI __itt_frame_createW(const wchar_t *domain);
+#if defined(UNICODE) || defined(_UNICODE)
+#  define __itt_frame_create     __itt_frame_createW
+#  define __itt_frame_create_ptr __itt_frame_createW_ptr
+#else /* UNICODE */
+#  define __itt_frame_create     __itt_frame_createA
+#  define __itt_frame_create_ptr __itt_frame_createA_ptr
+#endif /* UNICODE */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+__itt_frame ITTAPI __itt_frame_create(const char *domain);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ITT_STUB(ITTAPI, __itt_frame, frame_createA, (const char    *domain))
+ITT_STUB(ITTAPI, __itt_frame, frame_createW, (const wchar_t *domain))
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ITT_STUB(ITTAPI, __itt_frame, frame_create,  (const char *domain))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_frame_createA     ITTNOTIFY_DATA(frame_createA)
+#define __itt_frame_createA_ptr ITTNOTIFY_NAME(frame_createA)
+#define __itt_frame_createW     ITTNOTIFY_DATA(frame_createW)
+#define __itt_frame_createW_ptr ITTNOTIFY_NAME(frame_createW)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_frame_create     ITTNOTIFY_DATA(frame_create)
+#define __itt_frame_create_ptr ITTNOTIFY_NAME(frame_create)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#else  /* INTEL_NO_ITTNOTIFY_API: calls compile away but still yield a null __itt_frame, so the result can be stored */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_frame_createA(domain) (__itt_frame)0
+#define __itt_frame_createA_ptr 0
+#define __itt_frame_createW(domain) (__itt_frame)0
+#define __itt_frame_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_frame_create(domain) (__itt_frame)0
+#define __itt_frame_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_frame_createA_ptr 0
+#define __itt_frame_createW_ptr 0
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define __itt_frame_create_ptr  0
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+
+/** @brief Record a frame begin occurrence. */
+void ITTAPI __itt_frame_begin(__itt_frame frame);
+/** @brief Record a frame end occurrence. */
+void ITTAPI __itt_frame_end  (__itt_frame frame);
+
+/** @cond exclude_from_documentation */
+#ifndef INTEL_NO_MACRO_BODY
+#ifndef INTEL_NO_ITTNOTIFY_API
+ITT_STUBV(ITTAPI, void, frame_begin, (__itt_frame frame))
+ITT_STUBV(ITTAPI, void, frame_end,   (__itt_frame frame))
+#define __itt_frame_begin     ITTNOTIFY_VOID(frame_begin)
+#define __itt_frame_begin_ptr ITTNOTIFY_NAME(frame_begin)
+#define __itt_frame_end       ITTNOTIFY_VOID(frame_end)
+#define __itt_frame_end_ptr   ITTNOTIFY_NAME(frame_end)
+#else  /* INTEL_NO_ITTNOTIFY_API */
+#define __itt_frame_begin(frame)
+#define __itt_frame_begin_ptr 0
+#define __itt_frame_end(frame)
+#define __itt_frame_end_ptr   0
+#endif /* INTEL_NO_ITTNOTIFY_API */
+#else  /* INTEL_NO_MACRO_BODY */
+#define __itt_frame_begin_ptr 0
+#define __itt_frame_end_ptr   0
+#endif /* INTEL_NO_MACRO_BODY */
+/** @endcond */
+/** @} frames group */
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* _LEGACY_ITTNOTIFY_H_ */
diff --git a/final/runtime/src/tsan_annotations.cpp b/final/runtime/src/tsan_annotations.cpp
new file mode 100644
index 0000000..5be17f8
--- /dev/null
+++ b/final/runtime/src/tsan_annotations.cpp
@@ -0,0 +1,107 @@
+/*
+ * tsan_annotations.cpp -- ThreadSanitizer annotations to support data
+ * race detection in OpenMP programs.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "tsan_annotations.h"
+
+#include <stdio.h>
+
+typedef unsigned long uptr; // must match the typedef in tsan_annotations.h
+typedef signed long sptr;   // must match the typedef in tsan_annotations.h
+// Weak no-op fallbacks; presumably overridden when the TSan runtime is linked.
+extern "C" __attribute__((weak)) void AnnotateHappensBefore(const char *f,
+                                                            int l, uptr addr) {}
+extern "C" __attribute__((weak)) void AnnotateHappensAfter(const char *f, int l,
+                                                           uptr addr) {}
+extern "C" __attribute__((weak)) void AnnotateCondVarSignal(const char *f,
+                                                            int l, uptr cv) {}
+extern "C" __attribute__((weak)) void AnnotateCondVarSignalAll(const char *f,
+                                                               int l, uptr cv) {
+}
+extern "C" __attribute__((weak)) void AnnotateMutexIsNotPHB(const char *f,
+                                                            int l, uptr mu) {}
+extern "C" __attribute__((weak)) void AnnotateCondVarWait(const char *f, int l,
+                                                          uptr cv, uptr lock) {}
+extern "C" __attribute__((weak)) void AnnotateRWLockCreate(const char *f, int l,
+                                                           uptr m) {}
+extern "C" __attribute__((weak)) void
+AnnotateRWLockCreateStatic(const char *f, int l, uptr m) {}
+extern "C" __attribute__((weak)) void AnnotateRWLockDestroy(const char *f,
+                                                            int l, uptr m) {}
+extern "C" __attribute__((weak)) void
+AnnotateRWLockAcquired(const char *f, int l, uptr m, uptr is_w) {}
+extern "C" __attribute__((weak)) void
+AnnotateRWLockReleased(const char *f, int l, uptr m, uptr is_w) {}
+extern "C" __attribute__((weak)) void AnnotateTraceMemory(const char *f, int l,
+                                                          uptr mem) {}
+extern "C" __attribute__((weak)) void AnnotateFlushState(const char *f, int l) {
+}
+extern "C" __attribute__((weak)) void AnnotateNewMemory(const char *f, int l,
+                                                        uptr mem, uptr size) {}
+extern "C" __attribute__((weak)) void AnnotateNoOp(const char *f, int l,
+                                                   uptr mem) {}
+extern "C" __attribute__((weak)) void AnnotateFlushExpectedRaces(const char *f,
+                                                                 int l) {}
+extern "C" __attribute__((weak)) void
+AnnotateEnableRaceDetection(const char *f, int l, int enable) {}
+extern "C" __attribute__((weak)) void
+AnnotateMutexIsUsedAsCondVar(const char *f, int l, uptr mu) {}
+extern "C" __attribute__((weak)) void AnnotatePCQGet(const char *f, int l,
+                                                     uptr pcq) {}
+extern "C" __attribute__((weak)) void AnnotatePCQPut(const char *f, int l,
+                                                     uptr pcq) {}
+extern "C" __attribute__((weak)) void AnnotatePCQDestroy(const char *f, int l,
+                                                         uptr pcq) {}
+extern "C" __attribute__((weak)) void AnnotatePCQCreate(const char *f, int l,
+                                                        uptr pcq) {}
+extern "C" __attribute__((weak)) void AnnotateExpectRace(const char *f, int l,
+                                                         uptr mem, char *desc) {
+}
+extern "C" __attribute__((weak)) void
+AnnotateBenignRaceSized(const char *f, int l, uptr mem, uptr size, char *desc) {
+}
+extern "C" __attribute__((weak)) void AnnotateBenignRace(const char *f, int l,
+                                                         uptr mem, char *desc) {
+}
+extern "C" __attribute__((weak)) void AnnotateIgnoreReadsBegin(const char *f,
+                                                               int l) {}
+extern "C" __attribute__((weak)) void AnnotateIgnoreReadsEnd(const char *f,
+                                                             int l) {}
+extern "C" __attribute__((weak)) void AnnotateIgnoreWritesBegin(const char *f,
+                                                                int l) {}
+extern "C" __attribute__((weak)) void AnnotateIgnoreWritesEnd(const char *f,
+                                                              int l) {}
+extern "C" __attribute__((weak)) void AnnotateIgnoreSyncBegin(const char *f,
+                                                              int l) {}
+extern "C" __attribute__((weak)) void AnnotateIgnoreSyncEnd(const char *f,
+                                                            int l) {}
+extern "C" __attribute__((weak)) void
+AnnotatePublishMemoryRange(const char *f, int l, uptr addr, uptr size) {}
+extern "C" __attribute__((weak)) void
+AnnotateUnpublishMemoryRange(const char *f, int l, uptr addr, uptr size) {}
+extern "C" __attribute__((weak)) void AnnotateThreadName(const char *f, int l,
+                                                         char *name) {}
+extern "C" __attribute__((weak)) void
+WTFAnnotateHappensBefore(const char *f, int l, uptr addr) {}
+extern "C" __attribute__((weak)) void
+WTFAnnotateHappensAfter(const char *f, int l, uptr addr) {}
+extern "C" __attribute__((weak)) void
+WTFAnnotateBenignRaceSized(const char *f, int l, uptr mem, uptr sz,
+                           char *desc) {}
+extern "C" __attribute__((weak)) int RunningOnValgrind() { return 0; }
+extern "C" __attribute__((weak)) double ValgrindSlowdown(void) { return 0; }
+extern "C" __attribute__((weak)) const char *
+ThreadSanitizerQuery(const char *query) { // weak fallback: ignores the query
+  return 0;                                // and reports a null result
+}
+extern "C" __attribute__((weak)) void // weak fallback: does nothing
+AnnotateMemoryIsInitialized(const char *f, int l, uptr mem, uptr sz) {}
diff --git a/final/runtime/src/tsan_annotations.h b/final/runtime/src/tsan_annotations.h
new file mode 100644
index 0000000..2b1debb
--- /dev/null
+++ b/final/runtime/src/tsan_annotations.h
@@ -0,0 +1,169 @@
+/*! \file */
+/*
+ * tsan_annotations.h -- ThreadSanitizer annotations to support data
+ * race detection in OpenMP programs.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TSAN_ANNOTATIONS_H
+#define TSAN_ANNOTATIONS_H
+
+#include "kmp_config.h"
+
+/* types as used in tsan/rtl/tsan_interface_ann.cc */
+typedef unsigned long uptr;
+typedef signed long sptr;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Declaration of all annotation functions in tsan/rtl/tsan_interface_ann.cc */
+void AnnotateHappensBefore(const char *f, int l, uptr addr);
+void AnnotateHappensAfter(const char *f, int l, uptr addr);
+void AnnotateCondVarSignal(const char *f, int l, uptr cv);
+void AnnotateCondVarSignalAll(const char *f, int l, uptr cv);
+void AnnotateMutexIsNotPHB(const char *f, int l, uptr mu);
+void AnnotateCondVarWait(const char *f, int l, uptr cv, uptr lock);
+void AnnotateRWLockCreate(const char *f, int l, uptr m);
+void AnnotateRWLockCreateStatic(const char *f, int l, uptr m);
+void AnnotateRWLockDestroy(const char *f, int l, uptr m);
+void AnnotateRWLockAcquired(const char *f, int l, uptr m, uptr is_w);
+void AnnotateRWLockReleased(const char *f, int l, uptr m, uptr is_w);
+void AnnotateTraceMemory(const char *f, int l, uptr mem);
+void AnnotateFlushState(const char *f, int l);
+void AnnotateNewMemory(const char *f, int l, uptr mem, uptr size);
+void AnnotateNoOp(const char *f, int l, uptr mem);
+void AnnotateFlushExpectedRaces(const char *f, int l);
+void AnnotateEnableRaceDetection(const char *f, int l, int enable);
+void AnnotateMutexIsUsedAsCondVar(const char *f, int l, uptr mu);
+void AnnotatePCQGet(const char *f, int l, uptr pcq);
+void AnnotatePCQPut(const char *f, int l, uptr pcq);
+void AnnotatePCQDestroy(const char *f, int l, uptr pcq);
+void AnnotatePCQCreate(const char *f, int l, uptr pcq);
+void AnnotateExpectRace(const char *f, int l, uptr mem, char *desc);
+void AnnotateBenignRaceSized(const char *f, int l, uptr mem, uptr size,
+                             char *desc);
+void AnnotateBenignRace(const char *f, int l, uptr mem, char *desc);
+void AnnotateIgnoreReadsBegin(const char *f, int l);
+void AnnotateIgnoreReadsEnd(const char *f, int l);
+void AnnotateIgnoreWritesBegin(const char *f, int l);
+void AnnotateIgnoreWritesEnd(const char *f, int l);
+void AnnotateIgnoreSyncBegin(const char *f, int l);
+void AnnotateIgnoreSyncEnd(const char *f, int l);
+void AnnotatePublishMemoryRange(const char *f, int l, uptr addr, uptr size);
+void AnnotateUnpublishMemoryRange(const char *f, int l, uptr addr, uptr size);
+void AnnotateThreadName(const char *f, int l, char *name);
+void WTFAnnotateHappensBefore(const char *f, int l, uptr addr);
+void WTFAnnotateHappensAfter(const char *f, int l, uptr addr);
+void WTFAnnotateBenignRaceSized(const char *f, int l, uptr mem, uptr sz,
+                                char *desc);
+int RunningOnValgrind(void); /* (void): a real prototype when included from C */
+double ValgrindSlowdown(void);
+const char *ThreadSanitizerQuery(const char *query);
+void AnnotateMemoryIsInitialized(const char *f, int l, uptr mem, uptr sz);
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef TSAN_SUPPORT /* forward each annotation, tagged with file and line */
+#define ANNOTATE_HAPPENS_AFTER(addr)                                           \
+  AnnotateHappensAfter(__FILE__, __LINE__, (uptr)(addr))
+#define ANNOTATE_HAPPENS_BEFORE(addr)                                          \
+  AnnotateHappensBefore(__FILE__, __LINE__, (uptr)(addr))
+#define ANNOTATE_IGNORE_WRITES_BEGIN()                                         \
+  AnnotateIgnoreWritesBegin(__FILE__, __LINE__)
+#define ANNOTATE_IGNORE_WRITES_END() AnnotateIgnoreWritesEnd(__FILE__, __LINE__)
+#define ANNOTATE_RWLOCK_CREATE(lck)                                            \
+  AnnotateRWLockCreate(__FILE__, __LINE__, (uptr)(lck))
+#define ANNOTATE_RWLOCK_RELEASED(lck)                                          \
+  AnnotateRWLockReleased(__FILE__, __LINE__, (uptr)(lck), 1) /* is_w = 1 */
+#define ANNOTATE_RWLOCK_ACQUIRED(lck)                                          \
+  AnnotateRWLockAcquired(__FILE__, __LINE__, (uptr)(lck), 1) /* is_w = 1 */
+#define ANNOTATE_BARRIER_BEGIN(addr)                                           \
+  AnnotateHappensBefore(__FILE__, __LINE__, (uptr)(addr))
+#define ANNOTATE_BARRIER_END(addr)                                             \
+  AnnotateHappensAfter(__FILE__, __LINE__, (uptr)(addr))
+#define ANNOTATE_REDUCE_AFTER(addr)                                            \
+  AnnotateHappensAfter(__FILE__, __LINE__, (uptr)(addr))
+#define ANNOTATE_REDUCE_BEFORE(addr)                                           \
+  AnnotateHappensBefore(__FILE__, __LINE__, (uptr)(addr))
+#else /* !TSAN_SUPPORT: every annotation expands to nothing */
+#define ANNOTATE_HAPPENS_AFTER(addr)
+#define ANNOTATE_HAPPENS_BEFORE(addr)
+#define ANNOTATE_IGNORE_WRITES_BEGIN()
+#define ANNOTATE_IGNORE_WRITES_END()
+#define ANNOTATE_RWLOCK_CREATE(lck)
+#define ANNOTATE_RWLOCK_RELEASED(lck)
+#define ANNOTATE_RWLOCK_ACQUIRED(lck)
+#define ANNOTATE_BARRIER_BEGIN(addr)
+#define ANNOTATE_BARRIER_END(addr)
+#define ANNOTATE_REDUCE_AFTER(addr)
+#define ANNOTATE_REDUCE_BEFORE(addr)
+#endif /* TSAN_SUPPORT */
+
+#define ANNOTATE_QUEUING
+#define ANNOTATE_TICKET
+#define ANNOTATE_FUTEX
+#define ANNOTATE_TAS
+#define ANNOTATE_DRDPA
+/* All five switches above are defined, so only the #ifdef branches apply. */
+#ifdef ANNOTATE_QUEUING
+#define ANNOTATE_QUEUING_CREATE(lck) /* expands to nothing */
+#define ANNOTATE_QUEUING_RELEASED(lck) ANNOTATE_HAPPENS_BEFORE(lck)
+#define ANNOTATE_QUEUING_ACQUIRED(lck) ANNOTATE_HAPPENS_AFTER(lck)
+#else
+#define ANNOTATE_QUEUING_CREATE(lck)
+#define ANNOTATE_QUEUING_RELEASED(lck)
+#define ANNOTATE_QUEUING_ACQUIRED(lck)
+#endif /* ANNOTATE_QUEUING */
+
+#ifdef ANNOTATE_TICKET
+#define ANNOTATE_TICKET_CREATE(lck) /* expands to nothing */
+#define ANNOTATE_TICKET_RELEASED(lck) ANNOTATE_HAPPENS_BEFORE(lck)
+#define ANNOTATE_TICKET_ACQUIRED(lck) ANNOTATE_HAPPENS_AFTER(lck)
+#else
+#define ANNOTATE_TICKET_CREATE(lck)
+#define ANNOTATE_TICKET_RELEASED(lck)
+#define ANNOTATE_TICKET_ACQUIRED(lck)
+#endif /* ANNOTATE_TICKET */
+
+#ifdef ANNOTATE_FUTEX
+#define ANNOTATE_FUTEX_CREATE(lck) /* expands to nothing */
+#define ANNOTATE_FUTEX_RELEASED(lck) ANNOTATE_HAPPENS_BEFORE(lck)
+#define ANNOTATE_FUTEX_ACQUIRED(lck) ANNOTATE_HAPPENS_AFTER(lck)
+#else
+#define ANNOTATE_FUTEX_CREATE(lck)
+#define ANNOTATE_FUTEX_RELEASED(lck)
+#define ANNOTATE_FUTEX_ACQUIRED(lck)
+#endif /* ANNOTATE_FUTEX */
+
+#ifdef ANNOTATE_TAS
+#define ANNOTATE_TAS_CREATE(lck) /* expands to nothing */
+#define ANNOTATE_TAS_RELEASED(lck) ANNOTATE_HAPPENS_BEFORE(lck)
+#define ANNOTATE_TAS_ACQUIRED(lck) ANNOTATE_HAPPENS_AFTER(lck)
+#else
+#define ANNOTATE_TAS_CREATE(lck)
+#define ANNOTATE_TAS_RELEASED(lck)
+#define ANNOTATE_TAS_ACQUIRED(lck)
+#endif /* ANNOTATE_TAS */
+
+#ifdef ANNOTATE_DRDPA
+#define ANNOTATE_DRDPA_CREATE(lck) /* expands to nothing */
+#define ANNOTATE_DRDPA_RELEASED(lck) ANNOTATE_HAPPENS_BEFORE(lck)
+#define ANNOTATE_DRDPA_ACQUIRED(lck) ANNOTATE_HAPPENS_AFTER(lck)
+#else
+#define ANNOTATE_DRDPA_CREATE(lck)
+#define ANNOTATE_DRDPA_RELEASED(lck)
+#define ANNOTATE_DRDPA_ACQUIRED(lck)
+#endif /* ANNOTATE_DRDPA */
+
+#endif
diff --git a/final/runtime/src/z_Linux_asm.S b/final/runtime/src/z_Linux_asm.S
new file mode 100644
index 0000000..0d8885e
--- /dev/null
+++ b/final/runtime/src/z_Linux_asm.S
@@ -0,0 +1,1555 @@
+//  z_Linux_asm.S:  - microtasking routines specifically
+//                    written for Intel platforms running Linux* OS
+
+//
+////===----------------------------------------------------------------------===//
+////
+//// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+//// See https://llvm.org/LICENSE.txt for license information.
+//// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+////
+////===----------------------------------------------------------------------===//
+//
+
+// -----------------------------------------------------------------------
+// macros
+// -----------------------------------------------------------------------
+
+#include "kmp_config.h"
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+# if KMP_MIC
+// the 'delay r16/r32/r64' should be used instead of the 'pause'.
+// The delay operation has the effect of removing the current thread from
+// the round-robin HT mechanism, and therefore speeds up the issue rate of
+// the other threads on the same core.
+//
+// A value of 0 works fine for <= 2 threads per core, but causes the EPCC
+// barrier time to increase greatly for 3 or more threads per core.
+//
+// A value of 100 works pretty well for up to 4 threads per core, but isn't
+// quite as fast as 0 for 2 threads per core.
+//
+// We need to check what happens for oversubscription / > 4 threads per core.
+// It is possible that we need to pass the delay value in as a parameter
+// that the caller determines based on the total # threads / # cores.
+//
+//.macro pause_op
+//	mov    $100, %rax
+//	delay  %rax
+//.endm
+# else
+#  define pause_op   .byte 0xf3,0x90  // raw encoding of the 'pause' instruction
+# endif // KMP_MIC
+
+# if KMP_OS_DARWIN
+#  define KMP_PREFIX_UNDERSCORE(x) _##x  // extra underscore for OS X* symbols
+#  define KMP_LABEL(x) L_##x             // form the name of label
+.macro KMP_CFI_DEF_OFFSET  // the four KMP_CFI_* macros are empty on OS X*
+.endmacro
+.macro KMP_CFI_OFFSET
+.endmacro
+.macro KMP_CFI_REGISTER
+.endmacro
+.macro KMP_CFI_DEF
+.endmacro
+.macro ALIGN
+	.align $0
+.endmacro
+.macro DEBUG_INFO
+/* Not sure what .size does in icc, not sure if we need to do something
+   similar for OS X*.
+*/
+.endmacro
+.macro PROC  // align, export and define the (underscore-prefixed) entry label
+	ALIGN  4
+	.globl KMP_PREFIX_UNDERSCORE($0)
+KMP_PREFIX_UNDERSCORE($0):
+.endmacro
+# else // KMP_OS_DARWIN
+#  define KMP_PREFIX_UNDERSCORE(x) x //no extra underscore for Linux* OS symbols
+// Format labels so that they don't override function names in gdb's backtraces
+// MIC assembler doesn't accept .L syntax, the L works fine there (as well as
+// on OS X*)
+# if KMP_MIC
+#  define KMP_LABEL(x) L_##x          // local label
+# else
+#  define KMP_LABEL(x) .L_##x         // local label hidden from backtraces
+# endif // KMP_MIC
+.macro ALIGN size
+	.align 1<<(\size)	// the argument is the log2 of the .align operand
+.endm
+.macro DEBUG_INFO proc  // closes the CFI region opened by PROC
+	.cfi_endproc
+// Not sure why we need .type and .size for the functions
+	.align 16
+	.type  \proc,@function
+        .size  \proc,.-\proc
+.endm
+.macro PROC proc  // align, export and define the entry label, then open CFI
+	ALIGN  4
+        .globl KMP_PREFIX_UNDERSCORE(\proc)
+KMP_PREFIX_UNDERSCORE(\proc):
+	.cfi_startproc
+.endm
+.macro KMP_CFI_DEF_OFFSET sz  // thin wrappers around the .cfi_* directives
+	.cfi_def_cfa_offset	\sz
+.endm
+.macro KMP_CFI_OFFSET reg, sz
+	.cfi_offset	\reg,\sz
+.endm
+.macro KMP_CFI_REGISTER reg
+	.cfi_def_cfa_register	\reg
+.endm
+.macro KMP_CFI_DEF reg, sz
+	.cfi_def_cfa	\reg,\sz
+.endm
+# endif // KMP_OS_DARWIN
+#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
+
+#if (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64
+
+# if KMP_OS_DARWIN
+#  define KMP_PREFIX_UNDERSCORE(x) _##x  // extra underscore for OS X* symbols
+#  define KMP_LABEL(x) L_##x             // form the name of label
+
+.macro ALIGN
+	.align $0
+.endmacro
+
+.macro DEBUG_INFO
+/* Not sure what .size does in icc, not sure if we need to do something
+   similar for OS X*.
+*/
+.endmacro
+
+.macro PROC  // align, export and define the (underscore-prefixed) entry label
+	ALIGN  4
+	.globl KMP_PREFIX_UNDERSCORE($0)
+KMP_PREFIX_UNDERSCORE($0):
+.endmacro
+# else // KMP_OS_DARWIN
+#  define KMP_PREFIX_UNDERSCORE(x) x  // no extra underscore for Linux* OS symbols
+// Format labels so that they don't override function names in gdb's backtraces
+#  define KMP_LABEL(x) .L_##x         // local label hidden from backtraces
+
+.macro ALIGN size
+	.align 1<<(\size)	// the argument is the log2 of the .align operand
+.endm
+
+.macro DEBUG_INFO proc  // closes the CFI region opened by PROC
+	.cfi_endproc
+// Not sure why we need .type and .size for the functions
+	ALIGN 2
+	.type  \proc,@function
+	.size  \proc,.-\proc
+.endm
+
+.macro PROC proc  // align, export and define the entry label, then open CFI
+	ALIGN 2
+	.globl KMP_PREFIX_UNDERSCORE(\proc)
+KMP_PREFIX_UNDERSCORE(\proc):
+	.cfi_startproc
+.endm
+# endif // KMP_OS_DARWIN
+
+#endif // (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64
+
+// -----------------------------------------------------------------------
+// data
+// -----------------------------------------------------------------------
+
+#ifdef KMP_GOMP_COMPAT
+
+// Support for unnamed common blocks.
+//
+// Because the symbol ".gomp_critical_user_" contains a ".", we have to
+// put this stuff in assembly.
+
+# if KMP_ARCH_X86
+#  if KMP_OS_DARWIN
+        .data
+        .comm .gomp_critical_user_,32   // 32-byte common block
+        .data
+        .globl ___kmp_unnamed_critical_addr
+___kmp_unnamed_critical_addr:
+        .long .gomp_critical_user_      // 4-byte word holding the block address
+#  else /* Linux* OS */
+        .data
+        .comm .gomp_critical_user_,32,8 // 32-byte common block, third operand 8
+        .data
+	ALIGN 4				// expands to .align 1<<4
+        .global __kmp_unnamed_critical_addr
+__kmp_unnamed_critical_addr:
+        .4byte .gomp_critical_user_     // 4-byte word holding the block address
+        .type __kmp_unnamed_critical_addr,@object
+        .size __kmp_unnamed_critical_addr,4
+#  endif /* KMP_OS_DARWIN */
+# endif /* KMP_ARCH_X86 */
+
+# if KMP_ARCH_X86_64
+#  if KMP_OS_DARWIN
+        .data
+        .comm .gomp_critical_user_,32   // 32-byte common block
+        .data
+        .globl ___kmp_unnamed_critical_addr
+___kmp_unnamed_critical_addr:
+        .quad .gomp_critical_user_      // 8-byte word holding the block address
+#  else /* Linux* OS */
+        .data
+        .comm .gomp_critical_user_,32,8 // 32-byte common block, third operand 8
+        .data
+	ALIGN 8				// NOTE(review): expands to .align 1<<8 (256) -- confirm intended
+        .global __kmp_unnamed_critical_addr
+__kmp_unnamed_critical_addr:
+        .8byte .gomp_critical_user_     // 8-byte word holding the block address
+        .type __kmp_unnamed_critical_addr,@object
+        .size __kmp_unnamed_critical_addr,8
+#  endif /* KMP_OS_DARWIN */
+# endif /* KMP_ARCH_X86_64 */
+
+#endif /* KMP_GOMP_COMPAT */
+
+
+#if KMP_ARCH_X86 && !KMP_ARCH_PPC64
+
+// -----------------------------------------------------------------------
+// microtasking routines specifically written for IA-32 architecture
+// running Linux* OS
+// -----------------------------------------------------------------------
+
+	.ident "Intel Corporation"
+	.data
+	ALIGN 4
+// void
+// __kmp_x86_pause( void );
+// Emits the pause_op sequence once and returns; takes and returns nothing.
+        .text                   // leave the .data section selected just above
+	PROC  __kmp_x86_pause
+
+        pause_op
+        ret
+
+	DEBUG_INFO __kmp_x86_pause
+
+# if !KMP_ASM_INTRINS
+
+//------------------------------------------------------------------------
+// kmp_int32
+// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
+// Atomically adds d to *p; %eax returns the value *p held before the add.
+        PROC      __kmp_test_then_add32
+
+        movl      4(%esp), %ecx         // "p"
+        movl      8(%esp), %eax         // "d"
+        lock
+        xaddl     %eax,(%ecx)           // *p += d, %eax = previous *p
+        ret
+
+	DEBUG_INFO __kmp_test_then_add32
+
+//------------------------------------------------------------------------
+// FUNCTION __kmp_xchg_fixed8
+//
+// kmp_int8
+// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
+//
+// parameters:
+// 	p:	4(%esp)
+// 	d:	8(%esp)
+//
+// return:	%al -- the byte that was stored at *p before the exchange
+        PROC  __kmp_xchg_fixed8
+
+        movl      4(%esp), %ecx    // "p"
+        movb      8(%esp), %al	// "d"
+
+        lock
+        xchgb     %al,(%ecx)       // swap %al with *p
+        ret
+
+        DEBUG_INFO __kmp_xchg_fixed8
+
+
+//------------------------------------------------------------------------
+// FUNCTION __kmp_xchg_fixed16
+//
+// kmp_int16
+// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
+//
+// parameters:
+// 	p:	4(%esp)
+// 	d:	8(%esp)
+// return:     %ax -- the 16-bit value stored at *p before the exchange
+        PROC  __kmp_xchg_fixed16
+
+        movl      4(%esp), %ecx    // "p"
+        movw      8(%esp), %ax	// "d"
+
+        lock
+        xchgw     %ax,(%ecx)       // swap %ax with *p
+        ret
+
+        DEBUG_INFO __kmp_xchg_fixed16
+
+
+//------------------------------------------------------------------------
+// FUNCTION __kmp_xchg_fixed32
+//
+// kmp_int32
+// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
+//
+// parameters:
+// 	p:	4(%esp)
+// 	d:	8(%esp)
+//
+// return:	%eax -- the value stored at *p before the exchange
+        PROC  __kmp_xchg_fixed32
+
+        movl      4(%esp), %ecx    // "p"
+        movl      8(%esp), %eax	// "d"
+
+        lock
+        xchgl     %eax,(%ecx)      // swap %eax with *p
+        ret
+
+        DEBUG_INFO __kmp_xchg_fixed32
+
+
+// kmp_int8
+// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
+        PROC  __kmp_compare_and_store8
+
+        movl      4(%esp), %ecx // "p"
+        movb      8(%esp), %al  // "cv"
+        movb      12(%esp), %dl // "sv"
+        lock
+        cmpxchgb  %dl,(%ecx)    // store sv into *p only if *p == cv
+        sete      %al           // if %al == (%ecx) set %al = 1 else set %al = 0
+        and       $1, %eax      // clear all other bits: %eax is exactly 0 or 1
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store8
+
+// kmp_int16
+// __kmp_compare_and_store16(volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv);
+        PROC  __kmp_compare_and_store16
+
+        movl      4(%esp), %ecx // "p"
+        movw      8(%esp), %ax  // "cv"
+        movw      12(%esp), %dx // "sv"
+        lock
+        cmpxchgw  %dx,(%ecx)    // store sv into *p only if *p == cv
+        sete      %al           // if %ax == (%ecx) set %al = 1 else set %al = 0
+        and       $1, %eax      // clear all other bits: %eax is exactly 0 or 1
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store16
+
+// kmp_int32
+// __kmp_compare_and_store32(volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv);
+        PROC  __kmp_compare_and_store32
+
+        movl      4(%esp), %ecx  // "p"
+        movl      8(%esp), %eax  // "cv"
+        movl      12(%esp), %edx // "sv"
+        lock
+        cmpxchgl  %edx,(%ecx)    // store sv into *p only if *p == cv
+        sete      %al          // if %eax == (%ecx) set %al = 1 else set %al = 0
+        and       $1, %eax     // clear all other bits: %eax is exactly 0 or 1
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store32
+
+// kmp_int32
+// __kmp_compare_and_store64(volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 s );
+        PROC  __kmp_compare_and_store64
+
+        pushl     %ebp
+        movl      %esp, %ebp
+        pushl     %ebx                  // saved here, restored before return
+        pushl     %edi                  // saved here, restored before return
+        movl      8(%ebp), %edi         // "p"
+        movl      12(%ebp), %eax        // "cv" low order word
+        movl      16(%ebp), %edx        // "cv" high order word
+        movl      20(%ebp), %ebx        // "sv" low order word
+        movl      24(%ebp), %ecx        // "sv" high order word
+        lock
+        cmpxchg8b (%edi)                // store %ecx:%ebx if *p == %edx:%eax
+        sete      %al      // if %edx:eax == (%edi) set %al = 1 else set %al = 0
+        and       $1, %eax // clear all other bits: %eax is exactly 0 or 1
+        popl      %edi
+        popl      %ebx
+        movl      %ebp, %esp
+        popl      %ebp
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store64
+
+// kmp_int8
+// __kmp_compare_and_store_ret8(volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv);
+        PROC  __kmp_compare_and_store_ret8
+
+        movl      4(%esp), %ecx // "p"
+        movb      8(%esp), %al  // "cv"
+        movb      12(%esp), %dl // "sv"
+        lock
+        cmpxchgb  %dl,(%ecx)    // %al ends up holding the value *p had before
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store_ret8
+
+// kmp_int16
+// __kmp_compare_and_store_ret16(volatile kmp_int16 *p, kmp_int16 cv,
+//                               kmp_int16 sv);
+        PROC  __kmp_compare_and_store_ret16
+
+        movl      4(%esp), %ecx // "p"
+        movw      8(%esp), %ax  // "cv"
+        movw      12(%esp), %dx // "sv"
+        lock
+        cmpxchgw  %dx,(%ecx)    // %ax ends up holding the value *p had before
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store_ret16
+
+// kmp_int32
+// __kmp_compare_and_store_ret32(volatile kmp_int32 *p, kmp_int32 cv,
+//                               kmp_int32 sv);
+        PROC  __kmp_compare_and_store_ret32
+
+        movl      4(%esp), %ecx  // "p"
+        movl      8(%esp), %eax  // "cv"
+        movl      12(%esp), %edx // "sv"
+        lock
+        cmpxchgl  %edx,(%ecx)    // %eax ends up holding the value *p had before
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store_ret32
+
+// kmp_int64
+// __kmp_compare_and_store_ret64(volatile kmp_int64 *p, kmp_int64 cv,
+//                               kmp_int64 sv);
+        PROC  __kmp_compare_and_store_ret64
+
+        pushl     %ebp
+        movl      %esp, %ebp
+        pushl     %ebx                  // saved here, restored before return
+        pushl     %edi                  // saved here, restored before return
+        movl      8(%ebp), %edi         // "p"
+        movl      12(%ebp), %eax        // "cv" low order word
+        movl      16(%ebp), %edx        // "cv" high order word
+        movl      20(%ebp), %ebx        // "sv" low order word
+        movl      24(%ebp), %ecx        // "sv" high order word
+        lock
+        cmpxchg8b (%edi)                // %edx:%eax ends up holding the old *p
+        popl      %edi
+        popl      %ebx
+        movl      %ebp, %esp
+        popl      %ebp
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store_ret64
+
+
+//------------------------------------------------------------------------
+// FUNCTION __kmp_xchg_real32
+//
+// kmp_real32
+// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
+//
+// parameters:
+// 	addr:	4(%esp)
+// 	data:	8(%esp)
+//
+// return:	%st(0) -- the value stored at *addr before the exchange
+        PROC  __kmp_xchg_real32
+
+        pushl   %ebp
+        movl    %esp, %ebp
+        subl    $4, %esp        // 4-byte scratch slot at -4(%ebp)
+        pushl   %esi
+
+        // After the prologue the arguments live at 8(%ebp) and 12(%ebp);
+        // 4(%ebp) is the return address and must not be used as "addr".
+        movl    8(%ebp), %esi   // "addr"
+        movl    12(%ebp), %eax  // "data"
+
+        // Swap atomically; %eax receives the value that was actually replaced.
+        lock
+        xchgl   %eax, (%esi)
+
+        // Pass the old bit pattern through the scratch slot so that exactly
+        // one value is left on the x87 stack, in %st(0), as the return value.
+        movl    %eax, -4(%ebp)
+        flds    -4(%ebp)
+
+        popl    %esi
+        movl    %ebp, %esp
+        popl    %ebp
+        ret
+
+        DEBUG_INFO __kmp_xchg_real32
+
+# endif /* !KMP_ASM_INTRINS */
+
+//------------------------------------------------------------------------
+// typedef void	(*microtask_t)( int *gtid, int *tid, ... );
+//
+// int
+// __kmp_invoke_microtask( microtask_t pkfn, int gtid, int tid,
+//                         int argc, void *p_argv[] ) {
+//    (*pkfn)( & gtid, & tid, argv[0], ... );
+//    return 1;
+// }
+
+// -- Begin __kmp_invoke_microtask
+// mark_begin;
+	PROC  __kmp_invoke_microtask
+
+	pushl %ebp
+	KMP_CFI_DEF_OFFSET 8
+	KMP_CFI_OFFSET ebp,-8
+	movl %esp,%ebp		// establish the base pointer for this routine.
+	KMP_CFI_REGISTER ebp
+	subl $8,%esp		// allocate space for two local variables.
+				// These variables are:
+				//	argv: -4(%ebp)
+				//	temp: -8(%ebp)
+				//
+	pushl %ebx		// save %ebx to use during this routine
+				// (it lands at -12(%ebp), reloaded before leave)
+#if OMPT_SUPPORT
+	movl 28(%ebp),%ebx	// get exit_frame address (sixth stack argument)
+	movl %ebp,(%ebx)	// save exit_frame
+#endif
+
+	movl 20(%ebp),%ebx	// Stack alignment - # args
+	addl $2,%ebx		// #args +2  Always pass at least 2 args (gtid and tid)
+	shll $2,%ebx		// Number of bytes used on stack: (#args+2)*4
+	movl %esp,%eax		//
+	subl %ebx,%eax		// %esp-((#args+2)*4) -> %eax -- without mods, stack ptr would be this
+	movl %eax,%ebx		// Save to %ebx
+	andl $0xFFFFFF80,%eax	// mask off 7 bits
+	subl %eax,%ebx		// Amount to subtract from %esp
+	subl %ebx,%esp		// Prepare the stack ptr --
+				//   now it will be aligned on 128-byte boundary at the call
+
+	movl 24(%ebp),%eax	// copy from p_argv[]
+	movl %eax,-4(%ebp)	// into the local variable *argv.
+
+	movl 20(%ebp),%ebx	// argc is 20(%ebp)
+	shll $2,%ebx		// byte offset just past the last argv entry
+
+KMP_LABEL(invoke_2):
+	cmpl $0,%ebx
+	jg  KMP_LABEL(invoke_4)
+	jmp KMP_LABEL(invoke_3)
+	ALIGN 2
+KMP_LABEL(invoke_4):
+	movl -4(%ebp),%eax
+	subl $4,%ebx			// decrement argc.
+	addl %ebx,%eax			// index into argv.
+	movl (%eax),%edx
+	pushl %edx			// arguments are pushed last-to-first
+
+	jmp KMP_LABEL(invoke_2)
+	ALIGN 2
+KMP_LABEL(invoke_3):
+	leal 16(%ebp),%eax		// push & tid
+	pushl %eax
+
+	leal 12(%ebp),%eax		// push & gtid
+	pushl %eax
+
+	movl 8(%ebp),%ebx
+	call *%ebx			// call (*pkfn)();
+
+	movl $1,%eax			// return 1;
+
+	movl -12(%ebp),%ebx		// restore %ebx
+	leave
+	KMP_CFI_DEF esp,4
+	ret
+
+	DEBUG_INFO __kmp_invoke_microtask
+// -- End  __kmp_invoke_microtask
+
+
+// kmp_uint64
+// __kmp_hardware_timestamp(void)
+	PROC  __kmp_hardware_timestamp
+	rdtsc				// time-stamp counter -> %edx:%eax
+	ret
+
+	DEBUG_INFO __kmp_hardware_timestamp
+// -- End  __kmp_hardware_timestamp
+
+#endif /* KMP_ARCH_X86 */
+
+
+#if KMP_ARCH_X86_64
+
+// -----------------------------------------------------------------------
+// microtasking routines specifically written for IA-32 architecture and
+// Intel(R) 64 running Linux* OS
+// -----------------------------------------------------------------------
+
+// -- Machine type P
+// mark_description "Intel Corporation";
+	.ident "Intel Corporation"
+// --	.file "z_Linux_asm.S"
+	.data
+	ALIGN 4
+
+// To prevent getting our code into .data section .text added to every routine
+// definition for x86_64.
+//------------------------------------------------------------------------
+# if !KMP_ASM_INTRINS
+
+//------------------------------------------------------------------------
+// FUNCTION __kmp_test_then_add32
+//
+// kmp_int32
+// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
+//
+// parameters:
+// 	p:	%rdi
+// 	d:	%esi
+//
+// return:	%eax -- the value *p held before the addition
+        .text
+        PROC  __kmp_test_then_add32
+
+        movl      %esi, %eax	// "d"
+        lock
+        xaddl     %eax,(%rdi)   // *p += d, %eax = previous *p
+        ret
+
+        DEBUG_INFO __kmp_test_then_add32
+
+
+//------------------------------------------------------------------------
+// FUNCTION __kmp_test_then_add64
+//
+// kmp_int64
+// __kmp_test_then_add64( volatile kmp_int64 *p, kmp_int64 d );
+//
+// parameters:
+// 	p:	%rdi
+// 	d:	%rsi
+//	return:	%rax -- the value *p held before the addition
+        .text
+        PROC  __kmp_test_then_add64
+
+        movq      %rsi, %rax	// "d"
+        lock
+        xaddq     %rax,(%rdi)   // *p += d, %rax = previous *p
+        ret
+
+        DEBUG_INFO __kmp_test_then_add64
+
+
+//------------------------------------------------------------------------
+// FUNCTION __kmp_xchg_fixed8
+//
+// kmp_int8
+// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
+//
+// parameters:
+// 	p:	%rdi
+// 	d:	%sil
+//
+// return:	%al -- the byte that was stored at *p before the exchange
+        .text
+        PROC  __kmp_xchg_fixed8
+
+        movb      %sil, %al	// "d"
+
+        lock
+        xchgb     %al,(%rdi)    // swap %al with *p
+        ret
+
+        DEBUG_INFO __kmp_xchg_fixed8
+
+
+//------------------------------------------------------------------------
+// FUNCTION __kmp_xchg_fixed16
+//
+// kmp_int16
+// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
+//
+// parameters:
+// 	p:	%rdi
+// 	d:	%si
+// return:     %ax -- the 16-bit value stored at *p before the exchange
+        .text
+        PROC  __kmp_xchg_fixed16
+
+        movw      %si, %ax	// "d"
+
+        lock
+        xchgw     %ax,(%rdi)    // swap %ax with *p
+        ret
+
+        DEBUG_INFO __kmp_xchg_fixed16
+
+
+//------------------------------------------------------------------------
+// FUNCTION __kmp_xchg_fixed32
+// Atomically stores d into the 32-bit value at p and returns the value it replaced.
+// kmp_int32
+// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
+//
+// parameters:
+// 	p:	%rdi
+// 	d:	%esi
+//
+// return:	%eax
+        .text
+        PROC  __kmp_xchg_fixed32
+
+        movl      %esi, %eax	// "d"
+
+        lock			// xchg with a memory operand is atomic even without it
+        xchgl     %eax,(%rdi)	// swap: (%rdi) <- d, %eax <- previous (%rdi)
+        ret			// previous value is returned in %eax
+
+        DEBUG_INFO __kmp_xchg_fixed32
+
+
+//------------------------------------------------------------------------
+// FUNCTION __kmp_xchg_fixed64
+// Atomically stores d into the 64-bit value at p and returns the value it replaced.
+// kmp_int64
+// __kmp_xchg_fixed64( volatile kmp_int64 *p, kmp_int64 d );
+//
+// parameters:
+// 	p:	%rdi
+// 	d:	%rsi
+// return:	%rax
+        .text
+        PROC  __kmp_xchg_fixed64
+
+        movq      %rsi, %rax	// "d"
+
+        lock			// xchg with a memory operand is atomic even without it
+        xchgq     %rax,(%rdi)	// swap: (%rdi) <- d, %rax <- previous (%rdi)
+        ret			// previous value is returned in %rax
+
+        DEBUG_INFO __kmp_xchg_fixed64
+
+
+//------------------------------------------------------------------------
+// FUNCTION __kmp_compare_and_store8
+// If the byte at p equals cv, atomically replaces it with sv; returns 1 if stored, else 0.
+// kmp_int8
+// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
+//
+// parameters:
+// 	p:	%rdi
+// 	cv:	%esi	(only the low byte, %sil, is read)
+//	sv:	%edx	(only the low byte, %dl, is read)
+//
+// return:	%eax
+        .text
+        PROC  __kmp_compare_and_store8
+
+        movb      %sil, %al	// "cv"
+        lock
+        cmpxchgb  %dl,(%rdi)	// if (%rdi) == %al then (%rdi) <- %dl and ZF=1, else ZF=0
+        sete      %al           // if %al == (%rdi) set %al = 1 else set %al = 0
+        andq      $1, %rax      // clear every bit of %rax except bit 0, leaving just the 0/1 flag
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store8
+
+
+//------------------------------------------------------------------------
+// FUNCTION __kmp_compare_and_store16
+// If the 16-bit value at p equals cv, atomically replaces it with sv; returns 1 if stored, else 0.
+// kmp_int16
+// __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
+//
+// parameters:
+// 	p:	%rdi
+// 	cv:	%si
+//	sv:	%dx
+//
+// return:	%eax
+        .text
+        PROC  __kmp_compare_and_store16
+
+        movw      %si, %ax	// "cv"
+        lock
+        cmpxchgw  %dx,(%rdi)	// if (%rdi) == %ax then (%rdi) <- %dx and ZF=1, else ZF=0
+        sete      %al           // if %ax == (%rdi) set %al = 1 else set %al = 0
+        andq      $1, %rax      // clear every bit of %rax except bit 0, leaving just the 0/1 flag
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store16
+
+
+//------------------------------------------------------------------------
+// FUNCTION __kmp_compare_and_store32
+// If the 32-bit value at p equals cv, atomically replaces it with sv; returns 1 if stored, else 0.
+// kmp_int32
+// __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
+//
+// parameters:
+// 	p:	%rdi
+// 	cv:	%esi
+//	sv:	%edx
+//
+// return:	%eax
+        .text
+        PROC  __kmp_compare_and_store32
+
+        movl      %esi, %eax	// "cv"
+        lock
+        cmpxchgl  %edx,(%rdi)	// if (%rdi) == %eax then (%rdi) <- %edx and ZF=1, else ZF=0
+        sete      %al           // if %eax == (%rdi) set %al = 1 else set %al = 0
+        andq      $1, %rax      // clear every bit of %rax except bit 0, leaving just the 0/1 flag
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store32
+
+
+//------------------------------------------------------------------------
+// FUNCTION __kmp_compare_and_store64
+// If the 64-bit value at p equals cv, atomically replaces it with sv; returns 1 if stored, else 0.
+// kmp_int32
+// __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
+//
+// parameters:
+// 	p:	%rdi
+// 	cv:	%rsi
+//	sv:	%rdx
+//	return:	%eax
+        .text
+        PROC  __kmp_compare_and_store64
+
+        movq      %rsi, %rax    // "cv"
+        lock
+        cmpxchgq  %rdx,(%rdi)	// if (%rdi) == %rax then (%rdi) <- %rdx and ZF=1, else ZF=0
+        sete      %al           // if %rax == (%rdi) set %al = 1 else set %al = 0
+        andq      $1, %rax      // clear every bit of %rax except bit 0, leaving just the 0/1 flag
+        ret
+
+        DEBUG_INFO __kmp_compare_and_store64
+
+//------------------------------------------------------------------------
+// FUNCTION __kmp_compare_and_store_ret8
+// If the byte at p equals cv, atomically replaces it with sv; returns the byte p held before.
+// kmp_int8
+// __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
+//
+// parameters:
+// 	p:	%rdi
+// 	cv:	%esi	(only the low byte, %sil, is read)
+//	sv:	%edx	(only the low byte, %dl, is read)
+//
+// return:	%eax	(only %al is written here; the upper bits keep their entry value)
+        .text
+        PROC  __kmp_compare_and_store_ret8
+
+        movb      %sil, %al	// "cv"
+        lock
+        cmpxchgb  %dl,(%rdi)	// equal: (%rdi) <- %dl; not equal: %al <- (%rdi)
+        ret			// either way %al now holds the previous contents of (%rdi)
+
+        DEBUG_INFO __kmp_compare_and_store_ret8
+
+
+//------------------------------------------------------------------------
+// FUNCTION __kmp_compare_and_store_ret16
+// If the 16-bit value at p equals cv, atomically replaces it with sv; returns the value p held before.
+// kmp_int16
+// __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
+//
+// parameters:
+// 	p:	%rdi
+// 	cv:	%si
+//	sv:	%dx
+//
+// return:	%eax	(only %ax is written here; the upper bits keep their entry value)
+        .text
+        PROC  __kmp_compare_and_store_ret16
+
+        movw      %si, %ax	// "cv"
+        lock
+        cmpxchgw  %dx,(%rdi)	// equal: (%rdi) <- %dx; not equal: %ax <- (%rdi)
+        ret			// either way %ax now holds the previous contents of (%rdi)
+
+        DEBUG_INFO __kmp_compare_and_store_ret16
+
+
+//------------------------------------------------------------------------
+// FUNCTION __kmp_compare_and_store_ret32
+// If the 32-bit value at p equals cv, atomically replaces it with sv; returns the value p held before.
+// kmp_int32
+// __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
+//
+// parameters:
+// 	p:	%rdi
+// 	cv:	%esi
+//	sv:	%edx
+//
+// return:	%eax
+        .text
+        PROC  __kmp_compare_and_store_ret32
+
+        movl      %esi, %eax	// "cv"
+        lock
+        cmpxchgl  %edx,(%rdi)	// equal: (%rdi) <- %edx; not equal: %eax <- (%rdi)
+        ret			// either way %eax now holds the previous contents of (%rdi)
+
+        DEBUG_INFO __kmp_compare_and_store_ret32
+
+
+//------------------------------------------------------------------------
+// FUNCTION __kmp_compare_and_store_ret64
+// If the 64-bit value at p equals cv, atomically replaces it with sv; returns the value p held before.
+// kmp_int64
+// __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
+//
+// parameters:
+// 	p:	%rdi
+// 	cv:	%rsi
+//	sv:	%rdx
+//	return:	%rax
+        .text
+        PROC  __kmp_compare_and_store_ret64
+
+        movq      %rsi, %rax    // "cv"
+        lock
+        cmpxchgq  %rdx,(%rdi)	// equal: (%rdi) <- %rdx; not equal: %rax <- (%rdi)
+        ret			// either way %rax now holds the previous contents of (%rdi)
+
+        DEBUG_INFO __kmp_compare_and_store_ret64
+
+# endif /* !KMP_ASM_INTRINS */
+
+
+# if !KMP_MIC
+
+# if !KMP_ASM_INTRINS
+
+//------------------------------------------------------------------------
+// FUNCTION __kmp_xchg_real32
+// Atomically stores data into the 4 bytes at addr and returns the 4 bytes it replaced.
+// kmp_real32
+// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
+//
+// parameters:
+// 	addr:	%rdi
+// 	data:	%xmm0 (lower 4 bytes)
+//
+// return:	%xmm0 (lower 4 bytes)
+        .text
+        PROC  __kmp_xchg_real32
+
+	movd	%xmm0, %eax	// load "data" to eax
+
+         lock			// xchg with a memory operand is atomic even without it
+         xchgl %eax, (%rdi)	// swap the raw bit patterns: no floating-point arithmetic is done
+
+	movd	%eax, %xmm0	// load old value into return register
+
+        ret
+
+        DEBUG_INFO __kmp_xchg_real32
+
+
+//------------------------------------------------------------------------
+// FUNCTION __kmp_xchg_real64
+// Atomically stores data into the 8 bytes at addr and returns the 8 bytes it replaced.
+// kmp_real64
+// __kmp_xchg_real64( volatile kmp_real64 *addr, kmp_real64 data );
+//
+// parameters:
+//      addr:   %rdi
+//      data:   %xmm0 (lower 8 bytes)
+//      return: %xmm0 (lower 8 bytes)
+        .text
+        PROC  __kmp_xchg_real64
+
+	movd	%xmm0, %rax	// load "data" to rax
+
+         lock			// xchg with a memory operand is atomic even without it
+	xchgq  %rax, (%rdi)	// swap the raw bit patterns: no floating-point arithmetic is done
+
+	movd	%rax, %xmm0	// load old value into return register
+        ret
+
+        DEBUG_INFO __kmp_xchg_real64
+
+
+# endif /* !KMP_ASM_INTRINS */
+
+# endif /* !KMP_MIC */
+
+//------------------------------------------------------------------------
+// typedef void	(*microtask_t)( int *gtid, int *tid, ... );
+//
+// int
+// __kmp_invoke_microtask( void (*pkfn) (int *gtid, int *tid, ...),
+//		           int gtid, int tid,
+//                         int argc, void *p_argv[] ) {
+//    (*pkfn)( & gtid, & tid, argv[0], ... );
+//    return 1;
+// }
+// Calls pkfn with pointers to gtid and tid followed by the argc values in p_argv.
+// note: at call to pkfn must have %rsp 128-byte aligned for compiler
+//
+// parameters:
+//      %rdi:  	pkfn
+//	%esi:	gtid
+//	%edx:	tid
+//	%ecx:	argc
+//	%r8:	p_argv
+//	%r9:	&exit_frame (only written when OMPT_SUPPORT is enabled)
+//
+// locals:
+//	__gtid:	gtid parm pushed on stack so can pass &gtid to pkfn
+//	__tid:	tid parm pushed on stack so can pass &tid to pkfn
+//
+// reg temps:
+//	%rax:	used all over the place
+//	%rdx:	used in stack pointer alignment calculation
+//	%r11:	used to traverse p_argv array
+//	%rsi:	used as temporary for stack parameters
+//		used as temporary for number of pkfn parms to push
+//	%rbx:	used to hold pkfn address, and zero constant, callee-save
+//
+// return:	%eax 	(always 1/TRUE)
+__gtid = -16
+__tid = -24
+
+// -- Begin __kmp_invoke_microtask
+// mark_begin;
+        .text
+	PROC  __kmp_invoke_microtask
+
+	pushq 	%rbp		// save base pointer
+	KMP_CFI_DEF_OFFSET 16
+	KMP_CFI_OFFSET rbp,-16
+	movq 	%rsp,%rbp	// establish the base pointer for this routine.
+	KMP_CFI_REGISTER rbp
+
+#if OMPT_SUPPORT
+	movq	%rbp, (%r9)	// save exit_frame
+#endif
+
+	pushq 	%rbx		// %rbx is callee-saved register
+	pushq	%rsi		// Put gtid on stack so can pass &gtid to pkfn
+	pushq	%rdx		// Put tid on stack so can pass &tid to pkfn
+
+	movq	%rcx, %rax	// Stack alignment calculation begins; argc -> %rax
+	movq	$0, %rbx	// constant for cmovs later
+	subq	$4, %rax	// subtract four args passed in registers to pkfn
+#if KMP_MIC
+	js	KMP_LABEL(kmp_0)	// jump to movq
+	jmp	KMP_LABEL(kmp_0_exit)	// jump ahead
+KMP_LABEL(kmp_0):
+	movq	%rbx, %rax	// zero negative value in %rax <- max(0, argc-4)
+KMP_LABEL(kmp_0_exit):
+#else
+	cmovsq	%rbx, %rax	// zero negative value in %rax <- max(0, argc-4)
+#endif // KMP_MIC
+
+	movq	%rax, %rsi	// save max(0, argc-4) -> %rsi for later
+	shlq 	$3, %rax	// Number of bytes used on stack: max(0, argc-4)*8
+
+	movq 	%rsp, %rdx	//
+	subq 	%rax, %rdx	// %rsp-(max(0,argc-4)*8) -> %rdx --
+				// without align, stack ptr would be this
+	movq 	%rdx, %rax	// Save to %rax
+
+	andq 	$0xFFFFFFFFFFFFFF80, %rax  // mask off lower 7 bits (128 bytes align)
+	subq 	%rax, %rdx	// Amount to subtract from %rsp
+	subq 	%rdx, %rsp	// Prepare the stack ptr --
+				// now %rsp will align to 128-byte boundary at call site
+
+				// setup pkfn parameter reg and stack
+	movq	%rcx, %rax	// argc -> %rax
+	cmpq	$0, %rsi
+	je	KMP_LABEL(kmp_invoke_pass_parms)	// jump ahead if no parms to push
+	shlq	$3, %rcx	// argc*8 -> %rcx
+	movq 	%r8, %rdx	// p_argv -> %rdx
+	addq	%rcx, %rdx	// &p_argv[argc] -> %rdx
+
+	movq	%rsi, %rcx	// max (0, argc-4) -> %rcx
+
+KMP_LABEL(kmp_invoke_push_parms):
+	// push nth - 7th parms to pkfn on stack
+	subq	$8, %rdx	// decrement p_argv pointer to previous parm
+	movq	(%rdx), %rsi	// p_argv[%rcx-1] -> %rsi
+	pushq	%rsi		// push p_argv[%rcx-1] onto stack (reverse order)
+	subl	$1, %ecx	// one fewer parm left to push
+
+// C69570: "X86_64_RELOC_BRANCH not supported" error at linking on mac_32e
+//		if the name of the label that is an operand of this jecxz starts with a dot (".");
+//	   Apple's linker does not support 1-byte length relocation;
+//         Resolution: replace all .labelX entries with L_labelX.
+
+	jecxz   KMP_LABEL(kmp_invoke_pass_parms)  // stop when four p_argv[] parms left
+	jmp	KMP_LABEL(kmp_invoke_push_parms)
+	ALIGN 3
+KMP_LABEL(kmp_invoke_pass_parms):	// put 1st - 6th parms to pkfn in registers.
+				// order here is important to avoid trashing
+				// registers used for both input and output parms!
+	movq	%rdi, %rbx	// pkfn -> %rbx
+	leaq	__gtid(%rbp), %rdi // &gtid -> %rdi (store 1st parm to pkfn)
+	leaq	__tid(%rbp), %rsi  // &tid -> %rsi (store 2nd parm to pkfn)
+
+	movq	%r8, %r11	// p_argv -> %r11
+
+#if KMP_MIC
+	cmpq	$4, %rax	// argc >= 4?
+	jns	KMP_LABEL(kmp_4)	// jump to movq
+	jmp	KMP_LABEL(kmp_4_exit)	// jump ahead
+KMP_LABEL(kmp_4):
+	movq	24(%r11), %r9	// p_argv[3] -> %r9 (store 6th parm to pkfn)
+KMP_LABEL(kmp_4_exit):
+
+	cmpq	$3, %rax	// argc >= 3?
+	jns	KMP_LABEL(kmp_3)	// jump to movq
+	jmp	KMP_LABEL(kmp_3_exit)	// jump ahead
+KMP_LABEL(kmp_3):
+	movq	16(%r11), %r8	// p_argv[2] -> %r8 (store 5th parm to pkfn)
+KMP_LABEL(kmp_3_exit):
+
+	cmpq	$2, %rax	// argc >= 2?
+	jns	KMP_LABEL(kmp_2)	// jump to movq
+	jmp	KMP_LABEL(kmp_2_exit)	// jump ahead
+KMP_LABEL(kmp_2):
+	movq	8(%r11), %rcx	// p_argv[1] -> %rcx (store 4th parm to pkfn)
+KMP_LABEL(kmp_2_exit):
+
+	cmpq	$1, %rax	// argc >= 1?
+	jns	KMP_LABEL(kmp_1)	// jump to movq
+	jmp	KMP_LABEL(kmp_1_exit)	// jump ahead
+KMP_LABEL(kmp_1):
+	movq	(%r11), %rdx	// p_argv[0] -> %rdx (store 3rd parm to pkfn)
+KMP_LABEL(kmp_1_exit):
+#else
+	cmpq	$4, %rax	// argc >= 4?
+	cmovnsq	24(%r11), %r9	// p_argv[3] -> %r9 (store 6th parm to pkfn)
+
+	cmpq	$3, %rax	// argc >= 3?
+	cmovnsq	16(%r11), %r8	// p_argv[2] -> %r8 (store 5th parm to pkfn)
+
+	cmpq	$2, %rax	// argc >= 2?
+	cmovnsq	8(%r11), %rcx	// p_argv[1] -> %rcx (store 4th parm to pkfn)
+
+	cmpq	$1, %rax	// argc >= 1?
+	cmovnsq	(%r11), %rdx	// p_argv[0] -> %rdx (store 3rd parm to pkfn)
+#endif // KMP_MIC
+
+	call	*%rbx		// call (*pkfn)();
+	movq	$1, %rax	// move 1 into return register;
+
+	movq	-8(%rbp), %rbx	// restore %rbx	using %rbp since %rsp was modified
+	movq 	%rbp, %rsp	// restore stack pointer
+	popq 	%rbp		// restore frame pointer
+	KMP_CFI_DEF rsp,8
+	ret
+
+	DEBUG_INFO __kmp_invoke_microtask
+// -- End  __kmp_invoke_microtask
+
+// kmp_uint64
+// __kmp_hardware_timestamp(void) -- returns the 64-bit time-stamp counter in %rax
+        .text
+	PROC  __kmp_hardware_timestamp
+	rdtsc			// counter -> %edx (high 32 bits) : %eax (low 32 bits)
+	shlq    $32, %rdx	// move the high half into bits 63..32
+	orq     %rdx, %rax	// combine both halves into %rax
+	ret
+
+	DEBUG_INFO __kmp_hardware_timestamp
+// -- End  __kmp_hardware_timestamp
+
+//------------------------------------------------------------------------
+// FUNCTION __kmp_bsr32
+// Returns the bit index of the most significant set bit of the argument.
+// int
+// __kmp_bsr32( int );
+        .text
+        PROC  __kmp_bsr32
+
+        bsr    %edi,%eax	// NOTE: the ISA leaves %eax undefined when %edi is 0
+        ret
+
+        DEBUG_INFO __kmp_bsr32
+
+// -----------------------------------------------------------------------
+#endif /* KMP_ARCH_X86_64 */
+
+// '
+#if (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64
+
+//------------------------------------------------------------------------
+// Calls pkfn with pointers to gtid and tid followed by the argc values in p_argv.
+// typedef void	(*microtask_t)( int *gtid, int *tid, ... );
+//
+// int
+// __kmp_invoke_microtask( void (*pkfn) (int *gtid, int *tid, ...),
+//		           int gtid, int tid,
+//                         int argc, void *p_argv[] ) {
+//    (*pkfn)( & gtid, & tid, argv[0], ... );
+//    return 1;
+// }
+//
+// parameters:
+//	x0:	pkfn
+//	w1:	gtid
+//	w2:	tid
+//	w3:	argc
+//	x4:	p_argv
+//	x5:	&exit_frame (only read when OMPT_SUPPORT is enabled)
+//
+// locals:
+//	__gtid:	gtid parm pushed on stack so can pass &gtid to pkfn
+//	__tid:	tid parm pushed on stack so can pass &tid to pkfn
+//
+// reg temps:
+//	 x8:	used to hold pkfn address
+//	 w9:	used as temporary for number of pkfn parms
+//	x10:	used to traverse p_argv array
+//	x11:	used as temporary for stack placement calculation
+//	x12:	used as temporary for stack parameters
+//	x19:	used to preserve exit_frame_ptr, callee-save
+//
+// return:	w0	(always 1/TRUE)
+//
+
+__gtid = 4	// gtid is kept 4 bytes below the frame base in x29
+__tid = 8	// tid is kept 8 bytes below the frame base in x29
+
+// -- Begin __kmp_invoke_microtask
+// mark_begin;
+	.text
+	PROC __kmp_invoke_microtask
+
+	stp	x29, x30, [sp, #-16]!	// push frame pointer and link register
+# if OMPT_SUPPORT
+	stp	x19, x20, [sp, #-16]!	// push x19/x20; x19 is used for &exit_frame below
+# endif
+	mov	x29, sp			// x29 = frame base for this routine
+
+	orr	w9, wzr, #1		// w9 = 1
+	add	w9, w9, w3, lsr #1	// w9 = 1 + argc/2
+	sub	sp, sp, w9, uxtw #4	// reserve w9 * 16 bytes below the frame base
+	mov	x11, sp			// x11 = where stack-passed pkfn parms get written
+
+	mov	x8, x0			// pkfn -> x8
+	str	w1, [x29, #-__gtid]	// store gtid so its address can be passed
+	str	w2, [x29, #-__tid]	// store tid so its address can be passed
+	mov	w9, w3			// argc -> w9 (count of parms still to place)
+	mov	x10, x4			// p_argv -> x10
+# if OMPT_SUPPORT
+	mov	x19, x5			// keep &exit_frame across the call
+	str	x29, [x19]		// *exit_frame = frame base
+# endif
+
+	sub	x0, x29, #__gtid	// &gtid -> x0 (1st parm to pkfn)
+	sub	x1, x29, #__tid		// &tid -> x1 (2nd parm to pkfn)
+
+	cbz	w9, KMP_LABEL(kmp_1)	// no more parms: go call pkfn
+	ldr	x2, [x10]		// p_argv[0] -> x2 (3rd parm to pkfn)
+
+	sub	w9, w9, #1
+	cbz	w9, KMP_LABEL(kmp_1)
+	ldr	x3, [x10, #8]!		// advance x10, then p_argv[1] -> x3
+
+	sub	w9, w9, #1
+	cbz	w9, KMP_LABEL(kmp_1)
+	ldr	x4, [x10, #8]!		// p_argv[2] -> x4
+
+	sub	w9, w9, #1
+	cbz	w9, KMP_LABEL(kmp_1)
+	ldr	x5, [x10, #8]!		// p_argv[3] -> x5
+
+	sub	w9, w9, #1
+	cbz	w9, KMP_LABEL(kmp_1)
+	ldr	x6, [x10, #8]!		// p_argv[4] -> x6
+
+	sub	w9, w9, #1
+	cbz	w9, KMP_LABEL(kmp_1)
+	ldr	x7, [x10, #8]!		// p_argv[5] -> x7 (last register parm)
+
+KMP_LABEL(kmp_0):			// copy any remaining parms to the stack area
+	sub	w9, w9, #1
+	cbz	w9, KMP_LABEL(kmp_1)
+	ldr	x12, [x10, #8]!		// next p_argv entry -> x12
+	str	x12, [x11], #8		// store it at x11, then advance x11 by 8
+	b	KMP_LABEL(kmp_0)
+KMP_LABEL(kmp_1):
+	blr	x8			// call (*pkfn)()
+	orr	w0, wzr, #1		// return value 1
+	mov	sp, x29			// drop the parm area
+# if OMPT_SUPPORT
+	str	xzr, [x19]		// *exit_frame = 0
+	ldp	x19, x20, [sp], #16	// restore x19/x20
+# endif
+	ldp	x29, x30, [sp], #16	// restore frame pointer and link register
+	ret
+
+	DEBUG_INFO __kmp_invoke_microtask
+// -- End  __kmp_invoke_microtask
+
+#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64 */
+
+#if KMP_ARCH_PPC64
+
+//------------------------------------------------------------------------
+// Calls pkfn with pointers to gtid and tid followed by the argc values in p_argv.
+// typedef void	(*microtask_t)( int *gtid, int *tid, ... );
+//
+// int
+// __kmp_invoke_microtask( void (*pkfn) (int *gtid, int *tid, ...),
+//		           int gtid, int tid,
+//                         int argc, void *p_argv[] ) {
+//    (*pkfn)( & gtid, & tid, argv[0], ... );
+//    return 1;
+// }
+//
+// parameters:
+//	r3:	pkfn
+//	r4:	gtid
+//	r5:	tid
+//	r6:	argc
+//	r7:	p_argv
+//	r8:	&exit_frame (only used when OMPT_SUPPORT is enabled)
+//
+// return:	r3	(always 1/TRUE)
+//
+	.text
+# if KMP_ARCH_PPC64_LE
+	.abiversion 2
+# endif
+	.globl	__kmp_invoke_microtask
+
+# if KMP_ARCH_PPC64_LE
+	.p2align	4
+# else
+	.p2align	2
+# endif
+
+	.type	__kmp_invoke_microtask,@function
+
+# if KMP_ARCH_PPC64_LE
+__kmp_invoke_microtask:
+.Lfunc_begin0:
+.Lfunc_gep0:
+	addis 2, 12, .TOC.-.Lfunc_gep0@ha
+	addi 2, 2, .TOC.-.Lfunc_gep0@l
+.Lfunc_lep0:
+	.localentry	__kmp_invoke_microtask, .Lfunc_lep0-.Lfunc_gep0
+# else
+	.section	.opd,"aw",@progbits
+__kmp_invoke_microtask:
+	.p2align	3
+	.quad	.Lfunc_begin0
+	.quad	.TOC.@tocbase
+	.quad	0
+	.text
+.Lfunc_begin0:
+# endif
+
+// -- Begin __kmp_invoke_microtask
+// mark_begin;
+
+// We need to allocate a stack frame large enough to hold all of the parameters
+// on the stack for the microtask plus what this function needs. That's 48
+// bytes under the ELFv1 ABI (32 bytes under ELFv2), plus 8*(2 + argc) for the
+// parameters to the microtask, plus 8 bytes to store the values of r4 and r5,
+// and 8 bytes to store r31. With OMP-T support, we need an additional 8 bytes
+// to save r30 to hold a copy of r8.
+
+	.cfi_startproc
+	mflr 0			// r0 <- return address
+	std 31, -8(1)		// save r31 just below the incoming stack pointer
+	std 0, 16(1)		// save the return address at 16(r1)
+
+// This is unusual because normally we'd set r31 equal to r1 after the stack
+// frame is established. In this case, however, we need to dynamically compute
+// the stack frame size, and so we keep a direct copy of r1 to access our
+// register save areas and restore the r1 value before returning.
+	mr 31, 1
+	.cfi_def_cfa_register r31
+	.cfi_offset r31, -8
+	.cfi_offset lr, 16
+
+// Compute the size necessary for the local stack frame.
+# if KMP_ARCH_PPC64_LE
+	li 12, 72
+# else
+	li 12, 88
+# endif
+	sldi 0, 6, 3		// r0 = argc * 8
+	add 12, 0, 12		// r12 = fixed part + argc * 8
+	neg 12, 12		// negate: the frame grows toward lower addresses
+
+// We need to make sure that the stack frame stays aligned (to 16 bytes, except
+// under the BG/Q CNK, where it must be to 32 bytes).
+# if KMP_OS_CNK
+	li 0, -32
+# else
+	li 0, -16
+# endif
+	and 12, 0, 12		// round the negative size down to the alignment
+
+// Establish the local stack frame.
+	stdux 1, 1, 12		// store old r1 at the new frame base and update r1
+
+# if OMPT_SUPPORT
+	.cfi_offset r30, -16
+	std 30, -16(31)		// save r30 before reusing it
+	std 1, 0(8)		// *exit_frame = new stack pointer
+	mr 30, 8		// keep &exit_frame across the call
+# endif
+
+// Store gtid and tid to the stack because they're passed by reference to the microtask.
+	stw 4, -20(31)
+	stw 5, -24(31)
+
+	mr 12, 6		// argc -> r12
+	mr 4, 7			// p_argv -> r4
+
+	cmpwi 0, 12, 1
+	blt	 0, .Lcall	// argc < 1: nothing more to pass
+
+	ld 5, 0(4)		// p_argv[0] -> r5 (3rd parm to pkfn)
+
+	cmpwi 0, 12, 2
+	blt	 0, .Lcall
+
+	ld 6, 8(4)		// p_argv[1] -> r6
+
+	cmpwi 0, 12, 3
+	blt	 0, .Lcall
+
+	ld 7, 16(4)		// p_argv[2] -> r7
+
+	cmpwi 0, 12, 4
+	blt	 0, .Lcall
+
+	ld 8, 24(4)		// p_argv[3] -> r8
+
+	cmpwi 0, 12, 5
+	blt	 0, .Lcall
+
+	ld 9, 32(4)		// p_argv[4] -> r9
+
+	cmpwi 0, 12, 6
+	blt	 0, .Lcall
+
+	ld 10, 40(4)		// p_argv[5] -> r10 (last register parm)
+
+	cmpwi 0, 12, 7
+	blt	 0, .Lcall
+
+// There are more than 6 microtask parameters, so we need to store the
+// remainder to the stack.
+	addi 12, 12, -6		// number of parms that go on the stack
+	mtctr 12		// loop counter for bdnz below
+
+// These are set to 8 bytes before the first desired store address (we're using
+// pre-increment loads and stores in the loop below). The parameter save area
+// for the microtask begins 48 + 8*8 == 112 bytes above r1 for ELFv1 and
+// 32 + 8*8 == 96 bytes above r1 for ELFv2.
+	addi 4, 4, 40
+# if KMP_ARCH_PPC64_LE
+	addi 12, 1, 88
+# else
+	addi 12, 1, 104
+# endif
+
+.Lnext:
+	ldu 0, 8(4)		// advance r4 by 8 and load the next p_argv entry
+	stdu 0, 8(12)		// advance r12 by 8 and store it in the save area
+	bdnz .Lnext		// repeat until the counter reaches zero
+
+.Lcall:
+# if KMP_ARCH_PPC64_LE
+	std 2, 24(1)
+	mr 12, 3
+#else
+	std 2, 40(1)
+// For ELFv1, we need to load the actual function address from the function descriptor.
+	ld 12, 0(3)
+	ld 2, 8(3)
+	ld 11, 16(3)
+#endif
+
+	addi 3, 31, -20		// &gtid -> r3 (1st parm to pkfn)
+	addi 4, 31, -24		// &tid -> r4 (2nd parm to pkfn)
+
+	mtctr 12		// target address -> count register
+	bctrl			// call (*pkfn)()
+# if KMP_ARCH_PPC64_LE
+	ld 2, 24(1)
+# else
+	ld 2, 40(1)
+# endif
+
+# if OMPT_SUPPORT
+	li 3, 0
+	std 3, 0(30)		// *exit_frame = 0
+# endif
+
+	li 3, 1			// return value 1
+
+# if OMPT_SUPPORT
+	ld 30, -16(31)		// restore r30
+# endif
+
+	mr 1, 31		// restore the stack pointer saved in r31
+	ld 0, 16(1)		// reload the return address
+	ld 31, -8(1)		// restore r31
+	mtlr 0
+	blr
+
+	.long	0
+	.quad	0
+.Lfunc_end0:
+	.size	__kmp_invoke_microtask, .Lfunc_end0-.Lfunc_begin0
+	.cfi_endproc
+
+// -- End  __kmp_invoke_microtask
+
+#endif /* KMP_ARCH_PPC64 */
+
+#if KMP_ARCH_ARM || KMP_ARCH_MIPS
+    .data
+    .comm .gomp_critical_user_,32,8	// common symbol: 32 bytes, alignment 8
+    .data
+    .align 4
+    .global __kmp_unnamed_critical_addr
+__kmp_unnamed_critical_addr:		// 4-byte word holding the address of .gomp_critical_user_
+    .4byte .gomp_critical_user_
+    .size __kmp_unnamed_critical_addr,4
+#endif /* KMP_ARCH_ARM || KMP_ARCH_MIPS */
+
+#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
+    .data
+    .comm .gomp_critical_user_,32,8	// common symbol: 32 bytes, alignment 8
+    .data
+    .align 8
+    .global __kmp_unnamed_critical_addr
+__kmp_unnamed_critical_addr:		// 8-byte word holding the address of .gomp_critical_user_
+    .8byte .gomp_critical_user_
+    .size __kmp_unnamed_critical_addr,8
+#endif /* KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 */
+
+#if KMP_OS_LINUX
+# if KMP_ARCH_ARM
+.section .note.GNU-stack,"",%progbits	// ARM form of the section type (percent sign instead of at sign)
+# else
+.section .note.GNU-stack,"",@progbits	// empty note section; presumably marks the stack as non-executable -- confirm
+# endif
+#endif
diff --git a/final/runtime/src/z_Linux_util.cpp b/final/runtime/src/z_Linux_util.cpp
new file mode 100644
index 0000000..1983fc2
--- /dev/null
+++ b/final/runtime/src/z_Linux_util.cpp
@@ -0,0 +1,2427 @@
+/*
+ * z_Linux_util.cpp -- platform specific routines.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+#include "kmp_affinity.h"
+#include "kmp_i18n.h"
+#include "kmp_io.h"
+#include "kmp_itt.h"
+#include "kmp_lock.h"
+#include "kmp_stats.h"
+#include "kmp_str.h"
+#include "kmp_wait_release.h"
+#include "kmp_wrapper_getpid.h"
+
+#if !KMP_OS_DRAGONFLY && !KMP_OS_FREEBSD && !KMP_OS_NETBSD && !KMP_OS_OPENBSD
+#include <alloca.h>
+#endif
+#include <math.h> // HUGE_VAL.
+#include <sys/resource.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <sys/times.h>
+#include <unistd.h>
+
+#if KMP_OS_LINUX && !KMP_OS_CNK
+#include <sys/sysinfo.h>
+#if KMP_USE_FUTEX
+// We should really include <futex.h>, but that causes compatibility problems on
+// different Linux* OS distributions that either require that you include (or
+// break when you try to include) <pci/types.h>. Since all we need is the two
+// macros below (which are part of the kernel ABI, so can't change) we just
+// define the constants here and don't include <futex.h>
+#ifndef FUTEX_WAIT
+#define FUTEX_WAIT 0
+#endif
+#ifndef FUTEX_WAKE
+#define FUTEX_WAKE 1
+#endif
+#endif
+#elif KMP_OS_DARWIN
+#include <mach/mach.h>
+#include <sys/sysctl.h>
+#elif KMP_OS_DRAGONFLY || KMP_OS_FREEBSD
+#include <pthread_np.h>
+#elif KMP_OS_NETBSD
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+
+#include <ctype.h>
+#include <dirent.h>
+#include <fcntl.h>
+
+#include "tsan_annotations.h"
+
+struct kmp_sys_timer {
+  struct timespec start; // presumably the timer's reference point; confirm at use
+};
+
+// Convert timespec to nanoseconds. The 1e9 literal makes the result a double.
+#define TS2NS(timespec) (((timespec).tv_sec * 1e9) + (timespec).tv_nsec)
+
+static struct kmp_sys_timer __kmp_sys_timer_data;
+
+#if KMP_HANDLE_SIGNALS
+typedef void (*sig_func_t)(int); // signature of a plain signal handler
+STATIC_EFI2_WORKAROUND struct sigaction __kmp_sighldrs[NSIG]; // one slot per signal
+static sigset_t __kmp_sigset;
+#endif
+
+static int __kmp_init_runtime = FALSE;
+
+static int __kmp_fork_count = 0;
+
+static pthread_condattr_t __kmp_suspend_cond_attr;
+static pthread_mutexattr_t __kmp_suspend_mutex_attr;
+
+static kmp_cond_align_t __kmp_wait_cv;
+static kmp_mutex_align_t __kmp_wait_mx;
+// NOTE(review): initial value only; whether it is recalibrated is not visible here
+kmp_uint64 __kmp_ticks_per_msec = 1000000;
+
+#ifdef DEBUG_SUSPEND
+static void __kmp_print_cond(char *buffer, kmp_cond_align_t *cond) { // debug aid
+  KMP_SNPRINTF(buffer, 128, "(cond (lock (%ld, %d)), (descr (%p)))", // buffer is assumed to hold at least 128 bytes
+               cond->c_cond.__c_lock.__status, cond->c_cond.__c_lock.__spinlock, // libc-internal fields of the condition variable
+               cond->c_cond.__c_waiting);
+}
+#endif
+
+#if (KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED)
+
+/* Affinity support */
+
+// Builds a mask containing only processor `which` and hands it to
+// __kmp_set_system_affinity. Asserts that affinity is usable first.
+void __kmp_affinity_bind_thread(int which) {
+  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
+              "Illegal set affinity operation when not capable");
+  kmp_affin_mask_t *only_cpu; // temporary mask, released before returning
+  KMP_CPU_ALLOC_ON_STACK(only_cpu);
+  KMP_CPU_ZERO(only_cpu); // start from the empty set ...
+  KMP_CPU_SET(which, only_cpu); // ... and add just the requested processor
+  __kmp_set_system_affinity(only_cpu, TRUE);
+  KMP_CPU_FREE_FROM_STACK(only_cpu); }
+
+/* Determine if we can access affinity functionality on this version of
+ * Linux* OS by checking __NR_sched_{get,set}affinity system calls, and set
+ * __kmp_affin_mask_size to the appropriate value (0 means not capable).
+ *
+ * env_var is only forwarded to the warning messages issued on failure. Every
+ * exit path releases the scratch buffer exactly once and returns right after
+ * doing so, so the buffer is never touched again once it has been freed. */
+void __kmp_affinity_determine_capable(const char *env_var) {
+// Check and see if the OS supports thread affinity.
+
+#define KMP_CPU_SET_SIZE_LIMIT (1024 * 1024)
+
+  int gCode;
+  int sCode;
+  unsigned char *buf;
+  buf = (unsigned char *)KMP_INTERNAL_MALLOC(KMP_CPU_SET_SIZE_LIMIT);
+
+  // If Linux* OS:
+  // If the syscall fails or returns a suggestion for the size,
+  // then we don't have to search for an appropriate size.
+  gCode = syscall(__NR_sched_getaffinity, 0, KMP_CPU_SET_SIZE_LIMIT, buf);
+  KA_TRACE(30, ("__kmp_affinity_determine_capable: "
+                "initial getaffinity call returned %d errno = %d\n",
+                gCode, errno));
+
+  if (gCode < 0) {
+    // System call not supported
+    if (__kmp_affinity_verbose ||
+        (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none) &&
+         (__kmp_affinity_type != affinity_default) &&
+         (__kmp_affinity_type != affinity_disabled))) {
+      int error = errno;
+      kmp_msg_t err_code = KMP_ERR(error);
+      __kmp_msg(kmp_ms_warning, KMP_MSG(GetAffSysCallNotSupported, env_var),
+                err_code, __kmp_msg_null);
+      if (__kmp_generate_warnings == kmp_warnings_off) {
+        __kmp_str_free(&err_code.str);
+      }
+    }
+    KMP_AFFINITY_DISABLE();
+    KMP_INTERNAL_FREE(buf);
+    return;
+  }
+  if (gCode > 0) { // Linux* OS only
+    // The optimal situation: the OS returns the size of the buffer it expects.
+    //
+    // A verification of correct behavior is that setaffinity on a NULL
+    // buffer with the same size fails with errno set to EFAULT.
+    sCode = syscall(__NR_sched_setaffinity, 0, gCode, NULL);
+    KA_TRACE(30, ("__kmp_affinity_determine_capable: "
+                  "setaffinity for mask size %d returned %d errno = %d\n",
+                  gCode, sCode, errno));
+    if (sCode < 0) {
+      if (errno == ENOSYS) {
+        if (__kmp_affinity_verbose ||
+            (__kmp_affinity_warnings &&
+             (__kmp_affinity_type != affinity_none) &&
+             (__kmp_affinity_type != affinity_default) &&
+             (__kmp_affinity_type != affinity_disabled))) {
+          int error = errno;
+          kmp_msg_t err_code = KMP_ERR(error);
+          __kmp_msg(kmp_ms_warning, KMP_MSG(SetAffSysCallNotSupported, env_var),
+                    err_code, __kmp_msg_null);
+          if (__kmp_generate_warnings == kmp_warnings_off) {
+            __kmp_str_free(&err_code.str);
+          }
+        }
+        KMP_AFFINITY_DISABLE();
+        KMP_INTERNAL_FREE(buf);
+        return; // buf is gone: must not fall through to the search loop below
+      }
+      if (errno == EFAULT) {
+        KMP_AFFINITY_ENABLE(gCode);
+        KA_TRACE(10, ("__kmp_affinity_determine_capable: "
+                      "affinity supported (mask size %d)\n",
+                      (int)__kmp_affin_mask_size));
+        KMP_INTERNAL_FREE(buf);
+        return;
+      }
+    }
+  }
+
+  // Call the getaffinity system call repeatedly with increasing set sizes
+  // until we succeed, or reach an upper bound on the search.
+  KA_TRACE(30, ("__kmp_affinity_determine_capable: "
+                "searching for proper set size\n"));
+  int size;
+  for (size = 1; size <= KMP_CPU_SET_SIZE_LIMIT; size *= 2) {
+    gCode = syscall(__NR_sched_getaffinity, 0, size, buf);
+    KA_TRACE(30, ("__kmp_affinity_determine_capable: "
+                  "getaffinity for mask size %d returned %d errno = %d\n",
+                  size, gCode, errno));
+
+    if (gCode < 0) {
+      if (errno == ENOSYS) {
+        // We shouldn't get here
+        KA_TRACE(30, ("__kmp_affinity_determine_capable: "
+                      "inconsistent OS call behavior: errno == ENOSYS for mask "
+                      "size %d\n",
+                      size));
+        if (__kmp_affinity_verbose ||
+            (__kmp_affinity_warnings &&
+             (__kmp_affinity_type != affinity_none) &&
+             (__kmp_affinity_type != affinity_default) &&
+             (__kmp_affinity_type != affinity_disabled))) {
+          int error = errno;
+          kmp_msg_t err_code = KMP_ERR(error);
+          __kmp_msg(kmp_ms_warning, KMP_MSG(GetAffSysCallNotSupported, env_var),
+                    err_code, __kmp_msg_null);
+          if (__kmp_generate_warnings == kmp_warnings_off) {
+            __kmp_str_free(&err_code.str);
+          }
+        }
+        KMP_AFFINITY_DISABLE();
+        KMP_INTERNAL_FREE(buf);
+        return;
+      }
+      continue;
+    }
+
+    sCode = syscall(__NR_sched_setaffinity, 0, gCode, NULL);
+    KA_TRACE(30, ("__kmp_affinity_determine_capable: "
+                  "setaffinity for mask size %d returned %d errno = %d\n",
+                  gCode, sCode, errno));
+    if (sCode < 0) {
+      if (errno == ENOSYS) { // Linux* OS only
+        // We shouldn't get here
+        KA_TRACE(30, ("__kmp_affinity_determine_capable: "
+                      "inconsistent OS call behavior: errno == ENOSYS for mask "
+                      "size %d\n",
+                      size));
+        if (__kmp_affinity_verbose ||
+            (__kmp_affinity_warnings &&
+             (__kmp_affinity_type != affinity_none) &&
+             (__kmp_affinity_type != affinity_default) &&
+             (__kmp_affinity_type != affinity_disabled))) {
+          int error = errno;
+          kmp_msg_t err_code = KMP_ERR(error);
+          __kmp_msg(kmp_ms_warning, KMP_MSG(SetAffSysCallNotSupported, env_var),
+                    err_code, __kmp_msg_null);
+          if (__kmp_generate_warnings == kmp_warnings_off) {
+            __kmp_str_free(&err_code.str);
+          }
+        }
+        KMP_AFFINITY_DISABLE();
+        KMP_INTERNAL_FREE(buf);
+        return;
+      }
+      if (errno == EFAULT) {
+        KMP_AFFINITY_ENABLE(gCode);
+        KA_TRACE(10, ("__kmp_affinity_determine_capable: "
+                      "affinity supported (mask size %d)\n",
+                      (int)__kmp_affin_mask_size));
+        KMP_INTERNAL_FREE(buf);
+        return;
+      }
+    }
+  }
+  KMP_INTERNAL_FREE(buf);
+
+  // Affinity is not supported
+  KMP_AFFINITY_DISABLE();
+  KA_TRACE(10, ("__kmp_affinity_determine_capable: "
+                "cannot determine mask size - affinity not supported\n"));
+  if (__kmp_affinity_verbose ||
+      (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none) &&
+       (__kmp_affinity_type != affinity_default) &&
+       (__kmp_affinity_type != affinity_disabled))) {
+    KMP_WARNING(AffCantGetMaskSize, env_var);
+  }
+}
+
+#endif // KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
+
+#if KMP_USE_FUTEX
+
+// Probes for the futex syscall with a FUTEX_WAKE on a local dummy word.
+// Returns 0 only when that call fails with ENOSYS, and 1 otherwise.
+int __kmp_futex_determine_capable() {
+  int dummy_word = 0;
+  int status = syscall(__NR_futex, &dummy_word, FUTEX_WAKE, 1, NULL, NULL, 0);
+  int supported = !((status != 0) && (errno == ENOSYS));
+  KA_TRACE(10, ("__kmp_futex_determine_capable: rc = %d errno = %d\n", status,
+                errno));
+  KA_TRACE(10, ("__kmp_futex_determine_capable: futex syscall%s supported\n",
+                supported ? "" : " not"));
+  return supported;
+}
+
+#endif // KMP_USE_FUTEX
+
+#if (KMP_ARCH_X86 || KMP_ARCH_X86_64) && (!KMP_ASM_INTRINS)
+/* Only 32-bit "add-exchange" instruction on IA-32 architecture causes us to
+   use compare_and_store for these routines */
+
+// Fetch-and-OR on an 8-bit value, built from a compare-and-store retry loop.
+// Returns the value read from *p on the attempt whose store succeeded.
+kmp_int8 __kmp_test_then_or8(volatile kmp_int8 *p, kmp_int8 d) {
+  kmp_int8 seen, merged;
+  for (;;) {
+    seen = TCR_1(*p); // snapshot the current contents
+    merged = seen | d;
+    if (KMP_COMPARE_AND_STORE_REL8(p, seen, merged))
+      break; // the store went through
+    KMP_CPU_PAUSE(); // store did not happen: pause, then take a new snapshot
+  }
+  return seen;
+}
+
+kmp_int8 __kmp_test_then_and8(volatile kmp_int8 *p, kmp_int8 d) {
+  // Atomically ANDs d into *p via a CAS retry loop; returns the prior value.
+  kmp_int8 seen, wanted;
+  for (;;) {
+    seen = TCR_1(*p);
+    wanted = seen & d;
+    if (KMP_COMPARE_AND_STORE_REL8(p, seen, wanted))
+      break;
+    // The compare-and-store failed: pause, then re-read *p and try again.
+    KMP_CPU_PAUSE();
+  }
+  return seen;
+}
+
+kmp_uint32 __kmp_test_then_or32(volatile kmp_uint32 *p, kmp_uint32 d) {
+  // Atomically ORs d into *p via a CAS retry loop; returns the prior value.
+  kmp_uint32 seen, wanted;
+  for (;;) {
+    seen = TCR_4(*p);
+    wanted = seen | d;
+    if (KMP_COMPARE_AND_STORE_REL32(p, seen, wanted))
+      break;
+    // The compare-and-store failed: pause, then re-read *p and try again.
+    KMP_CPU_PAUSE();
+  }
+  return seen;
+}
+
+kmp_uint32 __kmp_test_then_and32(volatile kmp_uint32 *p, kmp_uint32 d) {
+  // Atomically ANDs d into *p via a CAS retry loop; returns the prior value.
+  kmp_uint32 seen, wanted;
+  for (;;) {
+    seen = TCR_4(*p);
+    wanted = seen & d;
+    if (KMP_COMPARE_AND_STORE_REL32(p, seen, wanted))
+      break;
+    // The compare-and-store failed: pause, then re-read *p and try again.
+    KMP_CPU_PAUSE();
+  }
+  return seen;
+}
+
+#if KMP_ARCH_X86
+kmp_int8 __kmp_test_then_add8(volatile kmp_int8 *p, kmp_int8 d) {
+  // Atomically adds d to *p via a CAS retry loop; returns the prior value.
+  kmp_int8 seen, wanted;
+  for (;;) {
+    seen = TCR_1(*p);
+    wanted = seen + d;
+    if (KMP_COMPARE_AND_STORE_REL8(p, seen, wanted))
+      break;
+    // The compare-and-store failed: pause, then re-read *p and try again.
+    KMP_CPU_PAUSE();
+  }
+  return seen;
+}
+
+kmp_int64 __kmp_test_then_add64(volatile kmp_int64 *p, kmp_int64 d) {
+  // Atomically adds d to *p via a CAS retry loop; returns the prior value.
+  kmp_int64 seen, wanted;
+  for (;;) {
+    seen = TCR_8(*p);
+    wanted = seen + d;
+    if (KMP_COMPARE_AND_STORE_REL64(p, seen, wanted))
+      break;
+    // The compare-and-store failed: pause, then re-read *p and try again.
+    KMP_CPU_PAUSE();
+  }
+  return seen;
+}
+#endif /* KMP_ARCH_X86 */
+
+kmp_uint64 __kmp_test_then_or64(volatile kmp_uint64 *p, kmp_uint64 d) {
+  // Atomically ORs d into *p via a CAS retry loop; returns the prior value.
+  kmp_uint64 seen, wanted;
+  for (;;) {
+    seen = TCR_8(*p);
+    wanted = seen | d;
+    if (KMP_COMPARE_AND_STORE_REL64(p, seen, wanted))
+      break;
+    KMP_CPU_PAUSE();
+  }
+  return seen;
+}
+
+kmp_uint64 __kmp_test_then_and64(volatile kmp_uint64 *p, kmp_uint64 d) {
+  // Atomically ANDs d into *p via a CAS retry loop; returns the prior value.
+  kmp_uint64 seen, wanted;
+  for (;;) {
+    seen = TCR_8(*p);
+    wanted = seen & d;
+    if (KMP_COMPARE_AND_STORE_REL64(p, seen, wanted))
+      break;
+    KMP_CPU_PAUSE();
+  }
+  return seen;
+}
+
+#endif /* (KMP_ARCH_X86 || KMP_ARCH_X86_64) && (! KMP_ASM_INTRINS) */
+
+void __kmp_terminate_thread(int gtid) {
+  // Cancels the thread for gtid (under KMP_CANCEL_THREADS), then yields.
+  int status;
+  kmp_info_t *target = __kmp_threads[gtid];
+  if (target == NULL)
+    return;
+
+#ifdef KMP_CANCEL_THREADS
+  KA_TRACE(10, ("__kmp_terminate_thread: kill (%d)\n", gtid));
+  status = pthread_cancel(target->th.th_info.ds.ds_thread);
+  if (!(status == 0 || status == ESRCH)) { // ESRCH is tolerated
+    __kmp_fatal(KMP_MSG(CantTerminateWorkerThread), KMP_ERR(status),
+                __kmp_msg_null);
+  }
+#endif
+  KMP_YIELD(TRUE);
+} // __kmp_terminate_thread
+
+/* Set thread stack info according to values returned by pthread_getattr_np().
+   If values are unreasonable, assume call failed and use incremental stack
+   refinement method instead. Returns TRUE if the stack parameters could be
+   determined exactly, FALSE if incremental refinement is necessary. */
+static kmp_int32 __kmp_set_stack_info(int gtid, kmp_info_t *th) {
+  int stack_data; // its address is the fallback estimate of the stack base
+#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
+        KMP_OS_HURD
+  pthread_attr_t attr;
+  int status;
+  size_t size = 0;
+  void *addr = 0;
+
+  /* Always do incremental stack refinement for ubermaster threads since the
+     initial thread stack range can be reduced by sibling thread creation so
+     pthread_attr_getstack may cause thread gtid aliasing */
+  if (!KMP_UBER_GTID(gtid)) {
+
+    /* Fetch the real thread attributes */
+    status = pthread_attr_init(&attr);
+    KMP_CHECK_SYSFAIL("pthread_attr_init", status);
+#if KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD
+    status = pthread_attr_get_np(pthread_self(), &attr);
+    KMP_CHECK_SYSFAIL("pthread_attr_get_np", status);
+#else
+    status = pthread_getattr_np(pthread_self(), &attr);
+    KMP_CHECK_SYSFAIL("pthread_getattr_np", status);
+#endif
+    status = pthread_attr_getstack(&attr, &addr, &size);
+    KMP_CHECK_SYSFAIL("pthread_attr_getstack", status);
+    KA_TRACE(60,
+             ("__kmp_set_stack_info: T#%d pthread_attr_getstack returned size:"
+              " %lu, low addr: %p\n",
+              gtid, size, addr));
+    status = pthread_attr_destroy(&attr);
+    KMP_CHECK_SYSFAIL("pthread_attr_destroy", status);
+  }
+  // For uber threads size and addr are still 0 here, so the fallback is used.
+  if (size != 0 && addr != 0) { // was stack parameter determination successful?
+    /* Store the base (low address + size, i.e. the high end) and the size */
+    TCW_PTR(th->th.th_info.ds.ds_stackbase, (((char *)addr) + size));
+    TCW_PTR(th->th.th_info.ds.ds_stacksize, size);
+    TCW_4(th->th.th_info.ds.ds_stackgrow, FALSE);
+    return TRUE;
+  }
+#endif /* KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
+              KMP_OS_HURD */
+  /* Use incremental refinement starting from initial conservative estimate */
+  TCW_PTR(th->th.th_info.ds.ds_stacksize, 0);
+  TCW_PTR(th->th.th_info.ds.ds_stackbase, &stack_data);
+  TCW_4(th->th.th_info.ds.ds_stackgrow, TRUE);
+  return FALSE;
+}
+
+static void *__kmp_launch_worker(void *thr) {
+  // Start routine handed to pthread_create by __kmp_create_worker. thr is the
+  // worker's kmp_info_t; the result of __kmp_launch_thread is returned.
+  int status, old_type, old_state;
+#ifdef KMP_BLOCK_SIGNALS
+  sigset_t new_set, old_set;
+#endif /* KMP_BLOCK_SIGNALS */
+  void *exit_val;
+#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
+        KMP_OS_OPENBSD || KMP_OS_HURD
+  void *volatile padding = 0;
+#endif
+  int gtid = ((kmp_info_t *)thr)->th.th_info.ds.ds_gtid;
+  __kmp_gtid_set_specific(gtid);
+#ifdef KMP_TDATA_GTID
+  __kmp_gtid = gtid;
+#endif
+#if KMP_STATS_ENABLED
+  // set thread local index to point to thread-specific stats
+  __kmp_stats_thread_ptr = ((kmp_info_t *)thr)->th.th_stats;
+  __kmp_stats_thread_ptr->startLife();
+  KMP_SET_THREAD_STATE(IDLE);
+  KMP_INIT_PARTITIONED_TIMERS(OMP_idle);
+#endif
+
+#if USE_ITT_BUILD
+  __kmp_itt_thread_name(gtid);
+#endif /* USE_ITT_BUILD */
+
+#if KMP_AFFINITY_SUPPORTED
+  __kmp_affinity_set_init_mask(gtid, FALSE);
+#endif
+
+#ifdef KMP_CANCEL_THREADS
+  status = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old_type);
+  KMP_CHECK_SYSFAIL("pthread_setcanceltype", status);
+  // josh todo: isn't PTHREAD_CANCEL_ENABLE default for newly-created threads?
+  status = pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &old_state);
+  KMP_CHECK_SYSFAIL("pthread_setcancelstate", status);
+#endif
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+  // Set FP control regs to be a copy of the parallel initialization thread's.
+  __kmp_clear_x87_fpu_status_word();
+  __kmp_load_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
+  __kmp_load_mxcsr(&__kmp_init_mxcsr);
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+  // Block every signal in this worker; the old mask is restored before exit.
+#ifdef KMP_BLOCK_SIGNALS
+  status = sigfillset(&new_set);
+  KMP_CHECK_SYSFAIL_ERRNO("sigfillset", status);
+  status = pthread_sigmask(SIG_BLOCK, &new_set, &old_set);
+  KMP_CHECK_SYSFAIL("pthread_sigmask", status);
+#endif /* KMP_BLOCK_SIGNALS */
+  // Same OS list as the padding declaration above, so no listed OS is skipped.
+#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
+        KMP_OS_OPENBSD || KMP_OS_HURD
+  if (__kmp_stkoffset > 0 && gtid > 0) {
+    padding = KMP_ALLOCA(gtid * __kmp_stkoffset);
+  }
+#endif
+
+  KMP_MB();
+  __kmp_set_stack_info(gtid, (kmp_info_t *)thr);
+
+  __kmp_check_stack_overlap((kmp_info_t *)thr);
+
+  exit_val = __kmp_launch_thread((kmp_info_t *)thr);
+
+#ifdef KMP_BLOCK_SIGNALS
+  status = pthread_sigmask(SIG_SETMASK, &old_set, NULL);
+  KMP_CHECK_SYSFAIL("pthread_sigmask", status);
+#endif /* KMP_BLOCK_SIGNALS */
+
+  return exit_val;
+}
+
+#if KMP_USE_MONITOR
+/* The monitor thread controls all of the threads in the complex */
+
+static void *__kmp_launch_monitor(void *thr) {
+  int status, old_type, old_state;
+#ifdef KMP_BLOCK_SIGNALS
+  sigset_t new_set;
+#endif /* KMP_BLOCK_SIGNALS */
+  struct timespec interval;
+  // Monitor thread routine: ticks g_time once per wakeup until g_done is set.
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+  KA_TRACE(10, ("__kmp_launch_monitor: #1 launched\n"));
+
+  /* register us as the monitor thread */
+  __kmp_gtid_set_specific(KMP_GTID_MONITOR);
+#ifdef KMP_TDATA_GTID
+  __kmp_gtid = KMP_GTID_MONITOR;
+#endif
+
+  KMP_MB();
+
+#if USE_ITT_BUILD
+  // Instruct Intel(R) Threading Tools to ignore monitor thread.
+  __kmp_itt_thread_ignore();
+#endif /* USE_ITT_BUILD */
+
+  __kmp_set_stack_info(((kmp_info_t *)thr)->th.th_info.ds.ds_gtid,
+                       (kmp_info_t *)thr);
+
+  __kmp_check_stack_overlap((kmp_info_t *)thr);
+
+#ifdef KMP_CANCEL_THREADS
+  status = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old_type);
+  KMP_CHECK_SYSFAIL("pthread_setcanceltype", status);
+  // josh todo: isn't PTHREAD_CANCEL_ENABLE default for newly-created threads?
+  status = pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &old_state);
+  KMP_CHECK_SYSFAIL("pthread_setcancelstate", status);
+#endif
+
+#if KMP_REAL_TIME_FIX
+  // This is a potential fix which allows application with real-time scheduling
+  // policy work. However, decision about the fix is not made yet, so it is
+  // disabled by default.
+  { // Was the program started with a real-time scheduling policy?
+    int sched = sched_getscheduler(0);
+    if (sched == SCHED_FIFO || sched == SCHED_RR) {
+      // Yes, we are a part of real-time application. Try to increase the
+      // priority of the monitor.
+      struct sched_param param;
+      int max_priority = sched_get_priority_max(sched);
+      int rc;
+      KMP_WARNING(RealTimeSchedNotSupported);
+      sched_getparam(0, &param);
+      if (param.sched_priority < max_priority) {
+        param.sched_priority += 1;
+        rc = sched_setscheduler(0, sched, &param);
+        if (rc != 0) {
+          int error = errno;
+          kmp_msg_t err_code = KMP_ERR(error);
+          __kmp_msg(kmp_ms_warning, KMP_MSG(CantChangeMonitorPriority),
+                    err_code, KMP_MSG(MonitorWillStarve), __kmp_msg_null);
+          if (__kmp_generate_warnings == kmp_warnings_off) {
+            __kmp_str_free(&err_code.str);
+          }
+        }
+      } else {
+        // We cannot abort here, because number of CPUs may be enough for all
+        // the threads, including the monitor thread, so application could
+        // potentially work...
+        __kmp_msg(kmp_ms_warning, KMP_MSG(RunningAtMaxPriority),
+                  KMP_MSG(MonitorWillStarve), KMP_HNT(RunningAtMaxPriority),
+                  __kmp_msg_null);
+      }
+    }
+    // AC: free thread that waits for monitor started
+    TCW_4(__kmp_global.g.g_time.dt.t_value, 0);
+  }
+#endif // KMP_REAL_TIME_FIX
+
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+  // Sleep interval: one second for a single wakeup/sec, else 1s / wakeups.
+  if (__kmp_monitor_wakeups == 1) {
+    interval.tv_sec = 1;
+    interval.tv_nsec = 0;
+  } else {
+    interval.tv_sec = 0;
+    interval.tv_nsec = (KMP_NSEC_PER_SEC / __kmp_monitor_wakeups);
+  }
+
+  KA_TRACE(10, ("__kmp_launch_monitor: #2 monitor\n"));
+
+  while (!TCR_4(__kmp_global.g.g_done)) {
+    struct timespec now;
+    struct timeval tval;
+
+    /*  This thread monitors the state of the system */
+
+    KA_TRACE(15, ("__kmp_launch_monitor: update\n"));
+
+    status = gettimeofday(&tval, NULL);
+    KMP_CHECK_SYSFAIL_ERRNO("gettimeofday", status);
+    TIMEVAL_TO_TIMESPEC(&tval, &now);
+    // Turn 'now' into the absolute deadline used by the timed wait below.
+    now.tv_sec += interval.tv_sec;
+    now.tv_nsec += interval.tv_nsec;
+
+    if (now.tv_nsec >= KMP_NSEC_PER_SEC) {
+      now.tv_sec += 1;
+      now.tv_nsec -= KMP_NSEC_PER_SEC;
+    }
+
+    status = pthread_mutex_lock(&__kmp_wait_mx.m_mutex);
+    KMP_CHECK_SYSFAIL("pthread_mutex_lock", status);
+    // AC: the monitor should not fall asleep if g_done has been set
+    if (!TCR_4(__kmp_global.g.g_done)) { // check once more under mutex
+      status = pthread_cond_timedwait(&__kmp_wait_cv.c_cond,
+                                      &__kmp_wait_mx.m_mutex, &now);
+      if (status != 0) {
+        if (status != ETIMEDOUT && status != EINTR) {
+          KMP_SYSFAIL("pthread_cond_timedwait", status);
+        }
+      }
+    }
+    status = pthread_mutex_unlock(&__kmp_wait_mx.m_mutex);
+    KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
+    // Advance the global tick counter on every pass, timed out or not.
+    TCW_4(__kmp_global.g.g_time.dt.t_value,
+          TCR_4(__kmp_global.g.g_time.dt.t_value) + 1);
+
+    KMP_MB(); /* Flush all pending memory write invalidates.  */
+  }
+
+  KA_TRACE(10, ("__kmp_launch_monitor: #3 cleanup\n"));
+
+#ifdef KMP_BLOCK_SIGNALS
+  status = sigfillset(&new_set);
+  KMP_CHECK_SYSFAIL_ERRNO("sigfillset", status);
+  status = pthread_sigmask(SIG_UNBLOCK, &new_set, NULL);
+  KMP_CHECK_SYSFAIL("pthread_sigmask", status);
+#endif /* KMP_BLOCK_SIGNALS */
+
+  KA_TRACE(10, ("__kmp_launch_monitor: #4 finished\n"));
+  // A nonzero g_abort triggers teardown; a positive value is re-raised below.
+  if (__kmp_global.g.g_abort != 0) {
+    /* now we need to terminate the worker threads  */
+    /* the value of t_abort is the signal we caught */
+
+    int gtid;
+
+    KA_TRACE(10, ("__kmp_launch_monitor: #5 terminate sig=%d\n",
+                  __kmp_global.g.g_abort));
+
+    /* terminate the OpenMP worker threads */
+    /* TODO this is not valid for sibling threads!!
+     * the uber master might not be 0 anymore.. */
+    for (gtid = 1; gtid < __kmp_threads_capacity; ++gtid)
+      __kmp_terminate_thread(gtid);
+
+    __kmp_cleanup();
+
+    KA_TRACE(10, ("__kmp_launch_monitor: #6 raise sig=%d\n",
+                  __kmp_global.g.g_abort));
+
+    if (__kmp_global.g.g_abort > 0)
+      raise(__kmp_global.g.g_abort);
+  }
+
+  KA_TRACE(10, ("__kmp_launch_monitor: #7 exit\n"));
+
+  return thr;
+}
+#endif // KMP_USE_MONITOR
+
+void __kmp_create_worker(int gtid, kmp_info_t *th, size_t stack_size) {
+  pthread_t handle;
+  pthread_attr_t thread_attr;
+  int status;
+  // Registers the calling thread (uber gtid) or spawns a new worker pthread.
+  th->th.th_info.ds.ds_gtid = gtid;
+
+#if KMP_STATS_ENABLED
+  // sets up worker thread stats
+  __kmp_acquire_tas_lock(&__kmp_stats_lock, gtid);
+
+  // th->th.th_stats is used to transfer thread-specific stats-pointer to
+  // __kmp_launch_worker. So when thread is created (goes into
+  // __kmp_launch_worker) it will set its thread local pointer to
+  // th->th.th_stats
+  if (!KMP_UBER_GTID(gtid)) {
+    th->th.th_stats = __kmp_stats_list->push_back(gtid);
+  } else {
+    // For root threads, __kmp_stats_thread_ptr is set in __kmp_register_root(),
+    // so set the th->th.th_stats field to it.
+    th->th.th_stats = __kmp_stats_thread_ptr;
+  }
+  __kmp_release_tas_lock(&__kmp_stats_lock, gtid);
+
+#endif // KMP_STATS_ENABLED
+  // Uber threads already exist: record the caller's own handle and return.
+  if (KMP_UBER_GTID(gtid)) {
+    KA_TRACE(10, ("__kmp_create_worker: uber thread (%d)\n", gtid));
+    th->th.th_info.ds.ds_thread = pthread_self();
+    __kmp_set_stack_info(gtid, th);
+    __kmp_check_stack_overlap(th);
+    return;
+  }
+
+  KA_TRACE(10, ("__kmp_create_worker: try to create thread (%d)\n", gtid));
+
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+#ifdef KMP_THREAD_ATTR
+  status = pthread_attr_init(&thread_attr);
+  if (status != 0) {
+    __kmp_fatal(KMP_MSG(CantInitThreadAttrs), KMP_ERR(status), __kmp_msg_null);
+  }
+  status = pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE);
+  if (status != 0) {
+    __kmp_fatal(KMP_MSG(CantSetWorkerState), KMP_ERR(status), __kmp_msg_null);
+  }
+
+  /* Set stack size for this thread now.
+     The multiple of 2 is there because on some machines, requesting an unusual
+     stacksize causes the thread to have an offset before the dummy alloca()
+     takes place to create the offset.  Since we want the user to have a
+     sufficient stacksize AND support a stack offset, we request twice the
+     offset so that the upcoming alloca() does not eliminate any premade offset,
+     and also gives the user the stack space they requested for all threads */
+  stack_size += gtid * __kmp_stkoffset * 2;
+
+  KA_TRACE(10, ("__kmp_create_worker: T#%d, default stacksize = %lu bytes, "
+                "__kmp_stksize = %lu bytes, final stacksize = %lu bytes\n",
+                gtid, KMP_DEFAULT_STKSIZE, __kmp_stksize, stack_size));
+
+#ifdef _POSIX_THREAD_ATTR_STACKSIZE
+  status = pthread_attr_setstacksize(&thread_attr, stack_size);
+#ifdef KMP_BACKUP_STKSIZE
+  if (status != 0) {
+    if (!__kmp_env_stksize) {
+      stack_size = KMP_BACKUP_STKSIZE + gtid * __kmp_stkoffset;
+      __kmp_stksize = KMP_BACKUP_STKSIZE;
+      KA_TRACE(10, ("__kmp_create_worker: T#%d, default stacksize = %lu bytes, "
+                    "__kmp_stksize = %lu bytes, (backup) final stacksize = %lu "
+                    "bytes\n",
+                    gtid, KMP_DEFAULT_STKSIZE, __kmp_stksize, stack_size));
+      status = pthread_attr_setstacksize(&thread_attr, stack_size);
+    }
+  }
+#endif /* KMP_BACKUP_STKSIZE */
+  if (status != 0) {
+    __kmp_fatal(KMP_MSG(CantSetWorkerStackSize, stack_size), KMP_ERR(status),
+                KMP_HNT(ChangeWorkerStackSize), __kmp_msg_null);
+  }
+#endif /* _POSIX_THREAD_ATTR_STACKSIZE */
+
+#endif /* KMP_THREAD_ATTR */
+  // NOTE(review): thread_attr is only initialized when KMP_THREAD_ATTR is set.
+  status =
+      pthread_create(&handle, &thread_attr, __kmp_launch_worker, (void *)th);
+  if (status != 0 || !handle) { // ??? Why do we check handle??
+#ifdef _POSIX_THREAD_ATTR_STACKSIZE
+    if (status == EINVAL) {
+      __kmp_fatal(KMP_MSG(CantSetWorkerStackSize, stack_size), KMP_ERR(status),
+                  KMP_HNT(IncreaseWorkerStackSize), __kmp_msg_null);
+    }
+    if (status == ENOMEM) {
+      __kmp_fatal(KMP_MSG(CantSetWorkerStackSize, stack_size), KMP_ERR(status),
+                  KMP_HNT(DecreaseWorkerStackSize), __kmp_msg_null);
+    }
+#endif /* _POSIX_THREAD_ATTR_STACKSIZE */
+    if (status == EAGAIN) {
+      __kmp_fatal(KMP_MSG(NoResourcesForWorkerThread), KMP_ERR(status),
+                  KMP_HNT(Decrease_NUM_THREADS), __kmp_msg_null);
+    }
+    KMP_SYSFAIL("pthread_create", status);
+  }
+  // Record the handle; __kmp_reap_worker later joins on ds_thread.
+  th->th.th_info.ds.ds_thread = handle;
+
+#ifdef KMP_THREAD_ATTR
+  status = pthread_attr_destroy(&thread_attr);
+  if (status) {
+    kmp_msg_t err_code = KMP_ERR(status);
+    __kmp_msg(kmp_ms_warning, KMP_MSG(CantDestroyThreadAttrs), err_code,
+              __kmp_msg_null);
+    if (__kmp_generate_warnings == kmp_warnings_off) {
+      __kmp_str_free(&err_code.str);
+    }
+  }
+#endif /* KMP_THREAD_ATTR */
+
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+  KA_TRACE(10, ("__kmp_create_worker: done creating thread (%d)\n", gtid));
+
+} // __kmp_create_worker
+
+#if KMP_USE_MONITOR
+void __kmp_create_monitor(kmp_info_t *th) {
+  pthread_t handle;
+  pthread_attr_t thread_attr;
+  size_t size;
+  int status;
+  int auto_adj_size = FALSE;
+  // Spawns the monitor thread unless the blocktime is KMP_MAX_BLOCKTIME.
+  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
+    // We don't need monitor thread in case of MAX_BLOCKTIME
+    KA_TRACE(10, ("__kmp_create_monitor: skipping monitor thread because of "
+                  "MAX blocktime\n"));
+    th->th.th_info.ds.ds_tid = 0; // this makes reap_monitor no-op
+    th->th.th_info.ds.ds_gtid = 0;
+    return;
+  }
+  KA_TRACE(10, ("__kmp_create_monitor: try to create monitor\n"));
+
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+  th->th.th_info.ds.ds_tid = KMP_GTID_MONITOR;
+  th->th.th_info.ds.ds_gtid = KMP_GTID_MONITOR;
+#if KMP_REAL_TIME_FIX
+  TCW_4(__kmp_global.g.g_time.dt.t_value,
+        -1); // Will use it for synchronization a bit later.
+#else
+  TCW_4(__kmp_global.g.g_time.dt.t_value, 0);
+#endif // KMP_REAL_TIME_FIX
+
+#ifdef KMP_THREAD_ATTR
+  if (__kmp_monitor_stksize == 0) {
+    __kmp_monitor_stksize = KMP_DEFAULT_MONITOR_STKSIZE;
+    auto_adj_size = TRUE;
+  }
+  status = pthread_attr_init(&thread_attr);
+  if (status != 0) {
+    __kmp_fatal(KMP_MSG(CantInitThreadAttrs), KMP_ERR(status), __kmp_msg_null);
+  }
+  status = pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE);
+  if (status != 0) {
+    __kmp_fatal(KMP_MSG(CantSetMonitorState), KMP_ERR(status), __kmp_msg_null);
+  }
+
+#ifdef _POSIX_THREAD_ATTR_STACKSIZE
+  status = pthread_attr_getstacksize(&thread_attr, &size);
+  KMP_CHECK_SYSFAIL("pthread_attr_getstacksize", status);
+#else
+  size = __kmp_sys_min_stksize;
+#endif /* _POSIX_THREAD_ATTR_STACKSIZE */
+#endif /* KMP_THREAD_ATTR */
+
+  if (__kmp_monitor_stksize == 0) {
+    __kmp_monitor_stksize = KMP_DEFAULT_MONITOR_STKSIZE;
+  }
+  if (__kmp_monitor_stksize < __kmp_sys_min_stksize) {
+    __kmp_monitor_stksize = __kmp_sys_min_stksize;
+  }
+
+  KA_TRACE(10, ("__kmp_create_monitor: default stacksize = %lu bytes,"
+                "requested stacksize = %lu bytes\n",
+                size, __kmp_monitor_stksize));
+
+retry:
+/* Set stack size for this thread now. An auto-selected size is doubled and
+   retried on failure, but only below 1 GiB so the loop always terminates. */
+#ifdef _POSIX_THREAD_ATTR_STACKSIZE
+  KA_TRACE(10, ("__kmp_create_monitor: setting stacksize = %lu bytes,",
+                __kmp_monitor_stksize));
+  status = pthread_attr_setstacksize(&thread_attr, __kmp_monitor_stksize);
+  if (status != 0) {
+    if (auto_adj_size && (__kmp_monitor_stksize < (size_t)0x40000000)) {
+      __kmp_monitor_stksize *= 2;
+      goto retry;
+    }
+    kmp_msg_t err_code = KMP_ERR(status);
+    __kmp_msg(kmp_ms_warning, // should this be fatal?  BB
+              KMP_MSG(CantSetMonitorStackSize, (long int)__kmp_monitor_stksize),
+              err_code, KMP_HNT(ChangeMonitorStackSize), __kmp_msg_null);
+    if (__kmp_generate_warnings == kmp_warnings_off) {
+      __kmp_str_free(&err_code.str);
+    }
+  }
+#endif /* _POSIX_THREAD_ATTR_STACKSIZE */
+
+  status =
+      pthread_create(&handle, &thread_attr, __kmp_launch_monitor, (void *)th);
+
+  if (status != 0) {
+#ifdef _POSIX_THREAD_ATTR_STACKSIZE
+    if (status == EINVAL) {
+      if (auto_adj_size && (__kmp_monitor_stksize < (size_t)0x40000000)) {
+        __kmp_monitor_stksize *= 2;
+        goto retry;
+      }
+      __kmp_fatal(KMP_MSG(CantSetMonitorStackSize, __kmp_monitor_stksize),
+                  KMP_ERR(status), KMP_HNT(IncreaseMonitorStackSize),
+                  __kmp_msg_null);
+    }
+    if (status == ENOMEM) {
+      __kmp_fatal(KMP_MSG(CantSetMonitorStackSize, __kmp_monitor_stksize),
+                  KMP_ERR(status), KMP_HNT(DecreaseMonitorStackSize),
+                  __kmp_msg_null);
+    }
+#endif /* _POSIX_THREAD_ATTR_STACKSIZE */
+    if (status == EAGAIN) {
+      __kmp_fatal(KMP_MSG(NoResourcesForMonitorThread), KMP_ERR(status),
+                  KMP_HNT(DecreaseNumberOfThreadsInUse), __kmp_msg_null);
+    }
+    KMP_SYSFAIL("pthread_create", status);
+  }
+  // Record the handle; __kmp_reap_monitor later joins on ds_thread.
+  th->th.th_info.ds.ds_thread = handle;
+
+#if KMP_REAL_TIME_FIX
+  // Wait for the monitor thread is really started and set its *priority*.
+  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) ==
+                   sizeof(__kmp_global.g.g_time.dt.t_value));
+  __kmp_wait_4((kmp_uint32 volatile *)&__kmp_global.g.g_time.dt.t_value, -1,
+               &__kmp_neq_4, NULL);
+#endif // KMP_REAL_TIME_FIX
+
+#ifdef KMP_THREAD_ATTR
+  status = pthread_attr_destroy(&thread_attr);
+  if (status != 0) {
+    kmp_msg_t err_code = KMP_ERR(status);
+    __kmp_msg(kmp_ms_warning, KMP_MSG(CantDestroyThreadAttrs), err_code,
+              __kmp_msg_null);
+    if (__kmp_generate_warnings == kmp_warnings_off) {
+      __kmp_str_free(&err_code.str);
+    }
+  }
+#endif
+
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+  KA_TRACE(10, ("__kmp_create_monitor: monitor created %#.8lx\n",
+                th->th.th_info.ds.ds_thread));
+
+} // __kmp_create_monitor
+#endif // KMP_USE_MONITOR
+
+void __kmp_exit_thread(int exit_status) { // exit_status travels as a void *
+  pthread_exit(reinterpret_cast<void *>(static_cast<intptr_t>(exit_status)));
+} // __kmp_exit_thread
+
+#if KMP_USE_MONITOR
+void __kmp_resume_monitor();
+
+void __kmp_reap_monitor(kmp_info_t *th) {
+  int status;
+  void *exit_val = NULL; // well-defined even if pthread_join fails
+  // Wakes the monitor thread if it still exists, joins it, and marks it DNE.
+  KA_TRACE(10, ("__kmp_reap_monitor: try to reap monitor thread with handle"
+                " %#.8lx\n",
+                th->th.th_info.ds.ds_thread));
+
+  // If monitor has been created, its tid and gtid should be KMP_GTID_MONITOR.
+  // If both tid and gtid are 0, it means the monitor did not ever start.
+  // If both tid and gtid are KMP_GTID_DNE, the monitor has been shut down.
+  KMP_DEBUG_ASSERT(th->th.th_info.ds.ds_tid == th->th.th_info.ds.ds_gtid);
+  if (th->th.th_info.ds.ds_gtid != KMP_GTID_MONITOR) {
+    KA_TRACE(10, ("__kmp_reap_monitor: monitor did not start, returning\n"));
+    return;
+  }
+
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+  /* First, check to see whether the monitor thread exists to wake it up. This
+     is to avoid performance problem when the monitor sleeps during
+     blocktime-size interval */
+
+  status = pthread_kill(th->th.th_info.ds.ds_thread, 0);
+  if (status != ESRCH) {
+    __kmp_resume_monitor(); // Wake up the monitor thread
+  }
+  KA_TRACE(10, ("__kmp_reap_monitor: try to join with monitor\n"));
+  status = pthread_join(th->th.th_info.ds.ds_thread, &exit_val);
+  if (status != 0 || exit_val != th) {
+    __kmp_fatal(KMP_MSG(ReapMonitorError), KMP_ERR(status), __kmp_msg_null);
+  }
+  // A failed join, or an exit value other than th, is fatal (checked above).
+  th->th.th_info.ds.ds_tid = KMP_GTID_DNE;
+  th->th.th_info.ds.ds_gtid = KMP_GTID_DNE;
+
+  KA_TRACE(10, ("__kmp_reap_monitor: done reaping monitor thread with handle"
+                " %#.8lx\n",
+                th->th.th_info.ds.ds_thread));
+
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+}
+#endif // KMP_USE_MONITOR
+
+void __kmp_reap_worker(kmp_info_t *th) {
+  // Joins the worker's pthread; the outcome is only checked under KMP_DEBUG.
+  int join_rc;
+  void *worker_ret;
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+  KA_TRACE(
+      10, ("__kmp_reap_worker: try to reap T#%d\n", th->th.th_info.ds.ds_gtid));
+
+  join_rc = pthread_join(th->th.th_info.ds.ds_thread, &worker_ret);
+#ifdef KMP_DEBUG
+  /* Don't expose these to the user until we understand when they trigger */
+  if (join_rc != 0) {
+    __kmp_fatal(KMP_MSG(ReapWorkerError), KMP_ERR(join_rc), __kmp_msg_null);
+  }
+  if (th != worker_ret) {
+    KA_TRACE(10, ("__kmp_reap_worker: worker T#%d did not reap properly, "
+                  "exit_val = %p\n",
+                  th->th.th_info.ds.ds_gtid, worker_ret));
+  }
+#endif /* KMP_DEBUG */
+
+  KA_TRACE(10, ("__kmp_reap_worker: done reaping T#%d\n",
+                th->th.th_info.ds.ds_gtid));
+
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+}
+
+#if KMP_HANDLE_SIGNALS
+
+static void __kmp_null_handler(int signo) {
+  (void)signo; // Intentionally empty; used for SIG_IGN-type actions.
+} // __kmp_null_handler
+
+static void __kmp_team_handler(int signo) {
+  if (__kmp_global.g.g_abort == 0) { // ignored once an abort is recorded
+/* Stage 1 signal handler, let's shut down all of the threads */
+#ifdef KMP_DEBUG
+    __kmp_debug_printf("__kmp_team_handler: caught signal = %d\n", signo);
+#endif
+    switch (signo) {
+    case SIGHUP:
+    case SIGINT:
+    case SIGQUIT:
+    case SIGILL:
+    case SIGABRT:
+    case SIGFPE:
+    case SIGBUS:
+    case SIGSEGV:
+#ifdef SIGSYS
+    case SIGSYS:
+#endif
+    case SIGTERM:
+      if (__kmp_debug_buf) {
+        __kmp_dump_debug_buffer();
+      }
+      KMP_MB(); // Flush all pending memory write invalidates.
+      TCW_4(__kmp_global.g.g_abort, signo); // remember which signal arrived
+      KMP_MB(); // Flush all pending memory write invalidates.
+      TCW_4(__kmp_global.g.g_done, TRUE); // the monitor loop polls g_done
+      KMP_MB(); // Flush all pending memory write invalidates.
+      break;
+    default: // e.g. SIGPIPE: installed with this handler but has no case here
+#ifdef KMP_DEBUG
+      __kmp_debug_printf("__kmp_team_handler: unknown signal type");
+#endif
+      break;
+    }
+  }
+} // __kmp_team_handler
+
+static void __kmp_sigaction(int signum, const struct sigaction *act,
+                            struct sigaction *oldact) {
+  int result = sigaction(signum, act, oldact); // checked by the macro below
+  KMP_CHECK_SYSFAIL_ERRNO("sigaction", result);
+} // __kmp_sigaction
+
+static void __kmp_install_one_handler(int sig, sig_func_t handler_func,
+                                      int parallel_init) {
+  KMP_MB(); // Flush all pending memory write invalidates.
+  KB_TRACE(60,
+           ("__kmp_install_one_handler( %d, ..., %d )\n", sig, parallel_init));
+  if (!parallel_init) {
+    // Save initial/system signal handlers to see if user handlers installed.
+    __kmp_sigaction(sig, NULL, &__kmp_sighldrs[sig]);
+  } else {
+    // Install ours; put the previous one back if it differs from the saved one.
+    struct sigaction ours;
+    struct sigaction previous;
+    ours.sa_handler = handler_func;
+    ours.sa_flags = 0;
+    sigfillset(&ours.sa_mask);
+    __kmp_sigaction(sig, &ours, &previous);
+    if (previous.sa_handler != __kmp_sighldrs[sig].sa_handler) {
+      __kmp_sigaction(sig, &previous, NULL);
+    } else {
+      sigaddset(&__kmp_sigset, sig);
+    }
+  }
+  KMP_MB(); // Flush all pending memory write invalidates.
+} // __kmp_install_one_handler
+
+static void __kmp_remove_one_handler(int sig) {
+  KB_TRACE(60, ("__kmp_remove_one_handler( %d )\n", sig));
+  if (!sigismember(&__kmp_sigset, sig))
+    return; // not recorded in __kmp_sigset: leave the signal alone
+  struct sigaction current;
+  KMP_MB(); // Flush all pending memory write invalidates.
+  __kmp_sigaction(sig, &__kmp_sighldrs[sig], &current);
+  if (!(current.sa_handler == __kmp_team_handler ||
+        current.sa_handler == __kmp_null_handler)) {
+    // The handler we just replaced was not ours: put it back.
+    KB_TRACE(10, ("__kmp_remove_one_handler: oops, not our handler, "
+                  "restoring: sig=%d\n",
+                  sig));
+    __kmp_sigaction(sig, &current, NULL);
+  }
+  sigdelset(&__kmp_sigset, sig);
+  KMP_MB(); // Flush all pending memory write invalidates.
+} // __kmp_remove_one_handler
+
+void __kmp_install_signals(int parallel_init) {
+  // Signals handed to __kmp_team_handler, listed in installation order;
+  // SIGSYS and SIGPIPE are included only where the platform defines them.
+  const int handled[] = {SIGHUP, SIGINT, SIGQUIT, SIGILL, SIGABRT,
+                         SIGFPE, SIGBUS, SIGSEGV,
+#ifdef SIGSYS
+                         SIGSYS,
+#endif // SIGSYS
+                         SIGTERM,
+#ifdef SIGPIPE
+                         SIGPIPE,
+#endif // SIGPIPE
+  };
+  KB_TRACE(10, ("__kmp_install_signals( %d )\n", parallel_init));
+  // If ! parallel_init, we do not install handlers, just save original
+  // handlers. Let us do it even __handle_signals is 0.
+  if (!__kmp_handle_signals && parallel_init)
+    return;
+  sigemptyset(&__kmp_sigset);
+  for (size_t i = 0; i < sizeof(handled) / sizeof(handled[0]); ++i) {
+    __kmp_install_one_handler(handled[i], __kmp_team_handler, parallel_init);
+  }
+} // __kmp_install_signals
+
+void __kmp_remove_signals(void) {
+  KB_TRACE(10, ("__kmp_remove_signals()\n"));
+  // Offer every signal number from 1 up to (not including) NSIG for removal.
+  for (int signo = 1; signo < NSIG; ++signo) {
+    __kmp_remove_one_handler(signo);
+  }
+} // __kmp_remove_signals
+
+#endif // KMP_HANDLE_SIGNALS
+
+void __kmp_enable(int new_state) {
+#ifdef KMP_CANCEL_THREADS
+  int old_state; // previous cancel state; asserted to be DISABLE below
+  int rc = pthread_setcancelstate(new_state, &old_state);
+  KMP_CHECK_SYSFAIL("pthread_setcancelstate", rc);
+  KMP_DEBUG_ASSERT(old_state == PTHREAD_CANCEL_DISABLE);
+#endif
+}
+
+void __kmp_disable(int *old_state) {
+#ifdef KMP_CANCEL_THREADS
+  // Turn cancellation off; the prior state is stored through old_state.
+  int rc = pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, old_state);
+  KMP_CHECK_SYSFAIL("pthread_setcancelstate", rc);
+#endif
+}
+
+static void __kmp_atfork_prepare(void) { // acquires both bootstrap locks
+  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
+  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
+} // NOTE(review): presumably the pthread_atfork "prepare" hook - confirm
+
+static void __kmp_atfork_parent(void) { // releases both bootstrap locks
+  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
+  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
+} // NOTE(review): presumably the pthread_atfork "parent" hook - confirm
+
+/* pthread_atfork() "child" handler (registered by __kmp_register_atfork).
+   Reset the library so execution in the child starts "all over again" with
+   clean data structures in initial states.  Don't worry about freeing memory
+   allocated by parent, just abandon it to be safe. */
+static void __kmp_atfork_child(void) {
+  // Release the fork/join lock taken in __kmp_atfork_prepare(). The init lock
+  // taken there is not released here; it is re-initialized further down.
+  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
+  /* TODO make sure this is done right for nested/sibling */
+  // ATT:  Memory leaks are here? TODO: Check it and fix.
+  /* KMP_ASSERT( 0 ); */
+
+  // Advance the fork generation counter. The per-thread suspend init counts
+  // are compared against it (see __kmp_suspend_initialize_thread).
+  ++__kmp_fork_count;
+
+#if KMP_AFFINITY_SUPPORTED
+#if KMP_OS_LINUX
+  // reset the affinity in the child to the initial thread
+  // affinity in the parent
+  kmp_set_thread_affinity_mask_initial();
+#endif
+  // Set default not to bind threads tightly in the child (we're expecting
+  // over-subscription after the fork and this can improve things for
+  // scripting languages that use OpenMP inside process-parallel code).
+  __kmp_affinity_type = affinity_none;
+  if (__kmp_nested_proc_bind.bind_types != NULL) {
+    __kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
+  }
+#endif // KMP_AFFINITY_SUPPORTED
+
+  // Mark every initialization stage as "not done" so the child re-runs them.
+  __kmp_init_runtime = FALSE;
+#if KMP_USE_MONITOR
+  __kmp_init_monitor = 0;
+#endif
+  __kmp_init_parallel = FALSE;
+  __kmp_init_middle = FALSE;
+  __kmp_init_serial = FALSE;
+  TCW_4(__kmp_init_gtid, FALSE);
+  __kmp_init_common = FALSE;
+
+  // Forget the user-lock bookkeeping; the parent's tables are abandoned.
+  TCW_4(__kmp_init_user_locks, FALSE);
+#if !KMP_USE_DYNAMIC_LOCK
+  __kmp_user_lock_table.used = 1;
+  __kmp_user_lock_table.allocated = 0;
+  __kmp_user_lock_table.table = NULL;
+  __kmp_lock_blocks = NULL;
+#endif
+
+  // Reset the thread counters.
+  __kmp_all_nth = 0;
+  TCW_4(__kmp_nth, 0);
+
+  // Drop the thread and team pools without freeing their entries.
+  __kmp_thread_pool = NULL;
+  __kmp_thread_pool_insert_pt = NULL;
+  __kmp_team_pool = NULL;
+
+  /* Must actually zero all the *cache arguments passed to __kmpc_threadprivate
+     here so threadprivate doesn't use stale data */
+  KA_TRACE(10, ("__kmp_atfork_child: checking cache address list %p\n",
+                __kmp_threadpriv_cache_list));
+
+  // Walk the list destructively: the global head pointer itself is advanced,
+  // so it ends up NULL when the loop finishes.
+  while (__kmp_threadpriv_cache_list != NULL) {
+
+    if (*__kmp_threadpriv_cache_list->addr != NULL) {
+      KC_TRACE(50, ("__kmp_atfork_child: zeroing cache at address %p\n",
+                    &(*__kmp_threadpriv_cache_list->addr)));
+
+      *__kmp_threadpriv_cache_list->addr = NULL;
+    }
+    __kmp_threadpriv_cache_list = __kmp_threadpriv_cache_list->next;
+  }
+
+  // NOTE(review): repeats the assignment made earlier in this function;
+  // nothing in between sets the flag back to TRUE.
+  __kmp_init_runtime = FALSE;
+
+  /* reset statically initialized locks */
+  __kmp_init_bootstrap_lock(&__kmp_initz_lock);
+  __kmp_init_bootstrap_lock(&__kmp_stdio_lock);
+  __kmp_init_bootstrap_lock(&__kmp_console_lock);
+  __kmp_init_bootstrap_lock(&__kmp_task_team_lock);
+
+#if USE_ITT_BUILD
+  __kmp_itt_reset(); // reset ITT's global state
+#endif /* USE_ITT_BUILD */
+
+  /* This is necessary to make sure no stale data is left around */
+  /* AC: customers complain that we use unsafe routines in the atfork
+     handler. Mathworks: dlsym() is unsafe. We call dlsym and dlopen
+     in dynamic_link when check the presence of shared tbbmalloc library.
+     Suggestion is to make the library initialization lazier, similar
+     to what done for __kmpc_begin(). */
+  // TODO: synchronize all static initializations with regular library
+  //       startup; look at kmp_global.cpp and etc.
+  //__kmp_internal_begin ();
+}
+
+// Registers the prepare/parent/child fork handlers with pthread_atfork().
+// Guarded by __kmp_need_register_atfork, which is cleared after a successful
+// registration so later calls do nothing.
+void __kmp_register_atfork(void) {
+  if (!__kmp_need_register_atfork) {
+    return;
+  }
+  int rc = pthread_atfork(__kmp_atfork_prepare, __kmp_atfork_parent,
+                          __kmp_atfork_child);
+  KMP_CHECK_SYSFAIL("pthread_atfork", rc);
+  __kmp_need_register_atfork = FALSE;
+}
+
+// Initializes the global mutex and condition-variable attribute objects that
+// __kmp_suspend_initialize_thread() passes to pthread_mutex_init() and
+// pthread_cond_init().
+void __kmp_suspend_initialize(void) {
+  int rc = pthread_mutexattr_init(&__kmp_suspend_mutex_attr);
+  KMP_CHECK_SYSFAIL("pthread_mutexattr_init", rc);
+
+  rc = pthread_condattr_init(&__kmp_suspend_cond_attr);
+  KMP_CHECK_SYSFAIL("pthread_condattr_init", rc);
+}
+
+// Makes sure the suspend mutex and condition variable of thread `th` are
+// initialized for the current fork generation.
+//
+// th_suspend_init_count takes these values here:
+//   __kmp_fork_count + 1 : objects are initialized for this process instance
+//   -1                   : some thread is initializing them right now
+//   anything else        : not initialized yet (for this fork generation)
+void __kmp_suspend_initialize_thread(kmp_info_t *th) {
+  ANNOTATE_HAPPENS_AFTER(&th->th.th_suspend_init_count);
+  int old_value = KMP_ATOMIC_LD_RLX(&th->th.th_suspend_init_count);
+  int new_value = __kmp_fork_count + 1;
+  // Return if already initialized
+  if (old_value == new_value)
+    return;
+  // Wait, then return if being initialized
+  // (either the -1 marker was observed, or the compare-and-store that tries
+  // to install the marker failed)
+  if (old_value == -1 ||
+      !__kmp_atomic_compare_store(&th->th.th_suspend_init_count, old_value,
+                                  -1)) {
+    // Spin until the initializing thread publishes new_value.
+    while (KMP_ATOMIC_LD_ACQ(&th->th.th_suspend_init_count) != new_value) {
+      KMP_CPU_PAUSE();
+    }
+  } else {
+    // Claim to be the initializer and do initializations
+    int status;
+    status = pthread_cond_init(&th->th.th_suspend_cv.c_cond,
+                               &__kmp_suspend_cond_attr);
+    KMP_CHECK_SYSFAIL("pthread_cond_init", status);
+    status = pthread_mutex_init(&th->th.th_suspend_mx.m_mutex,
+                                &__kmp_suspend_mutex_attr);
+    KMP_CHECK_SYSFAIL("pthread_mutex_init", status);
+    // Publish completion; the release store pairs with the acquire load in
+    // the wait loop above.
+    KMP_ATOMIC_ST_REL(&th->th.th_suspend_init_count, new_value);
+    ANNOTATE_HAPPENS_BEFORE(&th->th.th_suspend_init_count);
+  }
+}
+
+// Destroys the suspend condition variable and mutex of thread `th`, but only
+// if they were initialized in this instance of the process (init count above
+// __kmp_fork_count). EBUSY from either destroy call is tolerated; any other
+// error is reported through KMP_SYSFAIL. The init count is then decremented.
+void __kmp_suspend_uninitialize_thread(kmp_info_t *th) {
+  if (KMP_ATOMIC_LD_ACQ(&th->th.th_suspend_init_count) <= __kmp_fork_count) {
+    return; // nothing was initialized for this process instance
+  }
+
+  int rc = pthread_cond_destroy(&th->th.th_suspend_cv.c_cond);
+  if (!(rc == 0 || rc == EBUSY)) {
+    KMP_SYSFAIL("pthread_cond_destroy", rc);
+  }
+
+  rc = pthread_mutex_destroy(&th->th.th_suspend_mx.m_mutex);
+  if (!(rc == 0 || rc == EBUSY)) {
+    KMP_SYSFAIL("pthread_mutex_destroy", rc);
+  }
+
+  --th->th.th_suspend_init_count;
+  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&th->th.th_suspend_init_count) ==
+                   __kmp_fork_count);
+}
+
+// Non-blocking attempt to take th's suspend mutex.
+// Returns nonzero if the lock was obtained, zero otherwise.
+int __kmp_try_suspend_mx(kmp_info_t *th) {
+  int rc = pthread_mutex_trylock(&th->th.th_suspend_mx.m_mutex);
+  return rc == 0;
+}
+
+// Blocks until th's suspend mutex is acquired; a non-zero pthread status is
+// handed to KMP_CHECK_SYSFAIL.
+void __kmp_lock_suspend_mx(kmp_info_t *th) {
+  int rc;
+  rc = pthread_mutex_lock(&th->th.th_suspend_mx.m_mutex);
+  KMP_CHECK_SYSFAIL("pthread_mutex_lock", rc);
+}
+
+// Releases th's suspend mutex; a non-zero pthread status is handed to
+// KMP_CHECK_SYSFAIL.
+void __kmp_unlock_suspend_mx(kmp_info_t *th) {
+  int rc;
+  rc = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex);
+  KMP_CHECK_SYSFAIL("pthread_mutex_unlock", rc);
+}
+
+/* This routine puts the calling thread to sleep after setting the
+   sleep bit for the indicated flag variable to true.
+
+   th_gtid  global thread id of the calling thread (index into __kmp_threads)
+   flag     flag object the thread is waiting on
+
+   The thread's suspend mutex is held from the moment the sleep bit is set
+   until the routine returns, except while blocked inside the condition
+   wait. The routine returns without sleeping when the blocktime is
+   KMP_MAX_BLOCKTIME (and the runtime is not soft-paused) or when the flag
+   value observed while setting the sleep bit already satisfies
+   done_check_val(). */
+template <class C>
+static inline void __kmp_suspend_template(int th_gtid, C *flag) {
+  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_suspend);
+  kmp_info_t *th = __kmp_threads[th_gtid];
+  int status;
+  typename C::flag_t old_spin;
+
+  KF_TRACE(30, ("__kmp_suspend_template: T#%d enter for flag = %p\n", th_gtid,
+                flag->get()));
+
+  // Make sure this thread's mutex/condvar exist for the current process.
+  __kmp_suspend_initialize_thread(th);
+
+  status = pthread_mutex_lock(&th->th.th_suspend_mx.m_mutex);
+  KMP_CHECK_SYSFAIL("pthread_mutex_lock", status);
+
+  KF_TRACE(10, ("__kmp_suspend_template: T#%d setting sleep bit for spin(%p)\n",
+                th_gtid, flag->get()));
+
+  /* TODO: shouldn't this use release semantics to ensure that
+     __kmp_suspend_initialize_thread gets called first? */
+  old_spin = flag->set_sleeping();
+  // With an "infinite" blocktime threads are not supposed to sleep unless the
+  // runtime has been soft-paused: undo the sleep bit and leave.
+  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
+      __kmp_pause_status != kmp_soft_paused) {
+    flag->unset_sleeping();
+    status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex);
+    KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
+    return;
+  }
+  KF_TRACE(5, ("__kmp_suspend_template: T#%d set sleep bit for spin(%p)==%x,"
+               " was %x\n",
+               th_gtid, flag->get(), flag->load(), old_spin));
+
+  if (flag->done_check_val(old_spin)) {
+    // The awaited condition was already met when the sleep bit was set.
+    old_spin = flag->unset_sleeping();
+    KF_TRACE(5, ("__kmp_suspend_template: T#%d false alarm, reset sleep bit "
+                 "for spin(%p)\n",
+                 th_gtid, flag->get()));
+  } else {
+    /* Encapsulate in a loop as the documentation states that this may
+       "with low probability" return when the condition variable has
+       not been signaled or broadcast */
+    int deactivated = FALSE;
+    // Record which flag this thread sleeps on so a resumer called without a
+    // flag can find it (see __kmp_resume_template).
+    TCW_PTR(th->th.th_sleep_loc, (void *)flag);
+
+    while (flag->is_sleeping()) {
+#ifdef DEBUG_SUSPEND
+      char buffer[128];
+      __kmp_suspend_count++;
+      __kmp_print_cond(buffer, &th->th.th_suspend_cv);
+      __kmp_printf("__kmp_suspend_template: suspending T#%d: %s\n", th_gtid,
+                   buffer);
+#endif
+      // Mark the thread as no longer active (only in the first iteration of the
+      // loop).
+      if (!deactivated) {
+        th->th.th_active = FALSE;
+        if (th->th.th_active_in_pool) {
+          th->th.th_active_in_pool = FALSE;
+          KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
+          KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0);
+        }
+        deactivated = TRUE;
+      }
+
+#if USE_SUSPEND_TIMEOUT
+      struct timespec now;
+      struct timeval tval;
+      int msecs;
+
+      status = gettimeofday(&tval, NULL);
+      KMP_CHECK_SYSFAIL_ERRNO("gettimeofday", status);
+      TIMEVAL_TO_TIMESPEC(&tval, &now);
+
+      // Absolute deadline = now + (4 * blocktime + 200) milliseconds.
+      msecs = (4 * __kmp_dflt_blocktime) + 200;
+      now.tv_sec += msecs / 1000;
+      // The sub-second remainder is in milliseconds; tv_nsec is nanoseconds.
+      now.tv_nsec += (msecs % 1000) * 1000000L;
+      // pthread_cond_timedwait() rejects tv_nsec outside [0, 1e9), so carry
+      // any overflow into tv_sec. Both addends are below one second, hence a
+      // single carry is sufficient.
+      if (now.tv_nsec >= 1000000000L) {
+        now.tv_sec += 1;
+        now.tv_nsec -= 1000000000L;
+      }
+
+      KF_TRACE(15, ("__kmp_suspend_template: T#%d about to perform "
+                    "pthread_cond_timedwait\n",
+                    th_gtid));
+      status = pthread_cond_timedwait(&th->th.th_suspend_cv.c_cond,
+                                      &th->th.th_suspend_mx.m_mutex, &now);
+#else
+      KF_TRACE(15, ("__kmp_suspend_template: T#%d about to perform"
+                    " pthread_cond_wait\n",
+                    th_gtid));
+      status = pthread_cond_wait(&th->th.th_suspend_cv.c_cond,
+                                 &th->th.th_suspend_mx.m_mutex);
+#endif
+
+      // EINTR and ETIMEDOUT are expected wake-up reasons; anything else is
+      // treated as a system failure.
+      if ((status != 0) && (status != EINTR) && (status != ETIMEDOUT)) {
+        KMP_SYSFAIL("pthread_cond_wait", status);
+      }
+#ifdef KMP_DEBUG
+      if (status == ETIMEDOUT) {
+        if (flag->is_sleeping()) {
+          KF_TRACE(100,
+                   ("__kmp_suspend_template: T#%d timeout wakeup\n", th_gtid));
+        } else {
+          KF_TRACE(2, ("__kmp_suspend_template: T#%d timeout wakeup, sleep bit "
+                       "not set!\n",
+                       th_gtid));
+        }
+      } else if (flag->is_sleeping()) {
+        KF_TRACE(100,
+                 ("__kmp_suspend_template: T#%d spurious wakeup\n", th_gtid));
+      }
+#endif
+    } // while
+
+    // Mark the thread as active again (if it was previous marked as inactive)
+    if (deactivated) {
+      th->th.th_active = TRUE;
+      if (TCR_4(th->th.th_in_pool)) {
+        KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
+        th->th.th_active_in_pool = TRUE;
+      }
+    }
+  }
+#ifdef DEBUG_SUSPEND
+  {
+    char buffer[128];
+    __kmp_print_cond(buffer, &th->th.th_suspend_cv);
+    __kmp_printf("__kmp_suspend_template: T#%d has awakened: %s\n", th_gtid,
+                 buffer);
+  }
+#endif
+
+  status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex);
+  KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
+  KF_TRACE(30, ("__kmp_suspend_template: T#%d exit\n", th_gtid));
+}
+
+// Non-template entry points: each one forwards to __kmp_suspend_template()
+// for one concrete flag class (kmp_flag_32, kmp_flag_64, kmp_flag_oncore).
+void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag) {
+  __kmp_suspend_template(th_gtid, flag);
+}
+void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag) {
+  __kmp_suspend_template(th_gtid, flag);
+}
+void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag) {
+  __kmp_suspend_template(th_gtid, flag);
+}
+
+/* This routine signals the thread specified by target_gtid to wake up
+   after setting the sleep bit indicated by the flag argument to FALSE.
+   The target thread must already have called __kmp_suspend_template()
+
+   target_gtid  global thread id of the thread to wake
+   flag         flag the target is believed to sleep on; may be NULL, in
+                which case the target's recorded th_sleep_loc is used
+
+   The target's suspend mutex is held while the flag is examined and the
+   condition variable is signaled. The routine returns without signaling if
+   there is no flag, the flag's type does not match, or the sleep bit was
+   already clear. */
+template <class C>
+static inline void __kmp_resume_template(int target_gtid, C *flag) {
+  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_resume);
+  kmp_info_t *th = __kmp_threads[target_gtid];
+  int status;
+
+  // gtid is declared only in debug builds; below it is referenced from trace
+  // and assert macros and from the DEBUG_SUSPEND block.
+#ifdef KMP_DEBUG
+  int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1;
+#endif
+
+  KF_TRACE(30, ("__kmp_resume_template: T#%d wants to wakeup T#%d enter\n",
+                gtid, target_gtid));
+  KMP_DEBUG_ASSERT(gtid != target_gtid);
+
+  // Make sure the target's mutex/condvar exist before locking them.
+  __kmp_suspend_initialize_thread(th);
+
+  status = pthread_mutex_lock(&th->th.th_suspend_mx.m_mutex);
+  KMP_CHECK_SYSFAIL("pthread_mutex_lock", status);
+
+  if (!flag) { // coming from __kmp_null_resume_wrapper
+    flag = (C *)CCAST(void *, th->th.th_sleep_loc);
+  }
+
+  // First, check if the flag is null or its type has changed. If so, someone
+  // else woke it up.
+  // (get_ptr_type simply shows what flag was cast to.)
+  if (!flag || flag->get_type() != flag->get_ptr_type()) { // get_ptr_type
+    // simply shows what
+    // flag was cast to
+    KF_TRACE(5, ("__kmp_resume_template: T#%d exiting, thread T#%d already "
+                 "awake: flag(%p)\n",
+                 gtid, target_gtid, NULL));
+    status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex);
+    KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
+    return;
+  } else { // if multiple threads are sleeping, flag should be internally
+    // referring to a specific thread here
+    // Clear the sleep bit and look at the value it had before.
+    typename C::flag_t old_spin = flag->unset_sleeping();
+    if (!flag->is_sleeping_val(old_spin)) {
+      // The sleep bit was not set: nothing to wake.
+      KF_TRACE(5, ("__kmp_resume_template: T#%d exiting, thread T#%d already "
+                   "awake: flag(%p): "
+                   "%u => %u\n",
+                   gtid, target_gtid, flag->get(), old_spin, flag->load()));
+      status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex);
+      KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
+      return;
+    }
+    KF_TRACE(5, ("__kmp_resume_template: T#%d about to wakeup T#%d, reset "
+                 "sleep bit for flag's loc(%p): "
+                 "%u => %u\n",
+                 gtid, target_gtid, flag->get(), old_spin, flag->load()));
+  }
+  // The target is about to be woken, so it no longer sleeps on any flag.
+  TCW_PTR(th->th.th_sleep_loc, NULL);
+
+#ifdef DEBUG_SUSPEND
+  {
+    char buffer[128];
+    __kmp_print_cond(buffer, &th->th.th_suspend_cv);
+    __kmp_printf("__kmp_resume_template: T#%d resuming T#%d: %s\n", gtid,
+                 target_gtid, buffer);
+  }
+#endif
+  // Signal while still holding the mutex, then release it.
+  status = pthread_cond_signal(&th->th.th_suspend_cv.c_cond);
+  KMP_CHECK_SYSFAIL("pthread_cond_signal", status);
+  status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex);
+  KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
+  KF_TRACE(30, ("__kmp_resume_template: T#%d exiting after signaling wake up"
+                " for T#%d\n",
+                gtid, target_gtid));
+}
+
+// Non-template entry points: each one forwards to __kmp_resume_template()
+// for one concrete flag class (kmp_flag_32, kmp_flag_64, kmp_flag_oncore).
+void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag) {
+  __kmp_resume_template(target_gtid, flag);
+}
+void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag) {
+  __kmp_resume_template(target_gtid, flag);
+}
+void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag) {
+  __kmp_resume_template(target_gtid, flag);
+}
+
+#if KMP_USE_MONITOR
+// Signals the condition variable __kmp_wait_cv while holding __kmp_wait_mx.
+// The trace messages identify the thread being woken as KMP_GTID_MONITOR.
+void __kmp_resume_monitor() {
+  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_resume);
+  int status;
+  // gtid exists only in debug builds; it is used by the trace/assert macros
+  // and by the DEBUG_SUSPEND block below.
+#ifdef KMP_DEBUG
+  int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1;
+  KF_TRACE(30, ("__kmp_resume_monitor: T#%d wants to wakeup T#%d enter\n", gtid,
+                KMP_GTID_MONITOR));
+  KMP_DEBUG_ASSERT(gtid != KMP_GTID_MONITOR);
+#endif
+  status = pthread_mutex_lock(&__kmp_wait_mx.m_mutex);
+  KMP_CHECK_SYSFAIL("pthread_mutex_lock", status);
+#ifdef DEBUG_SUSPEND
+  {
+    char buffer[128];
+    // NOTE(review): the other DEBUG_SUSPEND call sites in this file pass the
+    // wrapper object (e.g. &th->th.th_suspend_cv) rather than its c_cond
+    // member -- confirm which type __kmp_print_cond expects.
+    __kmp_print_cond(buffer, &__kmp_wait_cv.c_cond);
+    __kmp_printf("__kmp_resume_monitor: T#%d resuming T#%d: %s\n", gtid,
+                 KMP_GTID_MONITOR, buffer);
+  }
+#endif
+  // Signal while holding the mutex, then release it.
+  status = pthread_cond_signal(&__kmp_wait_cv.c_cond);
+  KMP_CHECK_SYSFAIL("pthread_cond_signal", status);
+  status = pthread_mutex_unlock(&__kmp_wait_mx.m_mutex);
+  KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
+  KF_TRACE(30, ("__kmp_resume_monitor: T#%d exiting after signaling wake up"
+                " for T#%d\n",
+                gtid, KMP_GTID_MONITOR));
+}
+#endif // KMP_USE_MONITOR
+
+// Yields the processor by calling sched_yield(); its return value is ignored.
+void __kmp_yield() { sched_yield(); }
+
+// Stores gtid in the calling thread's slot of __kmp_gtid_threadprivate_key.
+// The value written is gtid + 1, so a slot reading back as 0 (never set) can
+// be told apart from gtid 0. Does nothing except trace when __kmp_init_gtid
+// is false.
+void __kmp_gtid_set_specific(int gtid) {
+  if (!__kmp_init_gtid) {
+    KA_TRACE(50, ("__kmp_gtid_set_specific: runtime shutdown, returning\n"));
+    return;
+  }
+  int rc = pthread_setspecific(__kmp_gtid_threadprivate_key,
+                               (void *)(intptr_t)(gtid + 1));
+  KMP_CHECK_SYSFAIL("pthread_setspecific", rc);
+}
+
+// Reads the calling thread's gtid from __kmp_gtid_threadprivate_key.
+// Returns KMP_GTID_SHUTDOWN when __kmp_init_gtid is false, KMP_GTID_DNE when
+// the slot holds 0, and otherwise the stored value minus one (the setter
+// stores gtid + 1).
+int __kmp_gtid_get_specific() {
+  if (!__kmp_init_gtid) {
+    KA_TRACE(50, ("__kmp_gtid_get_specific: runtime shutdown, returning "
+                  "KMP_GTID_SHUTDOWN\n"));
+    return KMP_GTID_SHUTDOWN;
+  }
+
+  int stored = (int)(size_t)pthread_getspecific(__kmp_gtid_threadprivate_key);
+  int gtid = (stored == 0) ? KMP_GTID_DNE : stored - 1;
+
+  KA_TRACE(50, ("__kmp_gtid_get_specific: key:%d gtid:%d\n",
+                __kmp_gtid_threadprivate_key, gtid));
+  return gtid;
+}
+
+// Returns the user CPU time, in seconds, consumed by this process plus its
+// waited-for children (tms_utime + tms_cutime), or 0.0 if times() fails.
+//
+// times() reports clock ticks whose rate is sysconf(_SC_CLK_TCK), which is
+// not the same unit as CLOCKS_PER_SEC (that one belongs to clock()), so the
+// tick count is divided by the sysconf value.
+double __kmp_read_cpu_time(void) {
+  struct tms buffer;
+
+  if (times(&buffer) == (clock_t)-1) {
+    // On failure the buffer contents are not meaningful; do not read them.
+    return 0.0;
+  }
+
+  long ticks_per_sec = sysconf(_SC_CLK_TCK);
+  if (ticks_per_sec <= 0) {
+    // Could not query the tick rate: fall back to the divisor used before.
+    ticks_per_sec = CLOCKS_PER_SEC;
+  }
+
+  return (buffer.tms_utime + buffer.tms_cutime) / (double)ticks_per_sec;
+}
+
+// Fills *info from getrusage(RUSAGE_SELF). The structure is zeroed first, a
+// getrusage() failure is passed to KMP_CHECK_SYSFAIL_ERRNO, and the function
+// returns 0 when getrusage() succeeded and 1 otherwise.
+int __kmp_read_system_info(struct kmp_sys_info *info) {
+  struct rusage usage;
+
+  memset(info, 0, sizeof(*info));
+
+  int rc = getrusage(RUSAGE_SELF, &usage);
+  KMP_CHECK_SYSFAIL_ERRNO("getrusage", rc);
+
+  info->maxrss = usage.ru_maxrss; // maximum resident set size (kilobytes)
+  info->minflt = usage.ru_minflt; // page faults serviced without any I/O
+  info->majflt = usage.ru_majflt; // page faults serviced that required I/O
+  info->nswap = usage.ru_nswap; // times the process was "swapped" out
+  info->inblock = usage.ru_inblock; // file system input operations
+  info->oublock = usage.ru_oublock; // file system output operations
+  info->nvcsw = usage.ru_nvcsw; // voluntary context switches
+  info->nivcsw = usage.ru_nivcsw; // forced (involuntary) context switches
+
+  return (rc != 0);
+}
+
+// Stores in *delta the wall-clock time, in seconds, elapsed since the start
+// stamp kept in __kmp_sys_timer_data.start (written by
+// __kmp_clear_system_time).
+void __kmp_read_system_time(double *delta) {
+  struct timeval tv_now;
+  struct timespec ts_now;
+
+  int rc = gettimeofday(&tv_now, NULL);
+  KMP_CHECK_SYSFAIL_ERRNO("gettimeofday", rc);
+  TIMEVAL_TO_TIMESPEC(&tv_now, &ts_now);
+
+  double elapsed_ns = TS2NS(ts_now) - TS2NS(__kmp_sys_timer_data.start);
+  *delta = (elapsed_ns * 1e-9);
+}
+
+// Records the current wall-clock time in __kmp_sys_timer_data.start, the
+// reference point that __kmp_read_system_time() measures from.
+void __kmp_clear_system_time(void) {
+  struct timeval tv_now;
+  int rc = gettimeofday(&tv_now, NULL);
+  KMP_CHECK_SYSFAIL_ERRNO("gettimeofday", rc);
+  TIMEVAL_TO_TIMESPEC(&tv_now, &__kmp_sys_timer_data.start);
+}
+
+// Returns the number of processors reported by the OS: sysconf() on
+// Linux/BSD/Hurd, host_info() on Darwin. If the OS reports a non-positive
+// count (or the Darwin query fails) a guess of 2 is returned.
+static int __kmp_get_xproc(void) {
+
+  int nproc = 0;
+
+#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
+        KMP_OS_OPENBSD || KMP_OS_HURD
+
+  nproc = sysconf(_SC_NPROCESSORS_ONLN);
+
+#elif KMP_OS_DARWIN
+
+  // Bug C77011 High "OpenMP Threads and number of active cores".
+
+  // Ask the Mach host for its basic info to learn the available CPU count.
+  host_basic_info_data_t info;
+  mach_msg_type_number_t num = HOST_BASIC_INFO_COUNT;
+  kern_return_t rc =
+      host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&info, &num);
+  if (rc != 0 || num != HOST_BASIC_INFO_COUNT) {
+    KMP_WARNING(CantGetNumAvailCPU);
+    KMP_INFORM(AssumedNumCPU);
+  } else {
+    // Cannot use KA_TRACE() here because this code works before trace support
+    // is initialized.
+    nproc = info.avail_cpus;
+  }
+
+#else
+
+#error "Unknown or unsupported OS."
+
+#endif
+
+  if (nproc > 0) {
+    return nproc;
+  }
+  return 2; /* guess value of 2 if OS told us 0 */
+
+} // __kmp_get_xproc
+
+// Opens `path` and scans it with the scanf-style `format`, storing results
+// through the variadic pointer arguments.
+//
+// Returns the value of vfscanf() (number of items assigned, or EOF), or 0 if
+// the file could not be opened.
+int __kmp_read_from_file(char const *path, char const *format, ...) {
+  FILE *f = fopen(path, "rb");
+  if (f == NULL)
+    return 0;
+
+  // The argument list is started only once the file is open, and is always
+  // closed with va_end(): every va_start() must be paired with a va_end() in
+  // the same function.
+  va_list args;
+  va_start(args, format);
+  int result = vfscanf(f, format, args);
+  va_end(args);
+
+  fclose(f);
+
+  return result;
+}
+
+// One-time initialization of the OS-dependent part of the runtime. Guarded by
+// __kmp_init_runtime, so repeated calls return immediately. It:
+//   - queries CPUID on x86/x86_64 if that has not been done yet,
+//   - records the processor count in __kmp_xproc,
+//   - on non-32-bit targets, takes the RLIMIT_STACK soft limit as the default
+//     stack size,
+//   - queries the system thread limit and minimum thread stack size,
+//   - creates the gtid thread-specific key and the __kmp_wait_mx /
+//     __kmp_wait_cv pair,
+//   - initializes ITT when built with USE_ITT_BUILD.
+void __kmp_runtime_initialize(void) {
+  int status;
+  pthread_mutexattr_t mutex_attr;
+  pthread_condattr_t cond_attr;
+
+  if (__kmp_init_runtime) {
+    return;
+  }
+
+#if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
+  if (!__kmp_cpuinfo.initialized) {
+    __kmp_query_cpuid(&__kmp_cpuinfo);
+  }
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+  __kmp_xproc = __kmp_get_xproc();
+
+#if ! KMP_32_BIT_ARCH
+  struct rlimit rlim;
+  // read stack size of calling thread, save it as default for worker threads;
+  // this should be done before reading environment variables
+  status = getrlimit(RLIMIT_STACK, &rlim);
+  if (status == 0) { // success?
+    __kmp_stksize = rlim.rlim_cur;
+    __kmp_check_stksize(&__kmp_stksize); // check value and adjust if needed
+  }
+#endif /* KMP_32_BIT_ARCH */
+
+  if (sysconf(_SC_THREADS)) {
+
+    /* Query the maximum number of threads */
+    __kmp_sys_max_nth = sysconf(_SC_THREAD_THREADS_MAX);
+    if (__kmp_sys_max_nth == -1) {
+      /* Unlimited threads for NPTL */
+      __kmp_sys_max_nth = INT_MAX;
+    } else if (__kmp_sys_max_nth <= 1) {
+      /* Can't tell, just use PTHREAD_THREADS_MAX */
+      __kmp_sys_max_nth = KMP_MAX_NTH;
+    }
+
+    /* Query the minimum stack size */
+    __kmp_sys_min_stksize = sysconf(_SC_THREAD_STACK_MIN);
+    if (__kmp_sys_min_stksize <= 1) {
+      __kmp_sys_min_stksize = KMP_MIN_STKSIZE;
+    }
+  }
+
+  /* Set up minimum number of threads to switch to TLS gtid */
+  __kmp_tls_gtid_min = KMP_TLS_GTID_MIN;
+
+  // __kmp_internal_end_dest is installed as the key's destructor.
+  status = pthread_key_create(&__kmp_gtid_threadprivate_key,
+                              __kmp_internal_end_dest);
+  KMP_CHECK_SYSFAIL("pthread_key_create", status);
+
+  // The attribute objects are needed only while the mutex / condition
+  // variable is being created, so each is destroyed right after use instead
+  // of being left initialized when this function returns.
+  status = pthread_mutexattr_init(&mutex_attr);
+  KMP_CHECK_SYSFAIL("pthread_mutexattr_init", status);
+  status = pthread_mutex_init(&__kmp_wait_mx.m_mutex, &mutex_attr);
+  KMP_CHECK_SYSFAIL("pthread_mutex_init", status);
+  status = pthread_mutexattr_destroy(&mutex_attr);
+  KMP_CHECK_SYSFAIL("pthread_mutexattr_destroy", status);
+
+  status = pthread_condattr_init(&cond_attr);
+  KMP_CHECK_SYSFAIL("pthread_condattr_init", status);
+  status = pthread_cond_init(&__kmp_wait_cv.c_cond, &cond_attr);
+  KMP_CHECK_SYSFAIL("pthread_cond_init", status);
+  status = pthread_condattr_destroy(&cond_attr);
+  KMP_CHECK_SYSFAIL("pthread_condattr_destroy", status);
+#if USE_ITT_BUILD
+  __kmp_itt_initialize();
+#endif /* USE_ITT_BUILD */
+
+  __kmp_init_runtime = TRUE;
+}
+
+// Tears down what __kmp_runtime_initialize() created: ITT state (when built
+// with USE_ITT_BUILD), the gtid thread-specific key, the __kmp_wait_mx /
+// __kmp_wait_cv pair and, where supported, affinity state. EBUSY from the
+// mutex/condvar destroy calls is tolerated. Does nothing if the runtime was
+// never initialized.
+void __kmp_runtime_destroy(void) {
+  if (__kmp_init_runtime) {
+    int rc;
+
+#if USE_ITT_BUILD
+    __kmp_itt_destroy();
+#endif /* USE_ITT_BUILD */
+
+    rc = pthread_key_delete(__kmp_gtid_threadprivate_key);
+    KMP_CHECK_SYSFAIL("pthread_key_delete", rc);
+
+    rc = pthread_mutex_destroy(&__kmp_wait_mx.m_mutex);
+    if (!(rc == 0 || rc == EBUSY)) {
+      KMP_SYSFAIL("pthread_mutex_destroy", rc);
+    }
+
+    rc = pthread_cond_destroy(&__kmp_wait_cv.c_cond);
+    if (!(rc == 0 || rc == EBUSY)) {
+      KMP_SYSFAIL("pthread_cond_destroy", rc);
+    }
+
+#if KMP_AFFINITY_SUPPORTED
+    __kmp_affinity_uninitialize();
+#endif
+
+    __kmp_init_runtime = FALSE;
+  }
+}
+
+/* Put the thread to sleep for a time period */
+/* NOTE: not currently used anywhere */
+/* millis is converted to whole seconds, rounding to the nearest second
+   ((millis + 500) / 1000), because sleep() takes seconds. Values below 500
+   therefore result in sleep(0). */
+void __kmp_thread_sleep(int millis) { sleep((millis + 500) / 1000); }
+
+/* Store the current time, in seconds, in *t for the user. With FIX_SGI_CLOCK
+   the value comes from clock_gettime(CLOCK_PROCESS_CPUTIME_ID); otherwise it
+   comes from gettimeofday(). */
+void __kmp_elapsed(double *t) {
+  int rc;
+#ifdef FIX_SGI_CLOCK
+  struct timespec now;
+
+  rc = clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &now);
+  KMP_CHECK_SYSFAIL_ERRNO("clock_gettime", rc);
+  // whole seconds plus the nanosecond part scaled to seconds
+  *t = (double)now.tv_sec +
+       (double)now.tv_nsec * (1.0 / (double)KMP_NSEC_PER_SEC);
+#else
+  struct timeval now;
+
+  rc = gettimeofday(&now, NULL);
+  KMP_CHECK_SYSFAIL_ERRNO("gettimeofday", rc);
+  // whole seconds plus the microsecond part scaled to seconds
+  *t = (double)now.tv_sec +
+       (double)now.tv_usec * (1.0 / (double)KMP_USEC_PER_SEC);
+#endif
+}
+
+/* Calculate the elapsed wall clock tick for the user */
+/* Stores 1 / CLOCKS_PER_SEC (in seconds) in *t. */
+void __kmp_elapsed_tick(double *t) { *t = 1 / (double)CLOCKS_PER_SEC; }
+
+/* Return the current gettimeofday() time stamp converted to nanoseconds.
+   The return value of gettimeofday() is not checked. */
+kmp_uint64 __kmp_now_nsec() {
+  struct timeval tv;
+  gettimeofday(&tv, NULL);
+  kmp_uint64 from_seconds = (kmp_uint64)KMP_NSEC_PER_SEC * (kmp_uint64)tv.tv_sec;
+  kmp_uint64 from_usecs = (kmp_uint64)1000 * (kmp_uint64)tv.tv_usec;
+  return from_seconds + from_usecs;
+}
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+/* Measure clock ticks per millisecond: busy-wait until the hardware
+   timestamp has advanced by at least `delay` ticks, measure the wall-clock
+   nanoseconds that took, and store the resulting ratio in
+   __kmp_ticks_per_msec. The stored value is left unchanged if no time elapsed
+   or the computed ratio is zero. */
+void __kmp_initialize_system_tick() {
+  const kmp_uint64 delay = 100000; // 50~100 usec on most machines.
+  const kmp_uint64 start_nsec = __kmp_now_nsec();
+  const kmp_uint64 goal = __kmp_hardware_timestamp() + delay;
+
+  // Spin until the timestamp counter reaches the goal.
+  kmp_uint64 now;
+  do {
+    now = __kmp_hardware_timestamp();
+  } while (now < goal);
+
+  const kmp_uint64 elapsed_nsec = __kmp_now_nsec() - start_nsec;
+  if (elapsed_nsec == 0) {
+    return;
+  }
+
+  // (delay + overshoot) ticks took elapsed_nsec ns; scale to ticks per msec.
+  const kmp_uint64 tpms =
+      (kmp_uint64)(1e6 * (delay + (now - goal)) / elapsed_nsec);
+  if (tpms > 0) {
+    __kmp_ticks_per_msec = tpms;
+  }
+}
+#endif
+
+/* Determine whether the given address is mapped into the current address
+   space.
+
+   Returns 1 if `addr` is considered mapped, 0 otherwise. What "mapped" means
+   depends on the OS branch:
+     - Linux/FreeBSD/Hurd: addr lies in a /proc/<pid>/maps region whose
+       permissions start with "rw".
+     - Darwin: one byte at addr can be read with vm_read_overwrite().
+     - NetBSD: addr lies in [kve_start, kve_end) of some VM map entry.
+     - DragonFly/OpenBSD: not implemented, always 1. */
+
+int __kmp_is_address_mapped(void *addr) {
+
+  int found = 0;
+  int rc;
+
+#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_HURD
+
+  /* On GNUish OSes, read the /proc/<pid>/maps pseudo-file to get all the address
+     ranges mapped into the address space. */
+
+  char *name = __kmp_str_format("/proc/%d/maps", getpid());
+  FILE *file = NULL;
+
+  file = fopen(name, "r");
+  KMP_ASSERT(file != NULL);
+
+  for (;;) {
+
+    void *beginning = NULL;
+    void *ending = NULL;
+    char perms[5];
+
+    // Each line: "<begin>-<end> <perms> <rest of line>"; the rest is skipped.
+    rc = fscanf(file, "%p-%p %4s %*[^\n]\n", &beginning, &ending, perms);
+    if (rc == EOF) {
+      break;
+    }
+    KMP_ASSERT(rc == 3 &&
+               KMP_STRLEN(perms) == 4); // Make sure all fields are read.
+
+    // Ending address is not included in the region, but beginning is.
+    if ((addr >= beginning) && (addr < ending)) {
+      perms[2] = 0; // 3th and 4th character does not matter.
+      if (strcmp(perms, "rw") == 0) {
+        // Memory we are looking for should be readable and writable.
+        found = 1;
+      }
+      break;
+    }
+  }
+
+  // Free resources.
+  fclose(file);
+  KMP_INTERNAL_FREE(name);
+
+#elif KMP_OS_DARWIN
+
+  /* On OS X*, /proc pseudo filesystem is not available. Try to read memory
+     using vm interface. */
+
+  int buffer;
+  vm_size_t count;
+  rc = vm_read_overwrite(
+      mach_task_self(), // Task to read memory of.
+      (vm_address_t)(addr), // Address to read from.
+      1, // Number of bytes to be read.
+      (vm_address_t)(&buffer), // Address of buffer to save read bytes in.
+      &count // Address of var to save number of read bytes in.
+      );
+  if (rc == 0) {
+    // Memory successfully read.
+    found = 1;
+  }
+
+#elif KMP_OS_NETBSD
+
+  int mib[5];
+  mib[0] = CTL_VM;
+  mib[1] = VM_PROC;
+  mib[2] = VM_PROC_MAP;
+  mib[3] = getpid();
+  mib[4] = sizeof(struct kinfo_vmentry);
+
+  // First call: ask only for the required buffer size (in bytes).
+  size_t size;
+  rc = sysctl(mib, __arraycount(mib), NULL, &size, NULL, 0);
+  KMP_ASSERT(!rc);
+  KMP_ASSERT(size);
+
+  // Over-allocate by a third in case the map grows between the two calls.
+  size = size * 4 / 3;
+  struct kinfo_vmentry *kiv = (struct kinfo_vmentry *)KMP_INTERNAL_MALLOC(size);
+  KMP_ASSERT(kiv);
+
+  // Second call: fetch the entries; size is updated to the bytes written.
+  rc = sysctl(mib, __arraycount(mib), kiv, &size, NULL, 0);
+  KMP_ASSERT(!rc);
+  KMP_ASSERT(size);
+
+  // size is a byte count, so convert it to a number of entries before
+  // indexing the array; iterating up to `size` would read past the buffer.
+  size_t entries = size / sizeof(struct kinfo_vmentry);
+  for (size_t i = 0; i < entries; i++) {
+    // An entry covers the half-open range [kve_start, kve_end).
+    if (kiv[i].kve_start <= (uint64_t)addr &&
+        (uint64_t)addr < kiv[i].kve_end) {
+      found = 1;
+      break;
+    }
+  }
+  KMP_INTERNAL_FREE(kiv);
+#elif KMP_OS_DRAGONFLY || KMP_OS_OPENBSD
+
+  // FIXME(DragonFly, OpenBSD): Implement this
+  found = 1;
+
+#else
+
+#error "Unknown or unsupported OS"
+
+#endif
+
+  return found;
+
+} // __kmp_is_address_mapped
+
+#ifdef USE_LOAD_BALANCE
+
+#if KMP_OS_DARWIN || KMP_OS_NETBSD
+
+// Returns the system load average reported by getloadavg(), converted to
+// int, for the sampling window selected by __kmp_load_balance_interval:
+//   interval < 180        -> 1 minute average
+//   180 <= interval < 600 -> 5 minute average
+//   interval >= 600       -> 15 minute average
+// Returns -1 if getloadavg() did not provide the sample that is needed.
+// The `max` argument is not used by this implementation.
+int __kmp_get_load_balance(int max) {
+  double averages[3];
+
+  // getloadavg() may return fewer samples than the 3 requested.
+  int nsamples = getloadavg(averages, 3);
+
+  // Pick which of the three averages matches the configured interval.
+  int which;
+  if (__kmp_load_balance_interval < 180) {
+    which = 0; // 1 min
+  } else if (__kmp_load_balance_interval < 600) {
+    which = 1; // 5 min
+  } else {
+    which = 2; // 15 min
+  }
+
+  // Check that the chosen sample was actually delivered.
+  int available;
+  if (which == 2) {
+    available = (nsamples == 3);
+  } else {
+    available = (nsamples >= which + 1);
+  }
+  if (!available) { // Error occurred
+    return -1;
+  }
+
+  int ret_avg = averages[which];
+  return ret_avg;
+}
+
+#else // Linux* OS
+
+// The function returns number of running (not sleeping) threads, or -1 in case
+// of error. Error could be reported if Linux* OS kernel too old (without
+// "/proc" support). Counting running threads stops if max running threads
+// encountered.
+int __kmp_get_load_balance(int max) {
+  static int permanent_error = 0;
+  static int glb_running_threads = 0; // Saved count of the running threads for
+  // the thread balance algortihm
+  static double glb_call_time = 0; /* Thread balance algorithm call time */
+
+  int running_threads = 0; // Number of running threads in the system.
+
+  DIR *proc_dir = NULL; // Handle of "/proc/" directory.
+  struct dirent *proc_entry = NULL;
+
+  kmp_str_buf_t task_path; // "/proc/<pid>/task/<tid>/" path.
+  DIR *task_dir = NULL; // Handle of "/proc/<pid>/task/<tid>/" directory.
+  struct dirent *task_entry = NULL;
+  int task_path_fixed_len;
+
+  kmp_str_buf_t stat_path; // "/proc/<pid>/task/<tid>/stat" path.
+  int stat_file = -1;
+  int stat_path_fixed_len;
+
+  int total_processes = 0; // Total number of processes in system.
+  int total_threads = 0; // Total number of threads in system.
+
+  double call_time = 0.0;
+
+  __kmp_str_buf_init(&task_path);
+  __kmp_str_buf_init(&stat_path);
+
+  __kmp_elapsed(&call_time);
+
+  if (glb_call_time &&
+      (call_time - glb_call_time < __kmp_load_balance_interval)) {
+    running_threads = glb_running_threads;
+    goto finish;
+  }
+
+  glb_call_time = call_time;
+
+  // Do not spend time on scanning "/proc/" if we have a permanent error.
+  if (permanent_error) {
+    running_threads = -1;
+    goto finish;
+  }
+
+  if (max <= 0) {
+    max = INT_MAX;
+  }
+
+  // Open "/proc/" directory.
+  proc_dir = opendir("/proc");
+  if (proc_dir == NULL) {
+    // Cannot open "/prroc/". Probably the kernel does not support it. Return an
+    // error now and in subsequent calls.
+    running_threads = -1;
+    permanent_error = 1;
+    goto finish;
+  }
+
+  // Initialize fixed part of task_path. This part will not change.
+  __kmp_str_buf_cat(&task_path, "/proc/", 6);
+  task_path_fixed_len = task_path.used; // Remember number of used characters.
+
+  proc_entry = readdir(proc_dir);
+  while (proc_entry != NULL) {
+    // Proc entry is a directory and name starts with a digit. Assume it is a
+    // process' directory.
+    if (proc_entry->d_type == DT_DIR && isdigit(proc_entry->d_name[0])) {
+
+      ++total_processes;
+      // Make sure init process is the very first in "/proc", so we can replace
+      // strcmp( proc_entry->d_name, "1" ) == 0 with simpler total_processes ==
+      // 1. We are going to check that total_processes == 1 => d_name == "1" is
+      // true (where "=>" is implication). Since C++ does not have => operator,
+      // let us replace it with its equivalent: a => b == ! a || b.
+      KMP_DEBUG_ASSERT(total_processes != 1 ||
+                       strcmp(proc_entry->d_name, "1") == 0);
+
+      // Construct task_path.
+      task_path.used = task_path_fixed_len; // Reset task_path to "/proc/".
+      __kmp_str_buf_cat(&task_path, proc_entry->d_name,
+                        KMP_STRLEN(proc_entry->d_name));
+      __kmp_str_buf_cat(&task_path, "/task", 5);
+
+      task_dir = opendir(task_path.str);
+      if (task_dir == NULL) {
+        // Process can finish between reading "/proc/" directory entry and
+        // opening process' "task/" directory. So, in general case we should not
+        // complain, but have to skip this process and read the next one. But on
+        // systems with no "task/" support we will spend lot of time to scan
+        // "/proc/" tree again and again without any benefit. "init" process
+        // (its pid is 1) should exist always, so, if we cannot open
+        // "/proc/1/task/" directory, it means "task/" is not supported by
+        // kernel. Report an error now and in the future.
+        if (strcmp(proc_entry->d_name, "1") == 0) {
+          running_threads = -1;
+          permanent_error = 1;
+          goto finish;
+        }
+      } else {
+        // Construct fixed part of stat file path.
+        __kmp_str_buf_clear(&stat_path);
+        __kmp_str_buf_cat(&stat_path, task_path.str, task_path.used);
+        __kmp_str_buf_cat(&stat_path, "/", 1);
+        stat_path_fixed_len = stat_path.used;
+
+        task_entry = readdir(task_dir);
+        while (task_entry != NULL) {
+          // It is a directory and name starts with a digit.
+          if (proc_entry->d_type == DT_DIR && isdigit(task_entry->d_name[0])) {
+            ++total_threads;
+
+            // Construct complete stat file path. Easiest way would be:
+            //  __kmp_str_buf_print( & stat_path, "%s/%s/stat", task_path.str,
+            //  task_entry->d_name );
+            // but a series of __kmp_str_buf_cat calls works a bit faster.
+            stat_path.used =
+                stat_path_fixed_len; // Reset stat path to its fixed part.
+            __kmp_str_buf_cat(&stat_path, task_entry->d_name,
+                              KMP_STRLEN(task_entry->d_name));
+            __kmp_str_buf_cat(&stat_path, "/stat", 5);
+
+            // Note: Low-level API (open/read/close) is used. High-level API
+            // (fopen/fclose)  works ~ 30 % slower.
+            stat_file = open(stat_path.str, O_RDONLY);
+            if (stat_file == -1) {
+              // We cannot report an error because task (thread) can terminate
+              // just before reading this file.
+            } else {
+              /* Content of "stat" file looks like:
+                 24285 (program) S ...
+
+                 It is a single line (if program name does not include funny
+                 symbols). First number is a thread id, then name of executable
+                 file in parentheses, then state of the thread. We need just
+                 thread state.
+
+                 Good news: Length of program name is 15 characters max. Longer
+                 names are truncated.
+
+                 Thus, we need rather short buffer: 15 chars for program name +
+                 2 parentheses + 3 spaces + ~7 digits of pid = 27.
+
+                 Bad news: Program name may contain special symbols like space,
+                 closing parenthesis, or even new line. This makes parsing
+                 "stat" file not 100 % reliable. In case of funny program names
+                 parsing may fail (report incorrect thread state).
+
+                 Parsing "status" file looks more promising (due to different
+                 file structure and escaping special symbols) but reading and
+                 parsing of "status" file works slower.
+                  -- ln
+              */
+              char buffer[65];
+              int len;
+              len = read(stat_file, buffer, sizeof(buffer) - 1);
+              if (len >= 0) {
+                buffer[len] = 0;
+                // Using scanf:
+                //     sscanf( buffer, "%*d (%*s) %c ", & state );
+                // looks very nice, but searching for a closing parenthesis
+                // works a bit faster.
+                char *close_parent = strstr(buffer, ") ");
+                if (close_parent != NULL) {
+                  char state = *(close_parent + 2);
+                  if (state == 'R') {
+                    ++running_threads;
+                    if (running_threads >= max) {
+                      goto finish;
+                    }
+                  }
+                }
+              }
+              close(stat_file);
+              stat_file = -1;
+            }
+          }
+          task_entry = readdir(task_dir);
+        }
+        closedir(task_dir);
+        task_dir = NULL;
+      }
+    }
+    proc_entry = readdir(proc_dir);
+  }
+
+  // There _might_ be a timing hole where the thread executing this
+  // code gets skipped in the load balance, and running_threads is 0.
+  // Assert in the debug builds only!!!
+  KMP_DEBUG_ASSERT(running_threads > 0);
+  if (running_threads <= 0) {
+    running_threads = 1;
+  }
+
+finish: // Clean up and exit.
+  if (proc_dir != NULL) {
+    closedir(proc_dir);
+  }
+  __kmp_str_buf_free(&task_path);
+  if (task_dir != NULL) {
+    closedir(task_dir);
+  }
+  __kmp_str_buf_free(&stat_path);
+  if (stat_file != -1) {
+    close(stat_file);
+  }
+
+  glb_running_threads = running_threads;
+
+  return running_threads;
+
+} // __kmp_get_load_balance
+
+#endif // KMP_OS_DARWIN
+
+#endif // USE_LOAD_BALANCE
+
+#if !(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_MIC ||                            \
+      ((KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64) || KMP_ARCH_PPC64)
+
+// Generic C fallback for invoking an outlined microtask. In practice only the
+// single-argument case is needed, because Clang always builds one struct of
+// pointers to the shared variables referenced in the outlined function.
+//
+//   pkfn   - microtask entry point; it always receives &gtid and &tid first.
+//   argc   - number of p_argv entries to forward (0..15 are supported).
+//   p_argv - extra pointer arguments, forwarded in order.
+//
+// With OMPT_SUPPORT, *exit_frame_ptr is set from OMPT_GET_FRAME_ADDRESS(0)
+// before the call and cleared to 0 after it.
+// Returns 1. An argc outside 0..15 prints a diagnostic and exits the process.
+int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
+                           void *p_argv[]
+#if OMPT_SUPPORT
+                           ,
+                           void **exit_frame_ptr
+#endif
+                           ) {
+#if OMPT_SUPPORT
+  *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
+#endif
+
+  // Guard clause: only argument counts with an explicit call below are valid.
+  if (argc < 0 || argc > 15) {
+    fprintf(stderr, "Too many args to microtask: %d!\n", argc);
+    fflush(stderr);
+    exit(-1);
+  }
+
+  void **a = p_argv; // short alias to keep the call lines readable
+
+  switch (argc) {
+  case 0:
+    (*pkfn)(&gtid, &tid);
+    break;
+  case 1:
+    (*pkfn)(&gtid, &tid, a[0]);
+    break;
+  case 2:
+    (*pkfn)(&gtid, &tid, a[0], a[1]);
+    break;
+  case 3:
+    (*pkfn)(&gtid, &tid, a[0], a[1], a[2]);
+    break;
+  case 4:
+    (*pkfn)(&gtid, &tid, a[0], a[1], a[2], a[3]);
+    break;
+  case 5:
+    (*pkfn)(&gtid, &tid, a[0], a[1], a[2], a[3], a[4]);
+    break;
+  case 6:
+    (*pkfn)(&gtid, &tid, a[0], a[1], a[2], a[3], a[4], a[5]);
+    break;
+  case 7:
+    (*pkfn)(&gtid, &tid, a[0], a[1], a[2], a[3], a[4], a[5], a[6]);
+    break;
+  case 8:
+    (*pkfn)(&gtid, &tid, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
+    break;
+  case 9:
+    (*pkfn)(&gtid, &tid, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7],
+            a[8]);
+    break;
+  case 10:
+    (*pkfn)(&gtid, &tid, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7],
+            a[8], a[9]);
+    break;
+  case 11:
+    (*pkfn)(&gtid, &tid, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7],
+            a[8], a[9], a[10]);
+    break;
+  case 12:
+    (*pkfn)(&gtid, &tid, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7],
+            a[8], a[9], a[10], a[11]);
+    break;
+  case 13:
+    (*pkfn)(&gtid, &tid, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7],
+            a[8], a[9], a[10], a[11], a[12]);
+    break;
+  case 14:
+    (*pkfn)(&gtid, &tid, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7],
+            a[8], a[9], a[10], a[11], a[12], a[13]);
+    break;
+  case 15:
+    (*pkfn)(&gtid, &tid, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7],
+            a[8], a[9], a[10], a[11], a[12], a[13], a[14]);
+    break;
+  }
+
+#if OMPT_SUPPORT
+  *exit_frame_ptr = 0;
+#endif
+
+  return 1;
+}
+
+#endif
+
+// end of file //
diff --git a/final/runtime/src/z_Windows_NT-586_asm.asm b/final/runtime/src/z_Windows_NT-586_asm.asm
new file mode 100644
index 0000000..7d0e32e
--- /dev/null
+++ b/final/runtime/src/z_Windows_NT-586_asm.asm
@@ -0,0 +1,1298 @@
+;  z_Windows_NT-586_asm.asm:  - microtasking routines specifically
+;    written for IA-32 architecture and Intel(R) 64 running Windows* OS
+
+;
+;//===----------------------------------------------------------------------===//
+;//
+;// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+;// See https://llvm.org/LICENSE.txt for license information.
+;// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+;//
+;//===----------------------------------------------------------------------===//
+;
+
+        TITLE   z_Windows_NT-586_asm.asm
+
+; ============================= IA-32 architecture ==========================
+; Everything up to the matching "endif" is assembled only when _M_IA32 is
+; defined.
+ifdef _M_IA32
+
+        .586P
+
+; Assemblers reporting @Version above 510 get a .model directive; otherwise
+; the segments are declared explicitly and grouped into FLAT.
+if @Version gt 510
+        .model HUGE
+else
+_TEXT   SEGMENT PARA USE32 PUBLIC 'CODE'
+_TEXT   ENDS
+_DATA   SEGMENT DWORD USE32 PUBLIC 'DATA'
+_DATA   ENDS
+CONST   SEGMENT DWORD USE32 PUBLIC 'CONST'
+CONST   ENDS
+_BSS    SEGMENT DWORD USE32 PUBLIC 'BSS'
+_BSS    ENDS
+$$SYMBOLS       SEGMENT BYTE USE32 'DEBSYM'
+$$SYMBOLS       ENDS
+$$TYPES SEGMENT BYTE USE32 'DEBTYP'
+$$TYPES ENDS
+_TLS    SEGMENT DWORD USE32 PUBLIC 'TLS'
+_TLS    ENDS
+FLAT    GROUP _DATA, CONST, _BSS
+        ASSUME  CS: FLAT, DS: FLAT, SS: FLAT
+endif
+
+
+;------------------------------------------------------------------------
+; FUNCTION ___kmp_x86_pause
+;
+; void
+; __kmp_x86_pause( void )
+;
+; Emits the PAUSE instruction (hand-encoded as the bytes F3 90) and returns.
+; Takes no arguments and touches no registers.
+PUBLIC  ___kmp_x86_pause
+; NOTE: _p$ and _d$ are not referenced by this routine.
+_p$ = 4
+_d$ = 8
+_TEXT   SEGMENT
+        ALIGN 16
+___kmp_x86_pause PROC NEAR
+
+        db      0f3H
+        db      090H    ;; pause
+        ret
+
+___kmp_x86_pause ENDP
+_TEXT   ENDS
+
+;------------------------------------------------------------------------
+; FUNCTION ___kmp_x86_cpuid
+;
+; void
+; __kmp_x86_cpuid( int mode, int mode2, struct kmp_cpuid *p );
+;
+; Executes CPUID with eax = mode and ecx = mode2, then stores the resulting
+; eax, ebx, ecx, edx at byte offsets 0, 4, 8, 12 of *p.
+;
+; parameters (relative to ebp after the frame is set up):
+;       mode:   8(%ebp)
+;       mode2:  12(%ebp)
+;       p:      16(%ebp)
+PUBLIC  ___kmp_x86_cpuid
+_TEXT   SEGMENT
+        ALIGN 16
+_mode$  = 8
+_mode2$ = 12
+_p$     = 16
+; Offsets of the four result fields inside *p.
+_eax$   = 0
+_ebx$   = 4
+_ecx$   = 8
+_edx$   = 12
+
+___kmp_x86_cpuid PROC NEAR
+
+        push      ebp
+        mov       ebp, esp
+
+        ; Preserve every register that is modified below (other than eax).
+        push      edi
+        push      ebx
+        push      ecx
+        push      edx
+
+        mov	  eax, DWORD PTR _mode$[ebp]
+        mov	  ecx, DWORD PTR _mode2$[ebp]
+	cpuid					; Query the CPUID for the current processor
+
+        mov       edi, DWORD PTR _p$[ebp]
+	mov 	  DWORD PTR _eax$[ edi ], eax
+	mov 	  DWORD PTR _ebx$[ edi ], ebx
+	mov 	  DWORD PTR _ecx$[ edi ], ecx
+	mov 	  DWORD PTR _edx$[ edi ], edx
+
+        ; Restore in reverse push order.
+        pop       edx
+        pop       ecx
+        pop       ebx
+        pop       edi
+
+        mov       esp, ebp
+        pop       ebp
+        ret
+
+___kmp_x86_cpuid ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+; FUNCTION ___kmp_test_then_add32
+;
+; kmp_int32
+; __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
+;
+; Atomically adds d to *p (lock xadd) and returns, in eax, the value *p held
+; before the addition.
+;
+; parameters:
+;       p:      4(%esp)
+;       d:      8(%esp)
+PUBLIC  ___kmp_test_then_add32
+_p$ = 4
+_d$ = 8
+_TEXT   SEGMENT
+        ALIGN 16
+___kmp_test_then_add32 PROC NEAR
+
+        mov     eax, DWORD PTR _d$[esp]
+        mov     ecx, DWORD PTR _p$[esp]
+lock    xadd    DWORD PTR [ecx], eax    ; [ecx] += eax; eax = old [ecx]
+        ret
+
+___kmp_test_then_add32 ENDP
+_TEXT   ENDS
+
+;------------------------------------------------------------------------
+; FUNCTION ___kmp_compare_and_store8
+;
+; kmp_int8
+; __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
+;
+; Atomic 8-bit compare-and-swap: if *p == cv, stores sv into *p.
+; Returns 1 in eax when the store happened, 0 otherwise.
+;
+; parameters:
+;       p:      4(%esp)
+;       cv:     8(%esp)
+;       sv:     12(%esp)
+PUBLIC  ___kmp_compare_and_store8
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 4
+_cv$ = 8
+_sv$ = 12
+
+___kmp_compare_and_store8 PROC NEAR
+
+        mov       ecx, DWORD PTR _p$[esp]
+        mov       al, BYTE PTR _cv$[esp]
+        mov       dl, BYTE PTR _sv$[esp]
+lock    cmpxchg   BYTE PTR [ecx], dl
+        sete      al           ; if al == [ecx] set al = 1 else set al = 0
+        and       eax, 1       ; clear the upper bits: eax = 0 or 1
+        ret
+
+___kmp_compare_and_store8 ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+; FUNCTION ___kmp_compare_and_store16
+;
+; kmp_int16
+; __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
+;
+; Atomic 16-bit compare-and-swap: if *p == cv, stores sv into *p.
+; Returns 1 in eax when the store happened, 0 otherwise.
+;
+; parameters:
+;       p:      4(%esp)
+;       cv:     8(%esp)
+;       sv:     12(%esp)
+PUBLIC  ___kmp_compare_and_store16
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 4
+_cv$ = 8
+_sv$ = 12
+
+___kmp_compare_and_store16 PROC NEAR
+
+        mov       ecx, DWORD PTR _p$[esp]
+        mov       ax, WORD PTR _cv$[esp]
+        mov       dx, WORD PTR _sv$[esp]
+lock    cmpxchg   WORD PTR [ecx], dx
+        sete      al           ; if ax == [ecx] set al = 1 else set al = 0
+        and       eax, 1       ; clear the upper bits: eax = 0 or 1
+        ret
+
+___kmp_compare_and_store16 ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+; FUNCTION ___kmp_compare_and_store32
+;
+; kmp_int32
+; __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
+;
+; Atomic 32-bit compare-and-swap: if *p == cv, stores sv into *p.
+; Returns 1 in eax when the store happened, 0 otherwise.
+;
+; parameters:
+;       p:      4(%esp)
+;       cv:     8(%esp)
+;       sv:     12(%esp)
+PUBLIC  ___kmp_compare_and_store32
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 4
+_cv$ = 8
+_sv$ = 12
+
+___kmp_compare_and_store32 PROC NEAR
+
+        mov       ecx, DWORD PTR _p$[esp]
+        mov       eax, DWORD PTR _cv$[esp]
+        mov       edx, DWORD PTR _sv$[esp]
+lock    cmpxchg   DWORD PTR [ecx], edx
+        sete      al           ; if eax == [ecx] set al = 1 else set al = 0
+        and       eax, 1       ; clear the upper bits: eax = 0 or 1
+        ret
+
+___kmp_compare_and_store32 ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+; FUNCTION ___kmp_compare_and_store64
+;
+; kmp_int32
+; __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
+;
+; Atomic 64-bit compare-and-swap built on cmpxchg8b: if *p == cv, stores sv
+; into *p. Returns 1 in eax when the store happened, 0 otherwise.
+;
+; parameters (relative to ebp after the frame is set up):
+;       p:      8(%ebp)
+;       cv:     12(%ebp) low dword, 16(%ebp) high dword
+;       sv:     20(%ebp) low dword, 24(%ebp) high dword
+PUBLIC  ___kmp_compare_and_store64
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 8
+_cv_low$ = 12
+_cv_high$ = 16
+_sv_low$ = 20
+_sv_high$ = 24
+
+___kmp_compare_and_store64 PROC NEAR
+
+        push      ebp
+        mov       ebp, esp
+        push      ebx
+        push      edi
+        mov       edi, DWORD PTR _p$[ebp]
+        ; cmpxchg8b compares edx:eax with the memory operand and, on a match,
+        ; stores ecx:ebx there.
+        mov       eax, DWORD PTR _cv_low$[ebp]
+        mov       edx, DWORD PTR _cv_high$[ebp]
+        mov       ebx, DWORD PTR _sv_low$[ebp]
+        mov       ecx, DWORD PTR _sv_high$[ebp]
+lock    cmpxchg8b QWORD PTR [edi]
+        sete      al           ; if edx:eax == [edi] set al = 1 else set al = 0
+        and       eax, 1       ; clear the upper bits: eax = 0 or 1
+        pop       edi
+        pop       ebx
+        mov       esp, ebp
+        pop       ebp
+        ret
+
+___kmp_compare_and_store64 ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+; FUNCTION ___kmp_xchg_fixed8
+;
+; kmp_int8
+; __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
+;
+; Atomically stores d into *p and returns the previous byte in al.
+;
+; parameters:
+;       p:      4(%esp)
+;       d:      8(%esp)
+PUBLIC  ___kmp_xchg_fixed8
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 4
+_d$ = 8
+
+___kmp_xchg_fixed8 PROC NEAR
+
+        mov       ecx, DWORD PTR _p$[esp]
+        mov       al,  BYTE PTR _d$[esp]
+lock    xchg      BYTE PTR [ecx], al    ; swap: al = old [ecx], [ecx] = d
+        ret
+
+___kmp_xchg_fixed8 ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+; FUNCTION ___kmp_xchg_fixed16
+;
+; kmp_int16
+; __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
+;
+; Atomically stores d into *p and returns the previous word in ax.
+;
+; parameters:
+;       p:      4(%esp)
+;       d:      8(%esp)
+PUBLIC  ___kmp_xchg_fixed16
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 4
+_d$ = 8
+
+___kmp_xchg_fixed16 PROC NEAR
+
+        mov       ecx, DWORD PTR _p$[esp]
+        mov       ax,  WORD PTR  _d$[esp]
+lock    xchg      WORD PTR [ecx], ax    ; swap: ax = old [ecx], [ecx] = d
+        ret
+
+___kmp_xchg_fixed16 ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+; FUNCTION ___kmp_xchg_fixed32
+;
+; kmp_int32
+; __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
+;
+; Atomically stores d into *p and returns the previous dword in eax.
+;
+; parameters:
+;       p:      4(%esp)
+;       d:      8(%esp)
+PUBLIC  ___kmp_xchg_fixed32
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 4
+_d$ = 8
+
+___kmp_xchg_fixed32 PROC NEAR
+
+        mov       ecx, DWORD PTR _p$[esp]
+        mov       eax, DWORD PTR _d$[esp]
+lock    xchg      DWORD PTR [ecx], eax  ; swap: eax = old [ecx], [ecx] = d
+        ret
+
+___kmp_xchg_fixed32 ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+; FUNCTION ___kmp_xchg_real32
+;
+; kmp_real32
+; __kmp_xchg_real32( volatile kmp_real32 *p, kmp_real32 d );
+;
+; Atomically stores d into *p and returns the previous contents of *p as a
+; float in ST(0).
+;
+; The returned value is the one actually swapped out by the xchg, so it is
+; consistent with the store even under contention. Exactly one value is
+; pushed on the x87 register stack, so repeated calls do not accumulate
+; occupied x87 registers.
+;
+; parameters (relative to ebp after the frame is set up):
+;       p:      8(%ebp)
+;       d:      12(%ebp)
+PUBLIC  ___kmp_xchg_real32
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 8
+_d$ = 12
+_old_value$ = -4
+
+___kmp_xchg_real32 PROC NEAR
+
+        push    ebp
+        mov     ebp, esp
+        sub     esp, 4
+        push    esi
+        mov     esi, DWORD PTR _p$[ebp]
+
+        mov     eax, DWORD PTR _d$[ebp]
+
+lock    xchg    DWORD PTR [esi], eax
+                        ;; eax = bits that were in *p at the moment of the swap
+        mov     DWORD PTR _old_value$[ebp], eax
+                        ;; spill to the local so it can be loaded into the FPU
+        fld     DWORD PTR _old_value$[ebp]
+                        ;; return old_value in ST(0) (the only x87 push)
+        pop     esi
+        mov     esp, ebp
+        pop     ebp
+        ret
+
+___kmp_xchg_real32 ENDP
+_TEXT   ENDS
+
+
+;------------------------------------------------------------------------
+; FUNCTION ___kmp_compare_and_store_ret8
+;
+; kmp_int8
+; __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
+;
+; Atomic 8-bit compare-and-swap that returns the value observed in *p:
+; al is cv when the swap happened, otherwise the current contents of *p.
+;
+; parameters:
+;       p:      4(%esp)
+;       cv:     8(%esp)
+;       sv:     12(%esp)
+PUBLIC  ___kmp_compare_and_store_ret8
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 4
+_cv$ = 8
+_sv$ = 12
+
+___kmp_compare_and_store_ret8 PROC NEAR
+
+        mov       ecx, DWORD PTR _p$[esp]
+        mov       al, BYTE PTR _cv$[esp]
+        mov       dl, BYTE PTR _sv$[esp]
+lock    cmpxchg   BYTE PTR [ecx], dl    ; on mismatch al receives [ecx]
+        ret
+
+___kmp_compare_and_store_ret8 ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+; FUNCTION ___kmp_compare_and_store_ret16
+;
+; kmp_int16
+; __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
+;
+; Atomic 16-bit compare-and-swap that returns the value observed in *p:
+; ax is cv when the swap happened, otherwise the current contents of *p.
+;
+; parameters:
+;       p:      4(%esp)
+;       cv:     8(%esp)
+;       sv:     12(%esp)
+PUBLIC  ___kmp_compare_and_store_ret16
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 4
+_cv$ = 8
+_sv$ = 12
+
+___kmp_compare_and_store_ret16 PROC NEAR
+
+        mov       ecx, DWORD PTR _p$[esp]
+        mov       ax, WORD PTR _cv$[esp]
+        mov       dx, WORD PTR _sv$[esp]
+lock    cmpxchg   WORD PTR [ecx], dx    ; on mismatch ax receives [ecx]
+        ret
+
+___kmp_compare_and_store_ret16 ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+; FUNCTION ___kmp_compare_and_store_ret32
+;
+; kmp_int32
+; __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
+;
+; Atomic 32-bit compare-and-swap that returns the value observed in *p:
+; eax is cv when the swap happened, otherwise the current contents of *p.
+;
+; parameters:
+;       p:      4(%esp)
+;       cv:     8(%esp)
+;       sv:     12(%esp)
+PUBLIC  ___kmp_compare_and_store_ret32
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 4
+_cv$ = 8
+_sv$ = 12
+
+___kmp_compare_and_store_ret32 PROC NEAR
+
+        mov       ecx, DWORD PTR _p$[esp]
+        mov       eax, DWORD PTR _cv$[esp]
+        mov       edx, DWORD PTR _sv$[esp]
+lock    cmpxchg   DWORD PTR [ecx], edx  ; on mismatch eax receives [ecx]
+        ret
+
+___kmp_compare_and_store_ret32 ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+; FUNCTION ___kmp_compare_and_store_ret64
+;
+; kmp_int64
+; __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
+;
+; Atomic 64-bit compare-and-swap built on cmpxchg8b. Returns the value
+; observed in *p in edx:eax: cv when the swap happened, otherwise the
+; current contents of *p.
+;
+; parameters (relative to ebp after the frame is set up):
+;       p:      8(%ebp)
+;       cv:     12(%ebp) low dword, 16(%ebp) high dword
+;       sv:     20(%ebp) low dword, 24(%ebp) high dword
+PUBLIC  ___kmp_compare_and_store_ret64
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 8
+_cv_low$ = 12
+_cv_high$ = 16
+_sv_low$ = 20
+_sv_high$ = 24
+
+___kmp_compare_and_store_ret64 PROC NEAR
+
+        push      ebp
+        mov       ebp, esp
+        push      ebx
+        push      edi
+        mov       edi, DWORD PTR _p$[ebp]
+        ; Comparand goes in edx:eax, replacement in ecx:ebx.
+        mov       eax, DWORD PTR _cv_low$[ebp]
+        mov       edx, DWORD PTR _cv_high$[ebp]
+        mov       ebx, DWORD PTR _sv_low$[ebp]
+        mov       ecx, DWORD PTR _sv_high$[ebp]
+lock    cmpxchg8b QWORD PTR [edi]       ; on mismatch edx:eax receives [edi]
+        pop       edi
+        pop       ebx
+        mov       esp, ebp
+        pop       ebp
+        ret
+
+___kmp_compare_and_store_ret64 ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+; FUNCTION ___kmp_load_x87_fpu_control_word
+;
+; void
+; __kmp_load_x87_fpu_control_word( kmp_int16 *p );
+;
+; Loads the x87 FPU control word from the 16-bit value at *p (fldcw).
+;
+; parameters:
+;       p:      4(%esp)
+PUBLIC  ___kmp_load_x87_fpu_control_word
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 4
+
+___kmp_load_x87_fpu_control_word PROC NEAR
+
+        mov       eax, DWORD PTR _p$[esp]
+        fldcw     WORD PTR [eax]
+        ret
+
+___kmp_load_x87_fpu_control_word ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+; FUNCTION ___kmp_store_x87_fpu_control_word
+;
+; void
+; __kmp_store_x87_fpu_control_word( kmp_int16 *p );
+;
+; Stores the current x87 FPU control word into the 16-bit location *p (fstcw).
+;
+; parameters:
+;       p:      4(%esp)
+PUBLIC  ___kmp_store_x87_fpu_control_word
+_TEXT   SEGMENT
+        ALIGN 16
+_p$ = 4
+
+___kmp_store_x87_fpu_control_word PROC NEAR
+
+        mov       eax, DWORD PTR _p$[esp]
+        fstcw     WORD PTR [eax]
+        ret
+
+___kmp_store_x87_fpu_control_word ENDP
+_TEXT     ENDS
+
+;------------------------------------------------------------------------
+; FUNCTION ___kmp_clear_x87_fpu_status_word
+;
+; void
+; __kmp_clear_x87_fpu_status_word();
+;
+; Clears the x87 floating-point exception flags with fnclex (the no-wait
+; form of the clear-exceptions instruction). Takes no arguments.
+PUBLIC  ___kmp_clear_x87_fpu_status_word
+_TEXT   SEGMENT
+        ALIGN 16
+
+___kmp_clear_x87_fpu_status_word PROC NEAR
+
+        fnclex
+        ret
+
+___kmp_clear_x87_fpu_status_word ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+; FUNCTION ___kmp_invoke_microtask
+;
+; typedef void  (*microtask_t)( int *gtid, int *tid, ... );
+;
+; int
+; __kmp_invoke_microtask( microtask_t pkfn,
+;                         int gtid, int tid,
+;                         int argc, void *p_argv[] )
+;
+; Calls pkfn( &gtid, &tid, p_argv[0], ..., p_argv[argc-1] ) and returns 1.
+; When OMPT_SUPPORT is set, one more argument (a pointer, at 28(%ebp)) is
+; read and this routine's ebp is written through it before the call.
+;
+; Before pushing the arguments, esp is lowered so that it sits on a 128-byte
+; boundary at the call instruction. The total amount esp was lowered
+; (padding + arguments) is kept in the local _stk_adj$ and added back after
+; the call returns.
+PUBLIC  ___kmp_invoke_microtask
+_TEXT   SEGMENT
+        ALIGN 16
+; Incoming arguments, relative to ebp.
+_pkfn$ = 8
+_gtid$ = 12
+_tid$ = 16
+_argc$ = 20
+_argv$ = 24
+if OMPT_SUPPORT
+_exit_frame$ = 28
+endif
+; Locals in the 16 bytes reserved below ebp.
+_i$ = -8
+_stk_adj$ = -16
+_vptr$ = -12
+_qptr$ = -4
+
+___kmp_invoke_microtask PROC NEAR
+; Line 102
+        push    ebp
+        mov     ebp, esp
+        sub     esp, 16                                 ; 00000010H
+        push    ebx
+        push    esi
+        push    edi
+if OMPT_SUPPORT
+        ; *exit_frame = ebp
+        mov     eax, DWORD PTR _exit_frame$[ebp]
+        mov     DWORD PTR [eax], ebp
+endif
+; Line 114
+        ; i = argc
+        mov     eax, DWORD PTR _argc$[ebp]
+        mov     DWORD PTR _i$[ebp], eax
+
+;; ------------------------------------------------------------
+	lea     edx, DWORD PTR [eax*4+8]                ; (argc + 2) * 4 = bytes of arguments to push
+	mov     ecx, esp                                ; Save current SP into ECX
+	mov	eax,edx		; Save the size of the args in eax
+	sub	ecx,edx		; esp-((#args+2)*4) -> ecx -- without mods, stack ptr would be this
+	mov	edx,ecx		; Save to edx
+	and	ecx,-128	; Mask off 7 bits
+	sub	edx,ecx		; Amount to subtract from esp
+	sub	esp,edx		; Prepare stack ptr-- Now it will be aligned on 128-byte boundary at the call
+
+	add	edx,eax		; Calculate total size of the stack decrement.
+        mov     DWORD PTR _stk_adj$[ebp], edx
+;; ------------------------------------------------------------
+
+        ; Push p_argv[argc-1] down to p_argv[0], so p_argv[0] ends up lowest.
+        jmp     SHORT $L22237
+$L22238:
+        ; --i
+        mov     ecx, DWORD PTR _i$[ebp]
+        sub     ecx, 1
+        mov     DWORD PTR _i$[ebp], ecx
+$L22237:
+        ; loop while i > 0
+        cmp     DWORD PTR _i$[ebp], 0
+        jle     SHORT $L22239
+; Line 116
+        ; vptr = p_argv[i-1]
+        mov     edx, DWORD PTR _i$[ebp]
+        mov     eax, DWORD PTR _argv$[ebp]
+        mov     ecx, DWORD PTR [eax+edx*4-4]
+        mov     DWORD PTR _vptr$[ebp], ecx
+; Line 123
+        mov     eax, DWORD PTR _vptr$[ebp]
+; Line 124
+        push    eax
+; Line 127
+        jmp     SHORT $L22238
+$L22239:
+; Line 129
+        ; vptr = &tid
+        lea     edx, DWORD PTR _tid$[ebp]
+        mov     DWORD PTR _vptr$[ebp], edx
+; Line 130
+        ; qptr = &gtid
+        lea     eax, DWORD PTR _gtid$[ebp]
+        mov     DWORD PTR _qptr$[ebp], eax
+; Line 143
+        mov     eax, DWORD PTR _vptr$[ebp]
+; Line 144
+        push    eax                     ; second argument: &tid
+; Line 145
+        mov     eax, DWORD PTR _qptr$[ebp]
+; Line 146
+        push    eax                     ; first argument: &gtid
+; Line 147
+        call    DWORD PTR _pkfn$[ebp]
+; Line 148
+        ; Undo the alignment padding and the pushed arguments in one step.
+        add     esp, DWORD PTR _stk_adj$[ebp]
+; Line 152
+        mov     eax, 1                  ; return value
+; Line 153
+        pop     edi
+        pop     esi
+        pop     ebx
+        mov     esp, ebp
+        pop     ebp
+        ret     0
+___kmp_invoke_microtask ENDP
+_TEXT   ENDS
+
+endif
+
+; ==================================== Intel(R) 64 ===================================
+
+ifdef _M_AMD64
+
+;------------------------------------------------------------------------
+; FUNCTION __kmp_x86_cpuid
+;
+; void
+; __kmp_x86_cpuid( int mode, int mode2, struct kmp_cpuid *p );
+;
+; Executes CPUID with eax = mode and ecx = mode2, then stores the resulting
+; eax, ebx, ecx, edx at byte offsets 0, 4, 8, 12 of the buffer.
+;
+; parameters:
+;	mode:		ecx
+;	mode2:		edx
+;	cpuid_buffer: 	r8
+PUBLIC  __kmp_x86_cpuid
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_x86_cpuid PROC FRAME ;NEAR
+
+        ; Prologue; each push is paired with its unwind directive.
+        push      rbp
+        .pushreg  rbp
+        mov       rbp, rsp
+        .setframe rbp, 0
+        push      rbx				; callee-save register
+        .pushreg  rbx
+        .ENDPROLOG
+
+	mov	  r10, r8                       ; p parameter
+        mov	  eax, ecx			; mode parameter
+        mov	  ecx, edx                      ; mode2 parameter
+	cpuid					; Query the CPUID for the current processor
+
+	mov 	  DWORD PTR 0[ r10 ], eax	; store results into buffer
+	mov 	  DWORD PTR 4[ r10 ], ebx
+	mov 	  DWORD PTR 8[ r10 ], ecx
+	mov 	  DWORD PTR 12[ r10 ], edx
+
+        pop       rbx				; callee-save register
+        mov       rsp, rbp
+        pop       rbp
+        ret
+
+__kmp_x86_cpuid ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+; FUNCTION __kmp_test_then_add32
+;
+; kmp_int32
+; __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
+;
+; Atomically adds d to *p (lock xadd) and returns the value *p held before
+; the addition.
+;
+; parameters:
+;	p:	rcx
+;	d:	edx
+;
+; return: 	eax
+PUBLIC  __kmp_test_then_add32
+_TEXT   SEGMENT
+        ALIGN 16
+__kmp_test_then_add32 PROC ;NEAR
+
+        mov     eax, edx
+lock    xadd    DWORD PTR [rcx], eax    ; [rcx] += eax; eax = old [rcx]
+        ret
+
+__kmp_test_then_add32 ENDP
+_TEXT   ENDS
+
+
+;------------------------------------------------------------------------
+; FUNCTION __kmp_test_then_add64
+;
+; kmp_int64
+; __kmp_test_then_add64( volatile kmp_int64 *p, kmp_int64 d );
+;
+; Atomically adds d to *p (lock xadd) and returns the 64-bit value *p held
+; before the addition.
+;
+; parameters:
+;	p:	rcx
+;	d:	rdx
+;
+; return: 	rax
+PUBLIC  __kmp_test_then_add64
+_TEXT   SEGMENT
+        ALIGN 16
+__kmp_test_then_add64 PROC ;NEAR
+
+        mov     rax, rdx
+lock    xadd    QWORD PTR [rcx], rax    ; [rcx] += rax; rax = old [rcx]
+        ret
+
+__kmp_test_then_add64 ENDP
+_TEXT   ENDS
+
+
+;------------------------------------------------------------------------
+; FUNCTION __kmp_compare_and_store8
+;
+; kmp_int8
+; __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
+;
+; Atomic 8-bit compare-and-swap: if *p == cv, stores sv into *p.
+; Returns 1 when the store happened, 0 otherwise.
+;
+; parameters:
+;	p:	rcx
+;	cv:	edx
+;	sv:	r8d
+;
+; return:	eax
+PUBLIC  __kmp_compare_and_store8
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_compare_and_store8 PROC ;NEAR
+
+        mov       al, dl	; "cv"
+	mov	  edx, r8d	; "sv"
+lock    cmpxchg   BYTE PTR [rcx], dl
+        sete      al           	; if al == [rcx] set al = 1 else set al = 0
+        and       rax, 1       	; clear the upper bits: rax = 0 or 1
+        ret
+
+__kmp_compare_and_store8 ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+; FUNCTION __kmp_compare_and_store16
+;
+; kmp_int16
+; __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
+;
+; Atomic 16-bit compare-and-swap: if *p == cv, stores sv into *p.
+; Returns 1 when the store happened, 0 otherwise.
+;
+; parameters:
+;	p:	rcx
+;	cv:	edx
+;	sv:	r8d
+;
+; return:	eax
+PUBLIC  __kmp_compare_and_store16
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_compare_and_store16 PROC ;NEAR
+
+        mov       ax, dx	; "cv"
+	mov	  edx, r8d	; "sv"
+lock    cmpxchg   WORD PTR [rcx], dx
+        sete      al           	; if ax == [rcx] set al = 1 else set al = 0
+        and       rax, 1       	; clear the upper bits: rax = 0 or 1
+        ret
+
+__kmp_compare_and_store16 ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+; FUNCTION __kmp_compare_and_store32
+;
+; kmp_int32
+; __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
+;
+; Atomic 32-bit compare-and-swap: if *p == cv, stores sv into *p.
+; Returns 1 when the store happened, 0 otherwise.
+;
+; parameters:
+;	p:	rcx
+;	cv:	edx
+;	sv:	r8d
+;
+; return:	eax
+PUBLIC  __kmp_compare_and_store32
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_compare_and_store32 PROC ;NEAR
+
+        mov       eax, edx	; "cv"
+	mov	  edx, r8d	; "sv"
+lock    cmpxchg   DWORD PTR [rcx], edx
+        sete      al           	; if eax == [rcx] set al = 1 else set al = 0
+        and       rax, 1       	; clear the upper bits: rax = 0 or 1
+        ret
+
+__kmp_compare_and_store32 ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+; FUNCTION __kmp_compare_and_store64
+;
+; kmp_int32
+; __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
+;
+; Atomic 64-bit compare-and-swap: if *p == cv, stores sv into *p.
+; Returns 1 when the store happened, 0 otherwise.
+;
+; parameters:
+;	p:	rcx
+;	cv:	rdx
+;	sv:	r8
+;
+; return:	eax
+PUBLIC  __kmp_compare_and_store64
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_compare_and_store64 PROC ;NEAR
+
+        mov       rax, rdx	; "cv"
+	mov	  rdx, r8	; "sv"
+lock    cmpxchg   QWORD PTR [rcx], rdx
+        sete      al           ; if rax == [rcx] set al = 1 else set al = 0
+        and       rax, 1       ; clear the upper bits: rax = 0 or 1
+        ret
+
+__kmp_compare_and_store64 ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+; FUNCTION __kmp_xchg_fixed8
+;
+; kmp_int8
+; __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
+;
+; Atomically stores d into *p and returns the previous byte.
+;
+; parameters:
+;	p:	rcx
+;	d:	dl
+;
+; return: 	al
+PUBLIC  __kmp_xchg_fixed8
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_xchg_fixed8 PROC ;NEAR
+
+        mov       al,  dl
+lock    xchg      BYTE PTR [rcx], al    ; swap: al = old [rcx], [rcx] = d
+        ret
+
+__kmp_xchg_fixed8 ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+; FUNCTION __kmp_xchg_fixed16
+;
+; kmp_int16
+; __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
+;
+; Atomically stores d into *p and returns the previous word.
+;
+; parameters:
+;	p:	rcx
+;	d:	dx
+;
+; return: 	ax
+PUBLIC  __kmp_xchg_fixed16
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_xchg_fixed16 PROC ;NEAR
+
+        mov       ax,  dx
+lock    xchg      WORD PTR [rcx], ax    ; swap: ax = old [rcx], [rcx] = d
+        ret
+
+__kmp_xchg_fixed16 ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+; FUNCTION __kmp_xchg_fixed32
+;
+; kmp_int32
+; __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
+;
+; Atomically stores d into *p and returns the previous dword.
+;
+; parameters:
+;	p:	rcx
+;	d:	edx
+;
+; return: 	eax
+PUBLIC  __kmp_xchg_fixed32
+_TEXT   SEGMENT
+        ALIGN 16
+__kmp_xchg_fixed32 PROC ;NEAR
+
+        mov     eax, edx
+lock    xchg    DWORD PTR [rcx], eax    ; swap: eax = old [rcx], [rcx] = d
+        ret
+
+__kmp_xchg_fixed32 ENDP
+_TEXT   ENDS
+
+
+;------------------------------------------------------------------------
+; FUNCTION __kmp_xchg_fixed64
+;
+; kmp_int64
+; __kmp_xchg_fixed64( volatile kmp_int64 *p, kmp_int64 d );
+;
+; Atomically stores d into *p and returns the previous qword.
+;
+; parameters:
+;	p:	rcx
+;	d:	rdx
+;
+; return: 	rax
+PUBLIC  __kmp_xchg_fixed64
+_TEXT   SEGMENT
+        ALIGN 16
+__kmp_xchg_fixed64 PROC ;NEAR
+
+        mov     rax, rdx
+lock    xchg    QWORD PTR [rcx], rax    ; swap: rax = old [rcx], [rcx] = d
+        ret
+
+__kmp_xchg_fixed64 ENDP
+_TEXT   ENDS
+
+
+;------------------------------------------------------------------------
+; FUNCTION __kmp_compare_and_store_ret8
+;
+; kmp_int8
+; __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
+;
+; Atomic 8-bit compare-and-swap that returns the value observed in *p:
+; al is cv when the swap happened, otherwise the current contents of *p.
+; Only al is written; the rest of rax is left as it was on entry.
+;
+; parameters:
+;	p:	rcx
+;	cv:	edx
+;	sv:	r8d
+;
+; return:	eax
+PUBLIC  __kmp_compare_and_store_ret8
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_compare_and_store_ret8 PROC ;NEAR
+        mov       al, dl	; "cv"
+	mov	  edx, r8d	; "sv"
+lock    cmpxchg   BYTE PTR [rcx], dl
+                        ; Compare AL with [rcx].  If equal set
+                        ; ZF and exchange DL with [rcx].  Else, clear
+                        ; ZF and load [rcx] into AL.
+        ret
+
+__kmp_compare_and_store_ret8 ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+; FUNCTION __kmp_compare_and_store_ret16
+;
+; kmp_int16
+; __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
+;
+; Atomic 16-bit compare-and-swap that returns the value observed in *p:
+; ax is cv when the swap happened, otherwise the current contents of *p.
+; Only ax is written; the rest of rax is left as it was on entry.
+;
+; parameters:
+;	p:	rcx
+;	cv:	edx
+;	sv:	r8d
+;
+; return:	eax
+PUBLIC  __kmp_compare_and_store_ret16
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_compare_and_store_ret16 PROC ;NEAR
+
+        mov       ax, dx	; "cv"
+	mov	  edx, r8d	; "sv"
+lock    cmpxchg   WORD PTR [rcx], dx    ; on mismatch ax receives [rcx]
+        ret
+
+__kmp_compare_and_store_ret16 ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+; FUNCTION __kmp_compare_and_store_ret32
+;
+; Atomic compare-and-swap on a 32-bit value: if *p equals cv, sv is stored
+; to *p.  The value found at *p is returned either way.
+;
+; kmp_int32
+; __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
+; parameters:
+;	p:	rcx
+;	cv:	edx
+;	sv:	r8d
+;
+; return:	eax
+PUBLIC  __kmp_compare_and_store_ret32
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_compare_and_store_ret32 PROC ;NEAR
+
+        mov       eax, edx	; "cv"
+	mov	  edx, r8d	; "sv"
+lock    cmpxchg   DWORD PTR [rcx], edx	; on mismatch EAX <= [rcx]
+        ret
+
+__kmp_compare_and_store_ret32 ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+; FUNCTION __kmp_compare_and_store_ret64
+;
+; Atomic compare-and-swap on a 64-bit value: if *p equals cv, sv is stored
+; to *p.  The value found at *p is returned either way.
+;
+; kmp_int64
+; __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
+; parameters:
+;	p:	rcx
+;	cv:	rdx
+;	sv:	r8
+;
+; return:	rax
+PUBLIC  __kmp_compare_and_store_ret64
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_compare_and_store_ret64 PROC ;NEAR
+
+        mov       rax, rdx	; "cv"
+	mov	  rdx, r8	; "sv"
+lock    cmpxchg   QWORD PTR [rcx], rdx	; on mismatch RAX <= [rcx]
+        ret
+
+__kmp_compare_and_store_ret64 ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+; FUNCTION __kmp_compare_and_store_loop8
+;
+; Repeats an atomic one-byte compare-and-swap of *p from cv to sv until it
+; succeeds, executing PAUSE between attempts.
+;
+; kmp_int8
+; __kmp_compare_and_store_loop8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
+; parameters:
+;	p:	rcx
+;	cv:	edx
+;	sv:	r8d
+;
+; return:	al (equals cv once the exchange has succeeded)
+PUBLIC  __kmp_compare_and_store_loop8
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_compare_and_store_loop8 PROC ;NEAR
+$__kmp_loop:
+        mov       al, dl	; reload "cv"; a failed cmpxchg overwrote AL
+lock    cmpxchg   BYTE PTR [rcx], r8b
+                        ; "sv" is used straight from R8B so that DL keeps
+                        ; "cv" for every retry.
+                        ; Compare AL with [rcx].  If equal set
+                        ; ZF and exchange R8B with [rcx].  Else, clear
+                        ; ZF and load [rcx] into AL.
+        jz     	SHORT $__kmp_success
+
+        db      0f3H
+        db      090H    		; pause
+
+	jmp	SHORT $__kmp_loop
+
+$__kmp_success:
+        ret
+
+__kmp_compare_and_store_loop8 ENDP
+_TEXT     ENDS
+
+
+;------------------------------------------------------------------------
+; FUNCTION __kmp_xchg_real32
+;
+; Atomically stores the 32-bit pattern of d into *p and returns the 32-bit
+; pattern *p held beforehand.  The value is moved through eax as raw bits.
+;
+; kmp_real32
+; __kmp_xchg_real32( volatile kmp_real32 *p, kmp_real32 d );
+;
+; parameters:
+;	p:	rcx
+;       d:	xmm1 (lower 4 bytes)
+;
+; return:	xmm0 (lower 4 bytes)
+PUBLIC  __kmp_xchg_real32
+_TEXT   SEGMENT
+        ALIGN 16
+__kmp_xchg_real32 PROC ;NEAR
+
+	movd	eax, xmm1		; load d
+
+lock    xchg    DWORD PTR [rcx], eax    ; swap eax with *p; eax <= old *p
+
+	movd	xmm0, eax		; load old value into return register
+        ret
+
+__kmp_xchg_real32 ENDP
+_TEXT   ENDS
+
+
+;------------------------------------------------------------------------
+; FUNCTION __kmp_xchg_real64
+;
+; Atomically stores the 64-bit pattern of d into *p and returns the 64-bit
+; pattern *p held beforehand.  The value is moved through rax as raw bits.
+;
+; kmp_real64
+; __kmp_xchg_real64( volatile kmp_real64 *p, kmp_real64 d );
+;
+; parameters:
+;	p:	rcx
+;	d:	xmm1 (lower 8 bytes)
+;
+; return:	xmm0 (lower 8 bytes)
+PUBLIC  __kmp_xchg_real64
+_TEXT   SEGMENT
+        ALIGN 16
+__kmp_xchg_real64 PROC ;NEAR
+
+	movd	rax, xmm1		; load "d"
+
+lock    xchg    QWORD PTR [rcx], rax    ; swap rax with *p; rax <= old *p
+
+	movd	xmm0, rax		; load old value into return register
+        ret
+
+__kmp_xchg_real64 ENDP
+_TEXT   ENDS
+
+;------------------------------------------------------------------------
+; FUNCTION __kmp_load_x87_fpu_control_word
+;
+; Loads the x87 FPU control word from the 16-bit value at p.
+;
+; void
+; __kmp_load_x87_fpu_control_word( kmp_int16 *p );
+;
+; parameters:
+;	p:	rcx
+PUBLIC  __kmp_load_x87_fpu_control_word
+_TEXT   SEGMENT
+        ALIGN 16
+__kmp_load_x87_fpu_control_word PROC ;NEAR
+
+        fldcw   WORD PTR [rcx]          ; control word <= *p
+        ret
+
+__kmp_load_x87_fpu_control_word ENDP
+_TEXT   ENDS
+
+
+;------------------------------------------------------------------------
+; FUNCTION __kmp_store_x87_fpu_control_word
+;
+; Stores the current x87 FPU control word into the 16-bit location at p.
+;
+; void
+; __kmp_store_x87_fpu_control_word( kmp_int16 *p );
+;
+; parameters:
+;	p:	rcx
+PUBLIC  __kmp_store_x87_fpu_control_word
+_TEXT   SEGMENT
+        ALIGN 16
+__kmp_store_x87_fpu_control_word PROC ;NEAR
+
+        fstcw   WORD PTR [rcx]          ; *p <= control word
+        ret
+
+__kmp_store_x87_fpu_control_word ENDP
+_TEXT   ENDS
+
+
+;------------------------------------------------------------------------
+; FUNCTION __kmp_clear_x87_fpu_status_word
+;
+; Clears the exception flags in the x87 FPU status word.  Takes no
+; arguments and returns nothing.
+;
+; void
+; __kmp_clear_x87_fpu_status_word()
+PUBLIC  __kmp_clear_x87_fpu_status_word
+_TEXT   SEGMENT
+        ALIGN 16
+__kmp_clear_x87_fpu_status_word PROC ;NEAR
+
+        fnclex                          ; no-wait form of the clear
+        ret
+
+__kmp_clear_x87_fpu_status_word ENDP
+_TEXT   ENDS
+
+
+;------------------------------------------------------------------------
+; FUNCTION __kmp_invoke_microtask
+;
+; Calls the microtask pkfn with pointers to gtid and tid followed by the
+; argc entries of p_argv, after realigning the stack, and returns 1.
+;
+; typedef void  (*microtask_t)( int *gtid, int *tid, ... );
+;
+; int
+; __kmp_invoke_microtask( microtask_t pkfn,
+;                         int gtid, int tid,
+;                         int argc, void *p_argv[] ) {
+;
+;     (*pkfn) ( &gtid, &tid, argv[0], ... );
+;     return 1;
+; }
+;
+; note:
+;      just before call to pkfn must have rsp 128-byte aligned for compiler
+;
+; parameters:
+;      rcx:   pkfn	16[rbp]
+;      edx:   gtid	24[rbp]
+;      r8d:   tid	32[rbp]
+;      r9d:   argc	40[rbp]
+;      [st]:  p_argv	48[rbp]
+;      [st]:  exit_frame	56[rbp]  (read only when OMPT_SUPPORT is set)
+;
+; reg temps:
+;      rax:   used all over the place
+;      rdx:   used all over the place
+;      rcx:   used as argument counter for push parms loop
+;      r10:   used to hold pkfn function pointer argument
+;
+; return:      eax    (always 1/TRUE)
+$_pkfn   = 16
+$_gtid   = 24
+$_tid    = 32
+$_argc   = 40
+$_p_argv = 48
+if OMPT_SUPPORT
+$_exit_frame = 56
+endif
+
+PUBLIC  __kmp_invoke_microtask
+_TEXT   SEGMENT
+        ALIGN 16
+
+__kmp_invoke_microtask PROC FRAME ;NEAR
+				; 16[rsp]/24[rsp] here are 24[rbp]/32[rbp] once rbp
+				; has been pushed and set up below
+	mov	QWORD PTR 16[rsp], rdx	; home gtid parameter
+	mov 	QWORD PTR 24[rsp], r8	; home tid parameter
+        push    rbp		; save base pointer
+        .pushreg rbp
+	sub	rsp, 0		; no fixed allocation necessary - end prolog
+
+        lea     rbp, QWORD PTR [rsp]   	; establish the base pointer
+        .setframe rbp, 0
+        .ENDPROLOG
+if OMPT_SUPPORT
+        mov     rax, QWORD PTR $_exit_frame[rbp]	; rax <= exit_frame argument
+        mov     QWORD PTR [rax], rbp	; *exit_frame <= this frame's rbp
+endif
+	mov	r10, rcx	; save pkfn pointer for later
+
+;; ------------------------------------------------------------
+        mov     rax, r9		; rax <= argc
+        cmp     rax, 2
+        jge     SHORT $_kmp_invoke_stack_align
+        mov     rax, 2          ; set 4 homes if less than 2 parms
+$_kmp_invoke_stack_align:
+	lea     rdx, QWORD PTR [rax*8+16] ; rdx <= (max(argc,2) + 2) * 8
+	mov     rax, rsp        ; Save current SP into rax
+	sub	rax, rdx	; rsp - rdx -> rax
+				; without align, rsp would be this
+	and     rax, -128       ; Mask off 7 bits (128-byte align)
+	add     rax, rdx        ; add space for push's in a loop below
+	mov     rsp, rax        ; Prepare the stack ptr
+				; Now it will align to 128-byte at the call
+;; ------------------------------------------------------------
+        			; setup pkfn parameter stack
+	mov	rax, r9		; rax <= argc
+	shl	rax, 3		; rax <= argc*8
+	mov	rdx, QWORD PTR $_p_argv[rbp]	; rdx <= p_argv
+	add	rdx, rax	; rdx <= &p_argv[argc]
+	mov	rcx, r9		; rcx <= argc
+	jecxz	SHORT $_kmp_invoke_pass_parms	; nothing to push if argc=0
+	cmp	ecx, 1		; if argc=1 branch ahead
+	je	SHORT $_kmp_invoke_one_parm
+	sub	ecx, 2		; two parms travel in r8/r9; ecx <= argc-2
+	je	SHORT $_kmp_invoke_two_parms	; if argc=2 branch ahead
+
+$_kmp_invoke_push_parms:	; push last - 5th parms to pkfn on stack
+	sub	rdx, 8		; decrement p_argv pointer to previous parm
+	mov 	r8, QWORD PTR [rdx] ; r8 <= p_argv[rcx-1]
+	push	r8		; push p_argv[rcx-1] onto stack (reverse order)
+	sub	ecx, 1
+	jecxz	SHORT $_kmp_invoke_two_parms
+	jmp	SHORT $_kmp_invoke_push_parms
+
+$_kmp_invoke_two_parms:
+	sub	rdx, 8		; put 4th parm to pkfn in r9
+	mov	r9, QWORD PTR [rdx] ; r9 <= p_argv[1]
+
+$_kmp_invoke_one_parm:
+        sub	rdx, 8		; put 3rd parm to pkfn in r8
+	mov	r8, QWORD PTR [rdx] ; r8 <= p_argv[0]
+
+$_kmp_invoke_pass_parms:	; put 1st & 2nd parms to pkfn in registers
+	lea	rdx, QWORD PTR $_tid[rbp]  ; rdx <= &tid (2nd parm to pkfn)
+	lea	rcx, QWORD PTR $_gtid[rbp] ; rcx <= &gtid (1st parm to pkfn)
+        sub     rsp, 32         ; add stack space for first four parms
+	mov	rax, r10	; rax <= pkfn
+	call	rax		; call (*pkfn)()
+	mov	rax, 1		; move 1 into return register;
+
+        lea     rsp, QWORD PTR [rbp]	; restore stack pointer
+
+;	add	rsp, 0		; no fixed allocation necessary - start epilog
+        pop     rbp		; restore frame pointer
+        ret
+__kmp_invoke_microtask ENDP
+_TEXT   ENDS
+
+endif
+
+END
diff --git a/final/runtime/src/z_Windows_NT-586_util.cpp b/final/runtime/src/z_Windows_NT-586_util.cpp
new file mode 100644
index 0000000..b3728a5
--- /dev/null
+++ b/final/runtime/src/z_Windows_NT-586_util.cpp
@@ -0,0 +1,135 @@
+/*
+ * z_Windows_NT-586_util.cpp -- platform specific routines.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+
+#if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
+/* The IA-32 architecture only provides a 32-bit "add-exchange" instruction,
+   so these routines are built on compare_and_store retry loops instead. */
+
+// Atomically ORs d into *p with a compare-and-store retry loop and returns
+// the value *p held before the update.
+kmp_int8 __kmp_test_then_or8(volatile kmp_int8 *p, kmp_int8 d) {
+  for (;;) {
+    kmp_int8 seen = TCR_1(*p);
+    kmp_int8 wanted = seen | d;
+    if (__kmp_compare_and_store8(p, seen, wanted))
+      return seen;
+    // The store did not happen; pause, then retry with a fresh read.
+    KMP_CPU_PAUSE();
+  }
+}
+
+// Atomically ANDs d into *p with a compare-and-store retry loop and returns
+// the value *p held before the update.
+kmp_int8 __kmp_test_then_and8(volatile kmp_int8 *p, kmp_int8 d) {
+  for (;;) {
+    kmp_int8 seen = TCR_1(*p);
+    kmp_int8 wanted = seen & d;
+    if (__kmp_compare_and_store8(p, seen, wanted))
+      return seen;
+    // The store did not happen; pause, then retry with a fresh read.
+    KMP_CPU_PAUSE();
+  }
+}
+
+// Atomically ORs d into *p with a compare-and-store retry loop and returns
+// the value *p held before the update.
+kmp_uint32 __kmp_test_then_or32(volatile kmp_uint32 *p, kmp_uint32 d) {
+  for (;;) {
+    kmp_uint32 seen = TCR_4(*p);
+    kmp_uint32 wanted = seen | d;
+    if (__kmp_compare_and_store32((volatile kmp_int32 *)p, seen, wanted))
+      return seen;
+    // The store did not happen; pause, then retry with a fresh read.
+    KMP_CPU_PAUSE();
+  }
+}
+
+// Atomically ANDs d into *p with a compare-and-store retry loop and returns
+// the value *p held before the update.
+kmp_uint32 __kmp_test_then_and32(volatile kmp_uint32 *p, kmp_uint32 d) {
+  for (;;) {
+    kmp_uint32 seen = TCR_4(*p);
+    kmp_uint32 wanted = seen & d;
+    if (__kmp_compare_and_store32((volatile kmp_int32 *)p, seen, wanted))
+      return seen;
+    // The store did not happen; pause, then retry with a fresh read.
+    KMP_CPU_PAUSE();
+  }
+}
+
+// Atomically adds d to *p with a compare-and-store retry loop and returns
+// the value *p held before the update.
+kmp_int8 __kmp_test_then_add8(volatile kmp_int8 *p, kmp_int8 d) {
+  // Temporaries use the operand width, like the other 8-bit routines in this
+  // file, so nothing is narrowed implicitly at the call or the return.
+  kmp_int8 old_value, new_value;
+
+  old_value = TCR_1(*p);
+  new_value = (kmp_int8)(old_value + d);
+  while (!__kmp_compare_and_store8(p, old_value, new_value)) {
+    // The store did not happen; pause, then retry with a fresh read.
+    KMP_CPU_PAUSE();
+    old_value = TCR_1(*p);
+    new_value = (kmp_int8)(old_value + d);
+  }
+  return old_value;
+}
+
+#if KMP_ARCH_X86
+// Atomically adds d to *p with a compare-and-store retry loop and returns
+// the value *p held before the update. Compiled only when KMP_ARCH_X86 is set.
+kmp_int64 __kmp_test_then_add64(volatile kmp_int64 *p, kmp_int64 d) {
+  for (;;) {
+    kmp_int64 seen = TCR_8(*p);
+    kmp_int64 wanted = seen + d;
+    if (__kmp_compare_and_store64(p, seen, wanted))
+      return seen;
+    // The store did not happen; pause, then retry with a fresh read.
+    KMP_CPU_PAUSE();
+  }
+}
+#endif /* KMP_ARCH_X86 */
+
+// Atomically ORs d into *p with a compare-and-store retry loop and returns
+// the value *p held before the update.
+kmp_uint64 __kmp_test_then_or64(volatile kmp_uint64 *p, kmp_uint64 d) {
+  for (;;) {
+    kmp_uint64 seen = TCR_8(*p);
+    kmp_uint64 wanted = seen | d;
+    if (__kmp_compare_and_store64((volatile kmp_int64 *)p, seen, wanted))
+      return seen;
+    // The store did not happen; pause, then retry with a fresh read.
+    KMP_CPU_PAUSE();
+  }
+}
+
+// Atomically ANDs d into *p with a compare-and-store retry loop and returns
+// the value *p held before the update.
+kmp_uint64 __kmp_test_then_and64(volatile kmp_uint64 *p, kmp_uint64 d) {
+  for (;;) {
+    kmp_uint64 seen = TCR_8(*p);
+    kmp_uint64 wanted = seen & d;
+    if (__kmp_compare_and_store64((volatile kmp_int64 *)p, seen, wanted))
+      return seen;
+    // The store did not happen; pause, then retry with a fresh read.
+    KMP_CPU_PAUSE();
+  }
+}
+
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
diff --git a/final/runtime/src/z_Windows_NT_util.cpp b/final/runtime/src/z_Windows_NT_util.cpp
new file mode 100644
index 0000000..c149dda
--- /dev/null
+++ b/final/runtime/src/z_Windows_NT_util.cpp
@@ -0,0 +1,1621 @@
+/*
+ * z_Windows_NT_util.cpp -- platform specific routines.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kmp.h"
+#include "kmp_affinity.h"
+#include "kmp_i18n.h"
+#include "kmp_io.h"
+#include "kmp_itt.h"
+#include "kmp_wait_release.h"
+
+/* This code is related to NtQuerySystemInformation() function. This function
+   is used in the Load balance algorithm for OMP_DYNAMIC=true to find the
+   number of running threads in the system. */
+
+#include <ntsecapi.h> // UNICODE_STRING
+#include <ntstatus.h>
+
+// Selector passed as the first argument of NtQuerySystemInformation(). Only
+// the one class this file declares is listed.
+enum SYSTEM_INFORMATION_CLASS {
+  SystemProcessInformation = 5
+}; // SYSTEM_INFORMATION_CLASS
+
+// Pair of process and thread handles; embedded in SYSTEM_THREAD as ClientId.
+struct CLIENT_ID {
+  HANDLE UniqueProcess;
+  HANDLE UniqueThread;
+}; // struct CLIENT_ID
+
+// Thread states stored in SYSTEM_THREAD::State. The enumerators take
+// consecutive values starting at 0, so their order must not be changed.
+enum THREAD_STATE {
+  StateInitialized,
+  StateReady,
+  StateRunning,
+  StateStandby,
+  StateTerminated,
+  StateWait,
+  StateTransition,
+  StateUnknown
+}; // enum THREAD_STATE
+
+// Memory counters embedded in SYSTEM_PROCESS_INFORMATION as VMCounters. The
+// field order and types fix the offsets that are checked by build-time
+// assertions on the enclosing structure, so do not reorder them.
+struct VM_COUNTERS {
+  SIZE_T PeakVirtualSize;
+  SIZE_T VirtualSize;
+  ULONG PageFaultCount;
+  SIZE_T PeakWorkingSetSize;
+  SIZE_T WorkingSetSize;
+  SIZE_T QuotaPeakPagedPoolUsage;
+  SIZE_T QuotaPagedPoolUsage;
+  SIZE_T QuotaPeakNonPagedPoolUsage;
+  SIZE_T QuotaNonPagedPoolUsage;
+  SIZE_T PagefileUsage;
+  SIZE_T PeakPagefileUsage;
+  SIZE_T PrivatePageCount;
+}; // struct VM_COUNTERS
+
+// Per-thread record; SYSTEM_PROCESS_INFORMATION ends with an array of these.
+// The layout is pinned by the build-time offset assertions that follow.
+struct SYSTEM_THREAD {
+  LARGE_INTEGER KernelTime;
+  LARGE_INTEGER UserTime;
+  LARGE_INTEGER CreateTime;
+  ULONG WaitTime;
+  LPVOID StartAddress;
+  CLIENT_ID ClientId;
+  DWORD Priority;
+  LONG BasePriority;
+  ULONG ContextSwitchCount;
+  THREAD_STATE State;
+  ULONG WaitReason;
+}; // SYSTEM_THREAD
+
+// Expected field offsets; the values differ between KMP_ARCH_X86 and the
+// other supported architectures.
+KMP_BUILD_ASSERT(offsetof(SYSTEM_THREAD, KernelTime) == 0);
+#if KMP_ARCH_X86
+KMP_BUILD_ASSERT(offsetof(SYSTEM_THREAD, StartAddress) == 28);
+KMP_BUILD_ASSERT(offsetof(SYSTEM_THREAD, State) == 52);
+#else
+KMP_BUILD_ASSERT(offsetof(SYSTEM_THREAD, StartAddress) == 32);
+KMP_BUILD_ASSERT(offsetof(SYSTEM_THREAD, State) == 68);
+#endif
+
+// Per-process record. Threads is declared with one element; NumberOfThreads
+// presumably gives the real element count and NextEntryOffset the distance to
+// the next record -- confirm against the code that walks the buffer.
+struct SYSTEM_PROCESS_INFORMATION {
+  ULONG NextEntryOffset;
+  ULONG NumberOfThreads;
+  LARGE_INTEGER Reserved[3];
+  LARGE_INTEGER CreateTime;
+  LARGE_INTEGER UserTime;
+  LARGE_INTEGER KernelTime;
+  UNICODE_STRING ImageName;
+  DWORD BasePriority;
+  HANDLE ProcessId;
+  HANDLE ParentProcessId;
+  ULONG HandleCount;
+  ULONG Reserved2[2];
+  VM_COUNTERS VMCounters;
+  IO_COUNTERS IOCounters;
+  SYSTEM_THREAD Threads[1];
+}; // SYSTEM_PROCESS_INFORMATION
+typedef SYSTEM_PROCESS_INFORMATION *PSYSTEM_PROCESS_INFORMATION;
+
+// Expected field offsets; the values differ between KMP_ARCH_X86 and the
+// other supported architectures.
+KMP_BUILD_ASSERT(offsetof(SYSTEM_PROCESS_INFORMATION, NextEntryOffset) == 0);
+KMP_BUILD_ASSERT(offsetof(SYSTEM_PROCESS_INFORMATION, CreateTime) == 32);
+KMP_BUILD_ASSERT(offsetof(SYSTEM_PROCESS_INFORMATION, ImageName) == 56);
+#if KMP_ARCH_X86
+KMP_BUILD_ASSERT(offsetof(SYSTEM_PROCESS_INFORMATION, ProcessId) == 68);
+KMP_BUILD_ASSERT(offsetof(SYSTEM_PROCESS_INFORMATION, HandleCount) == 76);
+KMP_BUILD_ASSERT(offsetof(SYSTEM_PROCESS_INFORMATION, VMCounters) == 88);
+KMP_BUILD_ASSERT(offsetof(SYSTEM_PROCESS_INFORMATION, IOCounters) == 136);
+KMP_BUILD_ASSERT(offsetof(SYSTEM_PROCESS_INFORMATION, Threads) == 184);
+#else
+KMP_BUILD_ASSERT(offsetof(SYSTEM_PROCESS_INFORMATION, ProcessId) == 80);
+KMP_BUILD_ASSERT(offsetof(SYSTEM_PROCESS_INFORMATION, HandleCount) == 96);
+KMP_BUILD_ASSERT(offsetof(SYSTEM_PROCESS_INFORMATION, VMCounters) == 112);
+KMP_BUILD_ASSERT(offsetof(SYSTEM_PROCESS_INFORMATION, IOCounters) == 208);
+KMP_BUILD_ASSERT(offsetof(SYSTEM_PROCESS_INFORMATION, Threads) == 256);
+#endif
+
+// Signature of NtQuerySystemInformation(): information class, output buffer,
+// buffer size, and a pointer that receives a length.
+typedef NTSTATUS(NTAPI *NtQuerySystemInformation_t)(SYSTEM_INFORMATION_CLASS,
+                                                    PVOID, ULONG, PULONG);
+// Starts out NULL; presumably resolved at runtime from ntdll -- confirm where
+// it is assigned.
+NtQuerySystemInformation_t NtQuerySystemInformation = NULL;
+
+// Module handle for ntdll; NULL until assigned elsewhere.
+HMODULE ntdll = NULL;
+
+/* End of NtQuerySystemInformation()-related code */
+
+// Module handle for kernel32; NULL until assigned elsewhere.
+static HMODULE kernel32 = NULL;
+
+#if KMP_HANDLE_SIGNALS
+// Per-signal bookkeeping, indexed by signal number (NSIG entries each).
+typedef void (*sig_func_t)(int);
+static sig_func_t __kmp_sighldrs[NSIG];
+static int __kmp_siginstalled[NSIG];
+#endif
+
+#if KMP_USE_MONITOR
+static HANDLE __kmp_monitor_ev;
+#endif
+static kmp_int64 __kmp_win32_time;
+double __kmp_win32_tick;
+
+// __kmp_enable()/__kmp_disable() only touch __kmp_win32_section while
+// __kmp_init_runtime is nonzero.
+int __kmp_init_runtime = FALSE;
+CRITICAL_SECTION __kmp_win32_section;
+
+// Initializes the critical section inside mx and, in ITT builds, reports the
+// new object.
+void __kmp_win32_mutex_init(kmp_win32_mutex_t *mx) {
+  CRITICAL_SECTION *section = &mx->cs;
+  InitializeCriticalSection(section);
+#if USE_ITT_BUILD
+  __kmp_itt_system_object_created(section, "Critical Section");
+#endif /* USE_ITT_BUILD */
+}
+
+// Deletes the critical section inside mx.
+void __kmp_win32_mutex_destroy(kmp_win32_mutex_t *mx) {
+  CRITICAL_SECTION *section = &mx->cs;
+  DeleteCriticalSection(section);
+}
+
+// Enters the critical section inside mx, waiting until it is available.
+void __kmp_win32_mutex_lock(kmp_win32_mutex_t *mx) {
+  CRITICAL_SECTION *section = &mx->cs;
+  EnterCriticalSection(section);
+}
+
+// Attempts to enter the critical section inside mx without waiting and
+// returns the result of TryEnterCriticalSection().
+int __kmp_win32_mutex_trylock(kmp_win32_mutex_t *mx) {
+  const BOOL entered = TryEnterCriticalSection(&mx->cs);
+  return entered;
+}
+
+// Leaves the critical section inside mx.
+void __kmp_win32_mutex_unlock(kmp_win32_mutex_t *mx) {
+  CRITICAL_SECTION *section = &mx->cs;
+  LeaveCriticalSection(section);
+}
+
+// Prepares cv for use: zeroes its counters, initializes the lock that guards
+// them, and creates the event that waiters block on.
+void __kmp_win32_cond_init(kmp_win32_cond_t *cv) {
+  // No waiters, generation zero, nothing pending release.
+  cv->release_count_ = 0;
+  cv->wait_generation_count_ = 0;
+  cv->waiters_count_ = 0;
+
+  __kmp_win32_mutex_init(&cv->waiters_count_lock_);
+
+  // Unnamed manual-reset event with default security, initially non-signaled.
+  HANDLE ev = CreateEvent(NULL, TRUE, FALSE, NULL);
+  cv->event_ = ev;
+#if USE_ITT_BUILD
+  __kmp_itt_system_object_created(cv->event_, "Event");
+#endif /* USE_ITT_BUILD */
+}
+
+// Tears down cv: destroys its lock, releases its event handle, and then
+// clears the whole structure to zero bytes.
+void __kmp_win32_cond_destroy(kmp_win32_cond_t *cv) {
+  __kmp_win32_mutex_destroy(&cv->waiters_count_lock_);
+  __kmp_free_handle(cv->event_);
+  memset(cv, 0, sizeof(kmp_win32_cond_t));
+}
+
+/* TODO associate cv with a team instead of a thread so as to optimize
+   the case where we wake up a whole team */
+
+// Blocks the caller on cv. mx must be held on entry: it is released while
+// waiting and held again on return. The wait ends either when a broadcast
+// starts a new generation, or when a timed-out/failed wait finds that the
+// flag's condition is already satisfied.
+template <class C>
+static void __kmp_win32_cond_wait(kmp_win32_cond_t *cv, kmp_win32_mutex_t *mx,
+                                  kmp_info_t *th, C *flag) {
+  int my_generation;
+  int last_waiter;
+
+  /* Avoid race conditions */
+  __kmp_win32_mutex_lock(&cv->waiters_count_lock_);
+
+  /* Increment count of waiters */
+  cv->waiters_count_++;
+
+  /* Store current generation in our activation record. */
+  my_generation = cv->wait_generation_count_;
+
+  __kmp_win32_mutex_unlock(&cv->waiters_count_lock_);
+  __kmp_win32_mutex_unlock(mx);
+
+  for (;;) {
+    int wait_done = 0;
+    DWORD res, timeout = 5000; // a guess at an appropriate number (msec)
+    /* Wait until the event is signaled */
+    res = WaitForSingleObject(cv->event_, timeout);
+
+    if (res == WAIT_OBJECT_0) {
+      // event signaled
+      __kmp_win32_mutex_lock(&cv->waiters_count_lock_);
+      /* Exit the loop when the <cv->event_> is signaled and there are still
+         waiting threads from this <wait_generation> that haven't been released
+         from this wait yet. */
+      wait_done = (cv->release_count_ > 0) &&
+                  (cv->wait_generation_count_ != my_generation);
+      __kmp_win32_mutex_unlock(&cv->waiters_count_lock_);
+    } else if (res == WAIT_TIMEOUT || res == WAIT_FAILED) {
+      // Check whether the flag and the cv counters are in a consistent state,
+      // since MS sent us a debug dump with an inconsistent state of the data.
+      __kmp_win32_mutex_lock(mx);
+      typename C::flag_t old_f = flag->set_sleeping();
+      if (!flag->done_check_val(old_f & ~KMP_BARRIER_SLEEP_STATE)) {
+        // Condition not met yet: keep the sleep bit set and wait again.
+        __kmp_win32_mutex_unlock(mx);
+        continue;
+      }
+      // condition fulfilled, exiting
+      old_f = flag->unset_sleeping();
+      KMP_DEBUG_ASSERT(old_f & KMP_BARRIER_SLEEP_STATE);
+      TCW_PTR(th->th.th_sleep_loc, NULL);
+      KF_TRACE(50, ("__kmp_win32_cond_wait: exiting, condition "
+                    "fulfilled: flag's loc(%p): %u => %u\n",
+                    flag->get(), old_f, *(flag->get())));
+
+      // Update the counters the same way __kmp_win32_cond_broadcast does, so
+      // the bookkeeping after the loop stays balanced.
+      __kmp_win32_mutex_lock(&cv->waiters_count_lock_);
+      KMP_DEBUG_ASSERT(cv->waiters_count_ > 0);
+      cv->release_count_ = cv->waiters_count_;
+      cv->wait_generation_count_++;
+      wait_done = 1;
+      __kmp_win32_mutex_unlock(&cv->waiters_count_lock_);
+
+      __kmp_win32_mutex_unlock(mx);
+    }
+    /* there used to be a semicolon after the if statement, it looked like a
+       bug, so i removed it */
+    if (wait_done)
+      break;
+  }
+
+  // Take mx again before touching the counters, so it is held on return.
+  __kmp_win32_mutex_lock(mx);
+  __kmp_win32_mutex_lock(&cv->waiters_count_lock_);
+
+  cv->waiters_count_--;
+  cv->release_count_--;
+
+  last_waiter = (cv->release_count_ == 0);
+
+  __kmp_win32_mutex_unlock(&cv->waiters_count_lock_);
+
+  if (last_waiter) {
+    /* We're the last waiter to be notified, so reset the manual event. */
+    ResetEvent(cv->event_);
+  }
+}
+
+// Wakes every thread currently waiting on cv. Does nothing beyond taking and
+// releasing the counter lock when there are no waiters.
+void __kmp_win32_cond_broadcast(kmp_win32_cond_t *cv) {
+  kmp_win32_mutex_t *counters_lock = &cv->waiters_count_lock_;
+  __kmp_win32_mutex_lock(counters_lock);
+
+  const bool have_waiters = cv->waiters_count_ > 0;
+  if (have_waiters) {
+    SetEvent(cv->event_);
+    // Every thread of the current generation is to be released ...
+    cv->release_count_ = cv->waiters_count_;
+    // ... and later arrivals belong to a new generation.
+    cv->wait_generation_count_++;
+  }
+
+  __kmp_win32_mutex_unlock(counters_lock);
+}
+
+// Signals cv. This simply delegates to __kmp_win32_cond_broadcast(), so all
+// current waiters are released rather than just one.
+void __kmp_win32_cond_signal(kmp_win32_cond_t *cv) {
+  __kmp_win32_cond_broadcast(cv);
+}
+
+// Leaves __kmp_win32_section if the runtime has been initialized. The
+// new_state argument is not used.
+void __kmp_enable(int new_state) {
+  if (!__kmp_init_runtime)
+    return;
+  LeaveCriticalSection(&__kmp_win32_section);
+}
+
+// Writes 0 to *old_state and, if the runtime has been initialized, enters
+// __kmp_win32_section.
+void __kmp_disable(int *old_state) {
+  *old_state = 0;
+  if (!__kmp_init_runtime)
+    return;
+  EnterCriticalSection(&__kmp_win32_section);
+}
+
+// Intentionally empty in this file: there is no global suspend state to set
+// up here.
+void __kmp_suspend_initialize(void) { /* do nothing */
+}
+
+// Makes sure th's suspend condition variable and mutex are initialized.
+// th_suspend_init acts as a three-state marker here: TRUE means ready, -1
+// means another caller is initializing, any other value means not started.
+void __kmp_suspend_initialize_thread(kmp_info_t *th) {
+  int old_value = KMP_ATOMIC_LD_RLX(&th->th.th_suspend_init);
+  int new_value = TRUE;
+  // Return if already initialized
+  if (old_value == new_value)
+    return;
+  // Wait, then return if being initialized
+  if (old_value == -1 ||
+      !__kmp_atomic_compare_store(&th->th.th_suspend_init, old_value, -1)) {
+    // Either -1 was seen or the attempt to store -1 failed: spin until the
+    // marker reads TRUE.
+    while (KMP_ATOMIC_LD_ACQ(&th->th.th_suspend_init) != new_value) {
+      KMP_CPU_PAUSE();
+    }
+  } else {
+    // Claim to be the initializer and do initializations
+    __kmp_win32_cond_init(&th->th.th_suspend_cv);
+    __kmp_win32_mutex_init(&th->th.th_suspend_mx);
+    // Publish TRUE only after both objects have been initialized.
+    KMP_ATOMIC_ST_REL(&th->th.th_suspend_init, new_value);
+  }
+}
+
+// Destroys th's suspend condition variable and mutex if th_suspend_init is
+// nonzero, then resets the marker to FALSE.
+void __kmp_suspend_uninitialize_thread(kmp_info_t *th) {
+  if (!KMP_ATOMIC_LD_ACQ(&th->th.th_suspend_init))
+    return;
+  /* The suspension objects were initialized for this thread in this instance
+     of the process, so they have to be released. */
+  __kmp_win32_cond_destroy(&th->th.th_suspend_cv);
+  __kmp_win32_mutex_destroy(&th->th.th_suspend_mx);
+  KMP_ATOMIC_ST_REL(&th->th.th_suspend_init, FALSE);
+}
+
+// Tries to take th's suspend mutex without waiting; returns the trylock
+// result.
+int __kmp_try_suspend_mx(kmp_info_t *th) {
+  kmp_win32_mutex_t *suspend_mx = &th->th.th_suspend_mx;
+  return __kmp_win32_mutex_trylock(suspend_mx);
+}
+
+// Takes th's suspend mutex, waiting until it is available.
+void __kmp_lock_suspend_mx(kmp_info_t *th) {
+  kmp_win32_mutex_t *suspend_mx = &th->th.th_suspend_mx;
+  __kmp_win32_mutex_lock(suspend_mx);
+}
+
+// Releases th's suspend mutex.
+void __kmp_unlock_suspend_mx(kmp_info_t *th) {
+  kmp_win32_mutex_t *suspend_mx = &th->th.th_suspend_mx;
+  __kmp_win32_mutex_unlock(suspend_mx);
+}
+
+/* This routine puts the calling thread to sleep after setting the
+   sleep bit for the indicated flag variable to true.
+
+   th_gtid: global thread id of the thread to suspend; indexes __kmp_threads.
+   flag:    flag object whose sleep bit is set and then waited on.
+
+   The thread's suspend mutex is held for the whole sequence and released
+   before returning. */
+template <class C>
+static inline void __kmp_suspend_template(int th_gtid, C *flag) {
+  kmp_info_t *th = __kmp_threads[th_gtid];
+  typename C::flag_t old_spin;
+
+  KF_TRACE(30, ("__kmp_suspend_template: T#%d enter for flag's loc(%p)\n",
+                th_gtid, flag->get()));
+
+  __kmp_suspend_initialize_thread(th);
+  __kmp_win32_mutex_lock(&th->th.th_suspend_mx);
+
+  KF_TRACE(10, ("__kmp_suspend_template: T#%d setting sleep bit for flag's"
+                " loc(%p)\n",
+                th_gtid, flag->get()));
+
+  /* TODO: shouldn't this use release semantics to ensure that
+     __kmp_suspend_initialize_thread gets called first? */
+  old_spin = flag->set_sleeping();
+  // With the maximum blocktime and no soft pause in effect the thread does
+  // not sleep: undo the sleep bit and leave.
+  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
+      __kmp_pause_status != kmp_soft_paused) {
+    flag->unset_sleeping();
+    __kmp_win32_mutex_unlock(&th->th.th_suspend_mx);
+    return;
+  }
+
+  KF_TRACE(5, ("__kmp_suspend_template: T#%d set sleep bit for flag's"
+               " loc(%p)==%d\n",
+               th_gtid, flag->get(), *(flag->get())));
+
+  if (flag->done_check_val(old_spin)) {
+    old_spin = flag->unset_sleeping();
+    KF_TRACE(5, ("__kmp_suspend_template: T#%d false alarm, reset sleep bit "
+                 "for flag's loc(%p)\n",
+                 th_gtid, flag->get()));
+  } else {
+#ifdef DEBUG_SUSPEND
+    __kmp_suspend_count++;
+#endif
+    /* Encapsulate in a loop as the documentation states that this may "with
+       low probability" return when the condition variable has not been signaled
+       or broadcast */
+    int deactivated = FALSE;
+    TCW_PTR(th->th.th_sleep_loc, (void *)flag);
+    while (flag->is_sleeping()) {
+      KF_TRACE(15, ("__kmp_suspend_template: T#%d about to perform "
+                    "kmp_win32_cond_wait()\n",
+                    th_gtid));
+      // Mark the thread as no longer active (only in the first iteration of the
+      // loop).
+      if (!deactivated) {
+        th->th.th_active = FALSE;
+        if (th->th.th_active_in_pool) {
+          th->th.th_active_in_pool = FALSE;
+          KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
+          KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0);
+        }
+        deactivated = TRUE;
+      }
+      // The wait itself is the same on every iteration.
+      __kmp_win32_cond_wait(&th->th.th_suspend_cv, &th->th.th_suspend_mx, th,
+                            flag);
+
+#ifdef KMP_DEBUG
+      if (flag->is_sleeping()) {
+        KF_TRACE(100,
+                 ("__kmp_suspend_template: T#%d spurious wakeup\n", th_gtid));
+      }
+#endif /* KMP_DEBUG */
+
+    } // while
+
+    // Mark the thread as active again (if it was previous marked as inactive)
+    if (deactivated) {
+      th->th.th_active = TRUE;
+      if (TCR_4(th->th.th_in_pool)) {
+        KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
+        th->th.th_active_in_pool = TRUE;
+      }
+    }
+  }
+
+  __kmp_win32_mutex_unlock(&th->th.th_suspend_mx);
+
+  KF_TRACE(30, ("__kmp_suspend_template: T#%d exit\n", th_gtid));
+}
+
+// Suspends thread th_gtid on a 32-bit flag.
+void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag) {
+  __kmp_suspend_template<kmp_flag_32>(th_gtid, flag);
+}
+// Suspends thread th_gtid on a 64-bit flag.
+void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag) {
+  __kmp_suspend_template<kmp_flag_64>(th_gtid, flag);
+}
+// Suspends thread th_gtid on an "oncore" flag.
+void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag) {
+  __kmp_suspend_template<kmp_flag_oncore>(th_gtid, flag);
+}
+
+/* This routine signals the thread specified by target_gtid to wake up
+   after setting the sleep bit indicated by the flag argument to FALSE.
+
+   target_gtid: global thread id of the thread to wake; indexes __kmp_threads.
+   flag:        flag the target sleeps on; when NULL, the target's recorded
+                th_sleep_loc is used instead.
+
+   Returns without signaling when there is no flag, the flag's type does not
+   match, or the sleep bit was already clear. */
+template <class C>
+static inline void __kmp_resume_template(int target_gtid, C *flag) {
+  kmp_info_t *th = __kmp_threads[target_gtid];
+
+#ifdef KMP_DEBUG
+  // Only referenced by the trace macros below.
+  int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1;
+#endif
+
+  KF_TRACE(30, ("__kmp_resume_template: T#%d wants to wakeup T#%d enter\n",
+                gtid, target_gtid));
+
+  __kmp_suspend_initialize_thread(th);
+  __kmp_win32_mutex_lock(&th->th.th_suspend_mx);
+
+  if (!flag) { // coming from __kmp_null_resume_wrapper
+    flag = (C *)th->th.th_sleep_loc;
+  }
+
+  // First, check if the flag is null or its type has changed. If so, someone
+  // else woke it up.
+  if (!flag || flag->get_type() != flag->get_ptr_type()) { // get_ptr_type
+    // simply shows what
+    // flag was cast to
+    KF_TRACE(5, ("__kmp_resume_template: T#%d exiting, thread T#%d already "
+                 "awake: flag's loc(%p)\n",
+                 gtid, target_gtid, NULL));
+    __kmp_win32_mutex_unlock(&th->th.th_suspend_mx);
+    return;
+  } else {
+    // Clear the sleep bit; if it was not set, there is nobody to wake.
+    typename C::flag_t old_spin = flag->unset_sleeping();
+    if (!flag->is_sleeping_val(old_spin)) {
+      KF_TRACE(5, ("__kmp_resume_template: T#%d exiting, thread T#%d already "
+                   "awake: flag's loc(%p): %u => %u\n",
+                   gtid, target_gtid, flag->get(), old_spin, *(flag->get())));
+      __kmp_win32_mutex_unlock(&th->th.th_suspend_mx);
+      return;
+    }
+  }
+  TCW_PTR(th->th.th_sleep_loc, NULL);
+  KF_TRACE(5, ("__kmp_resume_template: T#%d about to wakeup T#%d, reset sleep "
+               "bit for flag's loc(%p)\n",
+               gtid, target_gtid, flag->get()));
+
+  __kmp_win32_cond_signal(&th->th.th_suspend_cv);
+  __kmp_win32_mutex_unlock(&th->th.th_suspend_mx);
+
+  KF_TRACE(30, ("__kmp_resume_template: T#%d exiting after signaling wake up"
+                " for T#%d\n",
+                gtid, target_gtid));
+}
+
+// Wakes thread target_gtid sleeping on a 32-bit flag.
+void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag) {
+  __kmp_resume_template<kmp_flag_32>(target_gtid, flag);
+}
+// Wakes thread target_gtid sleeping on a 64-bit flag.
+void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag) {
+  __kmp_resume_template<kmp_flag_64>(target_gtid, flag);
+}
+// Wakes thread target_gtid sleeping on an "oncore" flag.
+void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag) {
+  __kmp_resume_template<kmp_flag_oncore>(target_gtid, flag);
+}
+
+// Yields the processor by sleeping for a zero-length interval.
+void __kmp_yield() {
+  Sleep(0);
+}
+
+// Stores gtid for the calling thread in the TLS slot
+// __kmp_gtid_threadprivate_key. The slot holds gtid + 1, so that a raw value
+// of 0 can stand for "nothing stored". Does nothing except trace when
+// __kmp_init_gtid is not set. A failing TlsSetValue() is fatal.
+void __kmp_gtid_set_specific(int gtid) {
+  if (__kmp_init_gtid) {
+    KA_TRACE(50, ("__kmp_gtid_set_specific: T#%d key:%d\n", gtid,
+                  __kmp_gtid_threadprivate_key));
+    // Widen to a pointer-sized integer before converting to LPVOID, matching
+    // the (kmp_intptr_t) conversion in __kmp_gtid_get_specific(); casting an
+    // int straight to a wider pointer type draws a size-mismatch warning.
+    kmp_intptr_t g = (kmp_intptr_t)gtid;
+    if (!TlsSetValue(__kmp_gtid_threadprivate_key, (LPVOID)(g + 1)))
+      KMP_FATAL(TLSSetValueFailed);
+  } else {
+    KA_TRACE(50, ("__kmp_gtid_set_specific: runtime shutdown, returning\n"));
+  }
+}
+
+// Returns the gtid stored for the calling thread in the TLS slot
+// __kmp_gtid_threadprivate_key: KMP_GTID_SHUTDOWN when __kmp_init_gtid is not
+// set, KMP_GTID_DNE when the slot reads 0, otherwise the slot value minus 1.
+int __kmp_gtid_get_specific() {
+  if (!__kmp_init_gtid) {
+    KA_TRACE(50, ("__kmp_gtid_get_specific: runtime shutdown, returning "
+                  "KMP_GTID_SHUTDOWN\n"));
+    return KMP_GTID_SHUTDOWN;
+  }
+  const int stored =
+      (int)(kmp_intptr_t)TlsGetValue(__kmp_gtid_threadprivate_key);
+  const int gtid = (stored == 0) ? KMP_GTID_DNE : stored - 1;
+  KA_TRACE(50, ("__kmp_gtid_get_specific: key:%d gtid:%d\n",
+                __kmp_gtid_threadprivate_key, gtid));
+  return gtid;
+}
+
+// Binds the calling thread to logical processor proc. With a single
+// processor group a one-bit affinity mask is applied; with several groups the
+// group number and in-group bit are derived from proc and applied through
+// __kmp_SetThreadGroupAffinity. A failure of the latter is reported as a
+// warning only when __kmp_affinity_verbose is set.
+void __kmp_affinity_bind_thread(int proc) {
+  if (__kmp_num_proc_groups <= 1) {
+    kmp_affin_mask_t *mask;
+    KMP_CPU_ALLOC_ON_STACK(mask);
+    KMP_CPU_ZERO(mask);
+    KMP_CPU_SET(proc, mask);
+    __kmp_set_system_affinity(mask, TRUE);
+    KMP_CPU_FREE_FROM_STACK(mask);
+    return;
+  }
+
+  // Form the GROUP_AFFINITY struct directly, rather than filling
+  // out a bit vector and calling __kmp_set_system_affinity().
+  GROUP_AFFINITY ga;
+  KMP_DEBUG_ASSERT((proc >= 0) && (proc < (__kmp_num_proc_groups * CHAR_BIT *
+                                           sizeof(DWORD_PTR))));
+  ga.Group = proc / (CHAR_BIT * sizeof(DWORD_PTR));
+  ga.Mask = (unsigned long long)1 << (proc % (CHAR_BIT * sizeof(DWORD_PTR)));
+  ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
+
+  KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
+  if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) != 0)
+    return;
+
+  DWORD error = GetLastError();
+  if (!__kmp_affinity_verbose) // AC: continue silently if not verbose
+    return;
+
+  kmp_msg_t err_code = KMP_ERR(error);
+  __kmp_msg(kmp_ms_warning, KMP_MSG(CantSetThreadAffMask), err_code,
+            __kmp_msg_null);
+  if (__kmp_generate_warnings == kmp_warnings_off) {
+    __kmp_str_free(&err_code.str);
+  }
+}
+
+// Enable the affinity interface and set the affinity mask size.
+// All versions of Windows* OS (since Win '95) support SetThreadAffinityMask(),
+// so no capability probing is performed; `env_var` is not consulted here.
+// The mask size is one DWORD_PTR, multiplied by the number of processor groups
+// when group affinity is compiled in.
+void __kmp_affinity_determine_capable(const char *env_var) {
+#if !KMP_GROUP_AFFINITY
+  KMP_AFFINITY_ENABLE(sizeof(DWORD_PTR));
+#else
+  KMP_AFFINITY_ENABLE(__kmp_num_proc_groups * sizeof(DWORD_PTR));
+#endif
+
+  KA_TRACE(10, ("__kmp_affinity_determine_capable: "
+                "Windows* OS affinity interface functional (mask size = "
+                "%" KMP_SIZE_T_SPEC ").\n",
+                __kmp_affin_mask_size));
+}
+
+// Return the combined kernel + user CPU time of the current process.
+// The FILETIME counts are multiplied by 100 and divided by KMP_NSEC_PER_SEC;
+// 0 is returned when GetProcessTimes() fails.
+double __kmp_read_cpu_time(void) {
+  FILETIME creation, exit_time, kernel, user;
+
+  if (!GetProcessTimes(GetCurrentProcess(), &creation, &exit_time, &kernel,
+                       &user)) {
+    return 0;
+  }
+
+  // Assemble the two 64-bit counters in floating point: sum the high words,
+  // scale them by 2^32, then add the low words.
+  double ticks = 0;
+  ticks += kernel.dwHighDateTime;
+  ticks += user.dwHighDateTime;
+  ticks *= (double)(1 << 16) * (double)(1 << 16);
+  ticks += kernel.dwLowDateTime;
+  ticks += user.dwLowDateTime;
+
+  return (ticks * 100.0) / KMP_NSEC_PER_SEC;
+}
+
+// Fill `info` with resource-usage statistics. This implementation gathers
+// nothing: every field is cleared to 0 and the function always returns 1.
+int __kmp_read_system_info(struct kmp_sys_info *info) {
+  // Memory statistics.
+  info->maxrss = 0; // maximum resident set size utilized (in kilobytes)
+  info->minflt = 0; // page faults serviced without any I/O
+  info->majflt = 0; // page faults serviced that required I/O
+  info->nswap = 0; // times the process was "swapped" out of memory
+
+  // File system statistics.
+  info->inblock = 0; // times the file system had to perform input
+  info->oublock = 0; // times the file system had to perform output
+
+  // Scheduling statistics.
+  info->nvcsw = 0; // voluntary context switches
+  info->nivcsw = 0; // forced context switches
+
+  return 1;
+}
+
+// One-time setup of the Windows* OS specific parts of the runtime. Returns
+// immediately when __kmp_init_runtime is already set. In order, it:
+//  - (dynamic library only) pins the DLL in memory,
+//  - initializes __kmp_win32_section and the performance-counter tick,
+//  - queries CPUID on x86 / x86_64 if not done yet,
+//  - selects __kmp_tls_gtid_min and allocates the gtid TLS slot,
+//  - looks up ntdll.dll (and kernel32.dll when KMP_GROUP_AFFINITY is enabled)
+//    by full system path and resolves optional entry points from them,
+//  - determines __kmp_xproc, the number of OS processors,
+//  - sets __kmp_init_runtime.
+void __kmp_runtime_initialize(void) {
+  SYSTEM_INFO info;
+  kmp_str_buf_t path;
+  UINT path_size;
+
+  if (__kmp_init_runtime) {
+    return;
+  }
+
+#if KMP_DYNAMIC_LIB
+  /* Pin dynamic library for the lifetime of application */
+  {
+    // First, turn off error message boxes
+    UINT err_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+    HMODULE h;
+    BOOL ret = GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
+                                     GET_MODULE_HANDLE_EX_FLAG_PIN,
+                                 (LPCTSTR)&__kmp_serial_initialize, &h);
+    KMP_DEBUG_ASSERT2(h && ret, "OpenMP RTL cannot find itself loaded");
+    SetErrorMode(err_mode); // Restore error mode
+    KA_TRACE(10, ("__kmp_runtime_initialize: dynamic library pinned\n"));
+  }
+#endif
+
+  InitializeCriticalSection(&__kmp_win32_section);
+#if USE_ITT_BUILD
+  __kmp_itt_system_object_created(&__kmp_win32_section, "Critical Section");
+#endif /* USE_ITT_BUILD */
+  __kmp_initialize_system_tick();
+
+#if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
+  if (!__kmp_cpuinfo.initialized) {
+    __kmp_query_cpuid(&__kmp_cpuinfo);
+  }
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+/* Set up minimum number of threads to switch to TLS gtid */
+#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
+  // Windows* OS, static library.
+  /* A new thread may use stack space previously used by another thread that
+     has since terminated. On Windows* OS, in case of static linking, we do not
+     know the moment of thread termination, and our structures (__kmp_threads
+     and __kmp_root arrays) still keep info about dead threads. This leads to a
+     problem in the __kmp_get_global_thread_id() function: it wrongly finds a
+     gtid (by searching through stack addresses of all known threads) for an
+     unregistered foreign thread.
+
+     Setting __kmp_tls_gtid_min to 0 works around this problem:
+     __kmp_get_global_thread_id() does not search through stacks, but gets the
+     gtid from TLS immediately.
+      --ln
+  */
+  __kmp_tls_gtid_min = 0;
+#else
+  __kmp_tls_gtid_min = KMP_TLS_GTID_MIN;
+#endif
+
+  /* for the static library */
+  if (!__kmp_gtid_threadprivate_key) {
+    __kmp_gtid_threadprivate_key = TlsAlloc();
+    if (__kmp_gtid_threadprivate_key == TLS_OUT_OF_INDEXES) {
+      KMP_FATAL(TLSOutOfIndexes);
+    }
+  }
+
+  // Load ntdll.dll.
+  /* Simple GetModuleHandle( "ntdll.dll" ) is not suitable due to security
+     issue (see http://www.microsoft.com/technet/security/advisory/2269637.mspx).
+     We have to specify full path to the library. */
+  __kmp_str_buf_init(&path);
+  path_size = GetSystemDirectory(path.str, path.size);
+  KMP_DEBUG_ASSERT(path_size > 0);
+  if (path_size >= path.size) {
+    // Buffer is too short.  Expand the buffer and try again.
+    __kmp_str_buf_reserve(&path, path_size);
+    path_size = GetSystemDirectory(path.str, path.size);
+    KMP_DEBUG_ASSERT(path_size > 0);
+  }
+  if (path_size > 0 && path_size < path.size) {
+    // Now we have system directory name in the buffer.
+    // Append backslash and name of dll to form full path,
+    path.used = path_size;
+    __kmp_str_buf_print(&path, "\\%s", "ntdll.dll");
+
+    // Now load ntdll using full path.
+    ntdll = GetModuleHandle(path.str);
+  }
+
+  KMP_DEBUG_ASSERT(ntdll != NULL);
+  if (ntdll != NULL) {
+    NtQuerySystemInformation = (NtQuerySystemInformation_t)GetProcAddress(
+        ntdll, "NtQuerySystemInformation");
+  }
+  KMP_DEBUG_ASSERT(NtQuerySystemInformation != NULL);
+
+#if KMP_GROUP_AFFINITY
+  // Load kernel32.dll.
+  // Same caveat - must use full system path name.
+  if (path_size > 0 && path_size < path.size) {
+    // Truncate the buffer back to just the system path length,
+    // discarding "\\ntdll.dll", and replacing it with "kernel32.dll".
+    path.used = path_size;
+    __kmp_str_buf_print(&path, "\\%s", "kernel32.dll");
+
+    // Load kernel32.dll using full path.
+    kernel32 = GetModuleHandle(path.str);
+    KA_TRACE(10, ("__kmp_runtime_initialize: kernel32.dll = %s\n", path.str));
+
+    // Load the function pointers to kernel32.dll routines
+    // that may or may not exist on this system.
+    if (kernel32 != NULL) {
+      __kmp_GetActiveProcessorCount =
+          (kmp_GetActiveProcessorCount_t)GetProcAddress(
+              kernel32, "GetActiveProcessorCount");
+      __kmp_GetActiveProcessorGroupCount =
+          (kmp_GetActiveProcessorGroupCount_t)GetProcAddress(
+              kernel32, "GetActiveProcessorGroupCount");
+      __kmp_GetThreadGroupAffinity =
+          (kmp_GetThreadGroupAffinity_t)GetProcAddress(
+              kernel32, "GetThreadGroupAffinity");
+      __kmp_SetThreadGroupAffinity =
+          (kmp_SetThreadGroupAffinity_t)GetProcAddress(
+              kernel32, "SetThreadGroupAffinity");
+
+      KA_TRACE(10, ("__kmp_runtime_initialize: __kmp_GetActiveProcessorCount"
+                    " = %p\n",
+                    __kmp_GetActiveProcessorCount));
+      KA_TRACE(10, ("__kmp_runtime_initialize: "
+                    "__kmp_GetActiveProcessorGroupCount = %p\n",
+                    __kmp_GetActiveProcessorGroupCount));
+      KA_TRACE(10, ("__kmp_runtime_initialize:__kmp_GetThreadGroupAffinity"
+                    " = %p\n",
+                    __kmp_GetThreadGroupAffinity));
+      KA_TRACE(10, ("__kmp_runtime_initialize: __kmp_SetThreadGroupAffinity"
+                    " = %p\n",
+                    __kmp_SetThreadGroupAffinity));
+      // sizeof yields a size_t, so print it with KMP_SIZE_T_SPEC rather than
+      // %d, which expects an int.
+      KA_TRACE(10, ("__kmp_runtime_initialize: sizeof(kmp_affin_mask_t) = "
+                    "%" KMP_SIZE_T_SPEC "\n",
+                    sizeof(kmp_affin_mask_t)));
+
+      // See if group affinity is supported on this system.
+      // If so, calculate the #groups and #procs.
+      //
+      // Group affinity was introduced with Windows* 7 OS and
+      // Windows* Server 2008 R2 OS.
+      if ((__kmp_GetActiveProcessorCount != NULL) &&
+          (__kmp_GetActiveProcessorGroupCount != NULL) &&
+          (__kmp_GetThreadGroupAffinity != NULL) &&
+          (__kmp_SetThreadGroupAffinity != NULL) &&
+          ((__kmp_num_proc_groups = __kmp_GetActiveProcessorGroupCount()) >
+           1)) {
+        // Calculate the total number of active OS procs.
+        int i;
+
+        KA_TRACE(10, ("__kmp_runtime_initialize: %d processor groups"
+                      " detected\n",
+                      __kmp_num_proc_groups));
+
+        __kmp_xproc = 0;
+
+        for (i = 0; i < __kmp_num_proc_groups; i++) {
+          DWORD size = __kmp_GetActiveProcessorCount(i);
+          __kmp_xproc += size;
+          KA_TRACE(10, ("__kmp_runtime_initialize: proc group %d size = %d\n",
+                        i, size));
+        }
+      } else {
+        KA_TRACE(10, ("__kmp_runtime_initialize: %d processor groups"
+                      " detected\n",
+                      __kmp_num_proc_groups));
+      }
+    }
+  }
+  // With at most one group, fall back to the processor count from
+  // GetSystemInfo().
+  if (__kmp_num_proc_groups <= 1) {
+    GetSystemInfo(&info);
+    __kmp_xproc = info.dwNumberOfProcessors;
+  }
+#else
+  GetSystemInfo(&info);
+  __kmp_xproc = info.dwNumberOfProcessors;
+#endif /* KMP_GROUP_AFFINITY */
+
+  // If the OS said there were 0 procs, take a guess and use a value of 2.
+  // This is done for Linux* OS, also.  Do we need error / warning?
+  if (__kmp_xproc <= 0) {
+    __kmp_xproc = 2;
+  }
+
+  KA_TRACE(5,
+           ("__kmp_runtime_initialize: total processors = %d\n", __kmp_xproc));
+
+  __kmp_str_buf_free(&path);
+
+#if USE_ITT_BUILD
+  __kmp_itt_initialize();
+#endif /* USE_ITT_BUILD */
+
+  __kmp_init_runtime = TRUE;
+} // __kmp_runtime_initialize
+
+// Undo __kmp_runtime_initialize(): shut down ITT (if built in), release the
+// gtid TLS slot, uninitialize affinity, delete __kmp_win32_section and reset
+// the cached module handles and function pointers. Does nothing when
+// __kmp_init_runtime is not set; clears that flag on completion.
+void __kmp_runtime_destroy(void) {
+  if (!__kmp_init_runtime) {
+    return;
+  }
+
+#if USE_ITT_BUILD
+  __kmp_itt_destroy();
+#endif /* USE_ITT_BUILD */
+
+  /* we can't DeleteCriticalsection( & __kmp_win32_section ); */
+  /* due to the KX_TRACE() commands */
+  // NOTE(review): the section is nevertheless deleted further down, after this
+  // last trace in the function - presumably the ordering is what matters;
+  // confirm.
+  KA_TRACE(40, ("__kmp_runtime_destroy\n"));
+
+  if (__kmp_gtid_threadprivate_key) {
+    TlsFree(__kmp_gtid_threadprivate_key);
+    __kmp_gtid_threadprivate_key = 0;
+  }
+
+  __kmp_affinity_uninitialize();
+  DeleteCriticalSection(&__kmp_win32_section);
+
+  ntdll = NULL;
+  NtQuerySystemInformation = NULL;
+
+  // These are set up under KMP_GROUP_AFFINITY in __kmp_runtime_initialize(),
+  // so reset them under the same guard.
+#if KMP_GROUP_AFFINITY
+  kernel32 = NULL;
+  __kmp_GetActiveProcessorCount = NULL;
+  __kmp_GetActiveProcessorGroupCount = NULL;
+  __kmp_GetThreadGroupAffinity = NULL;
+  __kmp_SetThreadGroupAffinity = NULL;
+#endif // KMP_GROUP_AFFINITY
+
+  __kmp_init_runtime = FALSE;
+}
+
+// Forcibly terminate the thread registered under `gtid` and close its handle.
+// Does nothing when no thread is registered in that slot.
+void __kmp_terminate_thread(int gtid) {
+  kmp_info_t *thr = __kmp_threads[gtid];
+  if (thr == NULL) {
+    return;
+  }
+
+  KA_TRACE(10, ("__kmp_terminate_thread: kill (%d)\n", gtid));
+
+  // A failure is deliberately ignored: the thread may have exited already.
+  (void)TerminateThread(thr->th.th_info.ds.ds_thread, (DWORD)-1);
+
+  __kmp_free_handle(thr->th.th_info.ds.ds_thread);
+}
+
+// Record the current performance-counter value in __kmp_win32_time; it is the
+// reference point that __kmp_read_system_time() subtracts from later readings.
+void __kmp_clear_system_time(void) {
+  LARGE_INTEGER time;
+  // The return value was previously stored in a local that was never read;
+  // the call is kept, the dead local is gone.
+  QueryPerformanceCounter(&time);
+  __kmp_win32_time = (kmp_int64)time.QuadPart;
+}
+
+// Compute __kmp_win32_tick as the reciprocal of the performance-counter
+// frequency. A QueryPerformanceFrequency() failure is reported through
+// __kmp_fatal().
+void __kmp_initialize_system_tick(void) {
+  LARGE_INTEGER freq;
+
+  if (QueryPerformanceFrequency(&freq)) {
+    __kmp_win32_tick = ((double)1.0) / (double)freq.QuadPart;
+    return;
+  }
+
+  DWORD error = GetLastError();
+  __kmp_fatal(KMP_MSG(FunctionError, "QueryPerformanceFrequency()"),
+              KMP_ERR(error), __kmp_msg_null);
+}
+
+/* Calculate the elapsed wall clock time for the user: the current
+   performance-counter value multiplied by __kmp_win32_tick is written to *t. */
+
+void __kmp_elapsed(double *t) {
+  LARGE_INTEGER now;
+  // The result of the call used to be stored in a local that was never read.
+  QueryPerformanceCounter(&now);
+  *t = ((double)now.QuadPart) * __kmp_win32_tick;
+}
+
+/* Report the wall clock tick to the user: *t receives __kmp_win32_tick. */
+
+void __kmp_elapsed_tick(double *t) {
+  *t = __kmp_win32_tick;
+}
+
+// Write to *delta the time elapsed since the reference point stored in
+// __kmp_win32_time, computed as the performance-counter difference multiplied
+// by __kmp_win32_tick. A NULL `delta` makes the call a no-op.
+void __kmp_read_system_time(double *delta) {
+  if (delta == NULL) {
+    return;
+  }
+
+  LARGE_INTEGER now;
+  // The result of the call used to be stored in a local that was never read.
+  QueryPerformanceCounter(&now);
+
+  *delta = ((double)(((kmp_int64)now.QuadPart) - __kmp_win32_time)) *
+           __kmp_win32_tick;
+}
+
+/* Return the current time stamp in nsec, derived from the performance counter
+   and __kmp_win32_tick. */
+kmp_uint64 __kmp_now_nsec() {
+  LARGE_INTEGER counter;
+  QueryPerformanceCounter(&counter);
+  // Same evaluation order as a single expression: (1e9 * tick) * count.
+  double nsec_per_count = 1e9 * __kmp_win32_tick;
+  return (kmp_uint64)(nsec_per_count * counter.QuadPart);
+}
+
+// Thread start routine for OpenMP worker threads. `arg` is the kmp_info_t of
+// the new worker. The routine registers the gtid in TLS, applies the initial
+// affinity mask and (on x86) the saved FP control state, marks the thread
+// alive, runs __kmp_launch_thread() and returns its result after clearing the
+// alive flag.
+extern "C"
+void *__stdcall __kmp_launch_worker(void *arg) {
+  volatile void *stack_data; // Its address is recorded as the stack base below.
+  void *exit_val;
+  void *padding = 0; // Only assigned; never read afterwards.
+  kmp_info_t *this_thr = (kmp_info_t *)arg;
+  int gtid;
+
+  gtid = this_thr->th.th_info.ds.ds_gtid;
+  __kmp_gtid_set_specific(gtid);
+#ifdef KMP_TDATA_GTID
+#error "This define causes problems with LoadLibrary() + declspec(thread) " \
+        "on Windows* OS.  See CQ50564, tests kmp_load_library*.c and this MSDN " \
+        "reference: http://support.microsoft.com/kb/118816"
+//__kmp_gtid = gtid;
+#endif
+
+#if USE_ITT_BUILD
+  __kmp_itt_thread_name(gtid);
+#endif /* USE_ITT_BUILD */
+
+  __kmp_affinity_set_init_mask(gtid, FALSE);
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+  // Set FP control regs to be a copy of the parallel initialization thread's.
+  __kmp_clear_x87_fpu_status_word();
+  __kmp_load_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
+  __kmp_load_mxcsr(&__kmp_init_mxcsr);
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+  // Consume gtid * __kmp_stkoffset bytes of this frame's stack when a stack
+  // offset is configured; __kmp_create_worker() adds the same amount to the
+  // requested stack size.
+  if (__kmp_stkoffset > 0 && gtid > 0) {
+    padding = KMP_ALLOCA(gtid * __kmp_stkoffset);
+  }
+
+  // Publish the OS thread id and mark the thread as alive.
+  KMP_FSYNC_RELEASING(&this_thr->th.th_info.ds.ds_alive);
+  this_thr->th.th_info.ds.ds_thread_id = GetCurrentThreadId();
+  TCW_4(this_thr->th.th_info.ds.ds_alive, TRUE);
+
+  if (TCR_4(__kmp_gtid_mode) <
+      2) { // check stack only if it is used to get gtid
+    TCW_PTR(this_thr->th.th_info.ds.ds_stackbase, &stack_data);
+    KMP_ASSERT(this_thr->th.th_info.ds.ds_stackgrow == FALSE);
+    __kmp_check_stack_overlap(this_thr);
+  }
+  KMP_MB();
+  exit_val = __kmp_launch_thread(this_thr);
+  // The worker is done: clear the alive flag before returning.
+  KMP_FSYNC_RELEASING(&this_thr->th.th_info.ds.ds_alive);
+  TCW_4(this_thr->th.th_info.ds.ds_alive, FALSE);
+  KMP_MB();
+  return exit_val;
+}
+
+#if KMP_USE_MONITOR
+/* The monitor thread controls all of the threads in the complex */
+
+// Thread start routine for the monitor thread. `arg` is the monitor's
+// kmp_info_t and is also the return value. The routine raises its own
+// priority, registers itself as KMP_GTID_MONITOR, then loops until
+// __kmp_global.g.g_done is set, incrementing the global time value each time
+// the wait on __kmp_monitor_ev times out. If an abort was requested
+// (g_abort != 0) it terminates the workers, cleans up and, for a positive
+// g_abort, re-raises that signal.
+void *__stdcall __kmp_launch_monitor(void *arg) {
+  DWORD wait_status;
+  kmp_thread_t monitor;
+  int status;
+  int interval;
+  kmp_info_t *this_thr = (kmp_info_t *)arg;
+
+  KMP_DEBUG_ASSERT(__kmp_init_monitor);
+  TCW_4(__kmp_init_monitor, 2); // AC: Signal library that monitor has started
+  // TODO: hide "2" in enum (like {true,false,started})
+  this_thr->th.th_info.ds.ds_thread_id = GetCurrentThreadId();
+  TCW_4(this_thr->th.th_info.ds.ds_alive, TRUE);
+
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+  KA_TRACE(10, ("__kmp_launch_monitor: launched\n"));
+
+  monitor = GetCurrentThread();
+
+  /* set thread priority */
+  status = SetThreadPriority(monitor, THREAD_PRIORITY_HIGHEST);
+  if (!status) {
+    DWORD error = GetLastError();
+    __kmp_fatal(KMP_MSG(CantSetThreadPriority), KMP_ERR(error), __kmp_msg_null);
+  }
+
+  /* register us as monitor */
+  __kmp_gtid_set_specific(KMP_GTID_MONITOR);
+#ifdef KMP_TDATA_GTID
+#error "This define causes problems with LoadLibrary() + declspec(thread) " \
+        "on Windows* OS.  See CQ50564, tests kmp_load_library*.c and this MSDN " \
+        "reference: http://support.microsoft.com/kb/118816"
+//__kmp_gtid = KMP_GTID_MONITOR;
+#endif
+
+#if USE_ITT_BUILD
+  __kmp_itt_thread_ignore(); // Instruct Intel(R) Threading Tools to ignore
+// monitor thread.
+#endif /* USE_ITT_BUILD */
+
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+  // NOTE(review): assumes __kmp_monitor_wakeups is non-zero - confirm.
+  interval = (1000 / __kmp_monitor_wakeups); /* in milliseconds */
+
+  while (!TCR_4(__kmp_global.g.g_done)) {
+    /*  This thread monitors the state of the system */
+
+    KA_TRACE(15, ("__kmp_launch_monitor: update\n"));
+
+    wait_status = WaitForSingleObject(__kmp_monitor_ev, interval);
+
+    // Only a timed-out wait advances the global time value.
+    if (wait_status == WAIT_TIMEOUT) {
+      TCW_4(__kmp_global.g.g_time.dt.t_value,
+            TCR_4(__kmp_global.g.g_time.dt.t_value) + 1);
+    }
+
+    KMP_MB(); /* Flush all pending memory write invalidates.  */
+  }
+
+  KA_TRACE(10, ("__kmp_launch_monitor: finished\n"));
+
+  // Drop back to normal priority before the shutdown work.
+  status = SetThreadPriority(monitor, THREAD_PRIORITY_NORMAL);
+  if (!status) {
+    DWORD error = GetLastError();
+    __kmp_fatal(KMP_MSG(CantSetThreadPriority), KMP_ERR(error), __kmp_msg_null);
+  }
+
+  if (__kmp_global.g.g_abort != 0) {
+    /* now we need to terminate the worker threads   */
+    /* the value of t_abort is the signal we caught */
+    int gtid;
+
+    KA_TRACE(10, ("__kmp_launch_monitor: terminate sig=%d\n",
+                  (__kmp_global.g.g_abort)));
+
+    /* terminate the OpenMP worker threads */
+    /* TODO this is not valid for sibling threads!!
+     * the uber master might not be 0 anymore.. */
+    for (gtid = 1; gtid < __kmp_threads_capacity; ++gtid)
+      __kmp_terminate_thread(gtid);
+
+    __kmp_cleanup();
+
+    Sleep(0);
+
+    KA_TRACE(10,
+             ("__kmp_launch_monitor: raise sig=%d\n", __kmp_global.g.g_abort));
+
+    // Re-raise the recorded signal; non-positive abort values are not raised.
+    if (__kmp_global.g.g_abort > 0) {
+      raise(__kmp_global.g.g_abort);
+    }
+  }
+
+  TCW_4(this_thr->th.th_info.ds.ds_alive, FALSE);
+
+  KMP_MB();
+  return arg;
+}
+#endif
+
+// Set up the OS thread for the runtime thread `th` with global id `gtid`.
+// For an uber (root) gtid no thread is created: a real handle to the calling
+// thread is duplicated and its stack info recorded. Otherwise a new thread
+// running __kmp_launch_worker() is created with `stack_size` bytes plus
+// gtid * __kmp_stkoffset as its stack size; a CreateThread() failure is fatal.
+void __kmp_create_worker(int gtid, kmp_info_t *th, size_t stack_size) {
+  kmp_thread_t handle;
+  DWORD idThread;
+
+  KA_TRACE(10, ("__kmp_create_worker: try to create thread (%d)\n", gtid));
+
+  th->th.th_info.ds.ds_gtid = gtid;
+
+  if (KMP_UBER_GTID(gtid)) {
+    int stack_data; // Its address is recorded as the stack base below.
+
+    /* TODO: GetCurrentThread() returns a pseudo-handle that is unsuitable for
+       other threads to use. Is it appropriate to just use GetCurrentThread?
+       When should we close this handle?  When unregistering the root? */
+    {
+      BOOL rc;
+      rc = DuplicateHandle(GetCurrentProcess(), GetCurrentThread(),
+                           GetCurrentProcess(), &th->th.th_info.ds.ds_thread, 0,
+                           FALSE, DUPLICATE_SAME_ACCESS);
+      KMP_ASSERT(rc);
+      KA_TRACE(10, (" __kmp_create_worker: ROOT Handle duplicated, th = %p, "
+                    "handle = %" KMP_UINTPTR_SPEC "\n",
+                    (LPVOID)th, th->th.th_info.ds.ds_thread));
+      th->th.th_info.ds.ds_thread_id = GetCurrentThreadId();
+    }
+    if (TCR_4(__kmp_gtid_mode) < 2) { // check stack only if used to get gtid
+      /* we will dynamically update the stack range if gtid_mode == 1 */
+      TCW_PTR(th->th.th_info.ds.ds_stackbase, &stack_data);
+      TCW_PTR(th->th.th_info.ds.ds_stacksize, 0);
+      TCW_4(th->th.th_info.ds.ds_stackgrow, TRUE);
+      __kmp_check_stack_overlap(th);
+    }
+  } else {
+    KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+    /* Set stack size for this thread now. */
+    KA_TRACE(10,
+             ("__kmp_create_worker: stack_size = %" KMP_SIZE_T_SPEC " bytes\n",
+              stack_size));
+
+    // Make room for the per-thread stack offset that __kmp_launch_worker()
+    // consumes with KMP_ALLOCA.
+    stack_size += gtid * __kmp_stkoffset;
+
+    TCW_PTR(th->th.th_info.ds.ds_stacksize, stack_size);
+    TCW_4(th->th.th_info.ds.ds_stackgrow, FALSE);
+
+    KA_TRACE(10,
+             ("__kmp_create_worker: (before) stack_size = %" KMP_SIZE_T_SPEC
+              " bytes, &__kmp_launch_worker = %p, th = %p, &idThread = %p\n",
+              (SIZE_T)stack_size, (LPTHREAD_START_ROUTINE)&__kmp_launch_worker,
+              (LPVOID)th, &idThread));
+
+    handle = CreateThread(
+        NULL, (SIZE_T)stack_size, (LPTHREAD_START_ROUTINE)__kmp_launch_worker,
+        (LPVOID)th, STACK_SIZE_PARAM_IS_A_RESERVATION, &idThread);
+
+    // NOTE(review): this trace reads idThread before the handle is checked;
+    // presumably it is unset when CreateThread() fails - confirm.
+    KA_TRACE(10,
+             ("__kmp_create_worker: (after) stack_size = %" KMP_SIZE_T_SPEC
+              " bytes, &__kmp_launch_worker = %p, th = %p, "
+              "idThread = %u, handle = %" KMP_UINTPTR_SPEC "\n",
+              (SIZE_T)stack_size, (LPTHREAD_START_ROUTINE)&__kmp_launch_worker,
+              (LPVOID)th, idThread, handle));
+
+    if (handle == 0) {
+      DWORD error = GetLastError();
+      __kmp_fatal(KMP_MSG(CantCreateThread), KMP_ERR(error), __kmp_msg_null);
+    } else {
+      th->th.th_info.ds.ds_thread = handle;
+    }
+
+    KMP_MB(); /* Flush all pending memory write invalidates.  */
+  }
+
+  KA_TRACE(10, ("__kmp_create_worker: done creating thread (%d)\n", gtid));
+}
+
+// Return non-zero when a zero-timeout wait on the thread's handle times out,
+// i.e. the handle is not signaled yet.
+int __kmp_still_running(kmp_info_t *th) {
+  DWORD wait_result = WaitForSingleObject(th->th.th_info.ds.ds_thread, 0);
+  return (wait_result == WAIT_TIMEOUT);
+}
+
+#if KMP_USE_MONITOR
+// Create the monitor thread for `th`. When the default blocktime is
+// KMP_MAX_BLOCKTIME no monitor is created: tid/gtid are set to 0 and
+// __kmp_init_monitor is set to 2 so that nobody waits for it. Otherwise the
+// wake-up event is created, the monitor stack size is normalized, the global
+// time value is reset and a thread running __kmp_launch_monitor() is started.
+// Failure to create the event or the thread is fatal.
+void __kmp_create_monitor(kmp_info_t *th) {
+  kmp_thread_t handle;
+  DWORD idThread;
+
+  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
+    // We don't need monitor thread in case of MAX_BLOCKTIME
+    KA_TRACE(10, ("__kmp_create_monitor: skipping monitor thread because of "
+                  "MAX blocktime\n"));
+    th->th.th_info.ds.ds_tid = 0; // this makes reap_monitor no-op
+    th->th.th_info.ds.ds_gtid = 0;
+    TCW_4(__kmp_init_monitor, 2); // Signal to stop waiting for monitor creation
+    return;
+  }
+  KA_TRACE(10, ("__kmp_create_monitor: try to create monitor\n"));
+
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+  __kmp_monitor_ev = CreateEvent(NULL, TRUE, FALSE, NULL);
+  if (__kmp_monitor_ev == NULL) {
+    DWORD error = GetLastError();
+    __kmp_fatal(KMP_MSG(CantCreateEvent), KMP_ERR(error), __kmp_msg_null);
+  }
+#if USE_ITT_BUILD
+  __kmp_itt_system_object_created(__kmp_monitor_ev, "Event");
+#endif /* USE_ITT_BUILD */
+
+  th->th.th_info.ds.ds_tid = KMP_GTID_MONITOR;
+  th->th.th_info.ds.ds_gtid = KMP_GTID_MONITOR;
+
+  // FIXME - on Windows* OS, if __kmp_monitor_stksize = 0, figure out how
+  // to automatically expand stacksize based on CreateThread error code.
+  if (__kmp_monitor_stksize == 0) {
+    __kmp_monitor_stksize = KMP_DEFAULT_MONITOR_STKSIZE;
+  }
+  // Never go below the system minimum stack size.
+  if (__kmp_monitor_stksize < __kmp_sys_min_stksize) {
+    __kmp_monitor_stksize = __kmp_sys_min_stksize;
+  }
+
+  KA_TRACE(10, ("__kmp_create_monitor: requested stacksize = %d bytes\n",
+                (int)__kmp_monitor_stksize));
+
+  TCW_4(__kmp_global.g.g_time.dt.t_value, 0);
+
+  handle =
+      CreateThread(NULL, (SIZE_T)__kmp_monitor_stksize,
+                   (LPTHREAD_START_ROUTINE)__kmp_launch_monitor, (LPVOID)th,
+                   STACK_SIZE_PARAM_IS_A_RESERVATION, &idThread);
+  if (handle == 0) {
+    DWORD error = GetLastError();
+    __kmp_fatal(KMP_MSG(CantCreateThread), KMP_ERR(error), __kmp_msg_null);
+  } else {
+    th->th.th_info.ds.ds_thread = handle;
+  }
+
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+  KA_TRACE(10, ("__kmp_create_monitor: monitor created %p\n",
+                (void *)th->th.th_info.ds.ds_thread));
+}
+#endif
+
+/* Check to see if thread is still alive.
+   NOTE:  The ExitProcess(code) system call causes all threads to terminate
+   with an exit_val = code.  Because of this we cannot rely on exit_val having
+   any particular value.  So this routine may return STILL_ACTIVE in exit_val
+   even after the thread is dead. */
+
+// Query the exit code of the thread behind `th` into *exit_val and return
+// non-zero when that code equals STILL_ACTIVE. A GetExitCodeThread() failure
+// is reported through __kmp_fatal().
+int __kmp_is_thread_alive(kmp_info_t *th, DWORD *exit_val) {
+  if (GetExitCodeThread(th->th.th_info.ds.ds_thread, exit_val) == 0) {
+    DWORD error = GetLastError();
+    __kmp_fatal(KMP_MSG(FunctionError, "GetExitCodeThread()"), KMP_ERR(error),
+                __kmp_msg_null);
+  }
+  return (*exit_val == STILL_ACTIVE);
+}
+
+// End the calling thread via ExitThread(), passing `exit_status` as its exit
+// code.
+void __kmp_exit_thread(int exit_status) {
+  ExitThread(exit_status);
+}
+
+// This is a common part for both __kmp_reap_worker() and __kmp_reap_monitor().
+// It spins until either the OS reports the thread as no longer STILL_ACTIVE or
+// the thread has cleared its ds_alive flag, closes the thread handle, and then
+// resets the handle, tid, gtid and thread id fields of `th`.
+static void __kmp_reap_common(kmp_info_t *th) {
+  DWORD exit_val;
+
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+  KA_TRACE(
+      10, ("__kmp_reap_common: try to reap (%d)\n", th->th.th_info.ds.ds_gtid));
+
+  /* 2006-10-19:
+     There are two opposite situations:
+     1. Windows* OS keep thread alive after it resets ds_alive flag and
+     exits from thread function. (For example, see C70770/Q394281 "unloading of
+     dll based on OMP is very slow".)
+     2. Windows* OS may kill thread before it resets ds_alive flag.
+
+     Right solution seems to be waiting for *either* thread termination *or*
+     ds_alive resetting. */
+  {
+    // TODO: This code is very similar to KMP_WAIT. Need to generalize
+    // KMP_WAIT to cover this usage also.
+    void *obj = NULL;
+    kmp_uint32 spins;
+#if USE_ITT_BUILD
+    KMP_FSYNC_SPIN_INIT(obj, (void *)&th->th.th_info.ds.ds_alive);
+#endif /* USE_ITT_BUILD */
+    KMP_INIT_YIELD(spins);
+    do {
+#if USE_ITT_BUILD
+      KMP_FSYNC_SPIN_PREPARE(obj);
+#endif /* USE_ITT_BUILD */
+      // Refresh exit_val on every iteration; the loop condition reads it.
+      __kmp_is_thread_alive(th, &exit_val);
+      KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
+    } while (exit_val == STILL_ACTIVE && TCR_4(th->th.th_info.ds.ds_alive));
+#if USE_ITT_BUILD
+    if (exit_val == STILL_ACTIVE) {
+      KMP_FSYNC_CANCEL(obj);
+    } else {
+      KMP_FSYNC_SPIN_ACQUIRED(obj);
+    }
+#endif /* USE_ITT_BUILD */
+  }
+
+  __kmp_free_handle(th->th.th_info.ds.ds_thread);
+
+  /* NOTE:  The ExitProcess(code) system call causes all threads to Terminate
+     with a exit_val = code.  Because of this we can not rely on exit_val having
+     any particular value. */
+  // The checks below only emit trace messages; they do not change the outcome.
+  if (exit_val == STILL_ACTIVE) {
+    KA_TRACE(1, ("__kmp_reap_common: thread still active.\n"));
+  } else if ((void *)exit_val != (void *)th) {
+    KA_TRACE(1, ("__kmp_reap_common: ExitProcess / TerminateThread used?\n"));
+  }
+
+  // The handle is already closed here; only its value is printed.
+  KA_TRACE(10,
+           ("__kmp_reap_common: done reaping (%d), handle = %" KMP_UINTPTR_SPEC
+            "\n",
+            th->th.th_info.ds.ds_gtid, th->th.th_info.ds.ds_thread));
+
+  th->th.th_info.ds.ds_thread = 0;
+  th->th.th_info.ds.ds_tid = KMP_GTID_DNE;
+  th->th.th_info.ds.ds_gtid = KMP_GTID_DNE;
+  th->th.th_info.ds.ds_thread_id = 0;
+
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+}
+
+#if KMP_USE_MONITOR
+// Shut down and reap the monitor thread described by `th`: signal
+// __kmp_monitor_ev, reap the thread through __kmp_reap_common() and close the
+// event handle. Returns early when the recorded gtid is not KMP_GTID_MONITOR.
+void __kmp_reap_monitor(kmp_info_t *th) {
+  KA_TRACE(10, ("__kmp_reap_monitor: try to reap %p\n",
+                (void *)th->th.th_info.ds.ds_thread));
+
+  // tid and gtid always carry the same value for the monitor:
+  //   KMP_GTID_MONITOR - the monitor has been created,
+  //   0                - the monitor did not ever start,
+  //   KMP_GTID_DNE     - the monitor has been shut down.
+  KMP_DEBUG_ASSERT(th->th.th_info.ds.ds_tid == th->th.th_info.ds.ds_gtid);
+  if (th->th.th_info.ds.ds_gtid != KMP_GTID_MONITOR) {
+    KA_TRACE(10, ("__kmp_reap_monitor: monitor did not start, returning\n"));
+    return;
+  }
+
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+
+  // Signal the event the monitor waits on.
+  if (SetEvent(__kmp_monitor_ev) == FALSE) {
+    DWORD error = GetLastError();
+    __kmp_fatal(KMP_MSG(CantSetEvent), KMP_ERR(error), __kmp_msg_null);
+  }
+
+  KA_TRACE(10, ("__kmp_reap_monitor: reaping thread (%d)\n",
+                th->th.th_info.ds.ds_gtid));
+  __kmp_reap_common(th);
+
+  __kmp_free_handle(__kmp_monitor_ev);
+
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+}
+#endif
+
+// Reap a worker thread: emit a trace message, then delegate all the work to
+// __kmp_reap_common().
+void __kmp_reap_worker(kmp_info_t *th) {
+  KA_TRACE(10, ("__kmp_reap_worker: reaping thread (%d)\n",
+                th->th.th_info.ds.ds_gtid));
+
+  __kmp_reap_common(th);
+}
+
+#if KMP_HANDLE_SIGNALS
+
+// Signal handler installed by the runtime. On the first signal (g_abort still
+// 0) it dumps the debug buffer if enabled, records the signal number in
+// g_abort and sets g_done; later signals are ignored.
+static void __kmp_team_handler(int signo) {
+  if (__kmp_global.g.g_abort != 0) {
+    return; // An abort is already in progress.
+  }
+
+  // Stage 1 signal handler, let's shut down all of the threads.
+  if (__kmp_debug_buf) {
+    __kmp_dump_debug_buffer();
+  }
+  KMP_MB(); // Flush all pending memory write invalidates.
+  TCW_4(__kmp_global.g.g_abort, signo);
+  KMP_MB(); // Flush all pending memory write invalidates.
+  TCW_4(__kmp_global.g.g_done, TRUE);
+  KMP_MB(); // Flush all pending memory write invalidates.
+} // __kmp_team_handler
+
+// Wrapper around signal(): install `handler` for `signum` and return the
+// previous handler. A SIG_ERR result is reported through __kmp_fatal() with
+// the current errno.
+static sig_func_t __kmp_signal(int signum, sig_func_t handler) {
+  sig_func_t previous = signal(signum, handler);
+  if (previous != SIG_ERR) {
+    return previous;
+  }
+
+  int error = errno;
+  __kmp_fatal(KMP_MSG(FunctionError, "signal"), KMP_ERR(error),
+              __kmp_msg_null);
+  return previous;
+}
+
+// Install `handler` for signal `sig`.
+// With parallel_init set, the handler is installed and kept only if the
+// handler it replaced equals the one saved in __kmp_sighldrs[sig]; otherwise
+// the replaced (user) handler is put back. Without parallel_init, the current
+// handler is merely captured into __kmp_sighldrs[sig] and left in place.
+static void __kmp_install_one_handler(int sig, sig_func_t handler,
+                                      int parallel_init) {
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+  KB_TRACE(60, ("__kmp_install_one_handler: called: sig=%d\n", sig));
+
+  if (!parallel_init) {
+    // Save initial/system signal handlers to see if user handlers installed.
+    // 2009-09-23: It is a dead code. On Windows* OS __kmp_install_signals
+    // called once with parallel_init == TRUE.
+    sig_func_t initial = __kmp_signal(sig, SIG_DFL);
+    __kmp_sighldrs[sig] = initial;
+    __kmp_signal(sig, initial);
+  } else {
+    sig_func_t replaced = __kmp_signal(sig, handler);
+    // SIG_DFL on Windows* OS is NULL or 0.
+    if (replaced == __kmp_sighldrs[sig]) {
+      __kmp_siginstalled[sig] = 1;
+    } else {
+      // Restore/keep user's handler if one previously installed.
+      __kmp_signal(sig, replaced);
+    }
+  }
+
+  KMP_MB(); /* Flush all pending memory write invalidates.  */
+} // __kmp_install_one_handler
+
+// Remove the runtime's handler for `sig` if one was installed: put back the
+// saved handler, and if the handler being replaced turns out not to be
+// __kmp_team_handler, reinstate it. The saved handler and the installed flag
+// are cleared in either case.
+static void __kmp_remove_one_handler(int sig) {
+  if (!__kmp_siginstalled[sig]) {
+    return;
+  }
+
+  KMP_MB(); // Flush all pending memory write invalidates.
+  KB_TRACE(60, ("__kmp_remove_one_handler: called: sig=%d\n", sig));
+
+  sig_func_t current = __kmp_signal(sig, __kmp_sighldrs[sig]);
+  if (current != __kmp_team_handler) {
+    KB_TRACE(10, ("__kmp_remove_one_handler: oops, not our handler, "
+                  "restoring: sig=%d\n",
+                  sig));
+    __kmp_signal(sig, current);
+  }
+
+  __kmp_sighldrs[sig] = NULL;
+  __kmp_siginstalled[sig] = 0;
+  KMP_MB(); // Flush all pending memory write invalidates.
+} // __kmp_remove_one_handler
+
+// Install __kmp_team_handler for SIGINT, SIGILL, SIGABRT, SIGFPE, SIGSEGV and
+// SIGTERM (in that order), unless signal handling is disabled through
+// __kmp_handle_signals.
+void __kmp_install_signals(int parallel_init) {
+  static const int handled_signals[] = {SIGINT, SIGILL,  SIGABRT,
+                                        SIGFPE, SIGSEGV, SIGTERM};
+
+  KB_TRACE(10, ("__kmp_install_signals: called\n"));
+  if (!__kmp_handle_signals) {
+    KB_TRACE(10, ("__kmp_install_signals: KMP_HANDLE_SIGNALS is false - "
+                  "handlers not installed\n"));
+    return;
+  }
+
+  for (size_t i = 0; i < sizeof(handled_signals) / sizeof(handled_signals[0]);
+       ++i) {
+    __kmp_install_one_handler(handled_signals[i], __kmp_team_handler,
+                              parallel_init);
+  }
+} // __kmp_install_signals
+
+// Walk every signal number from 1 up to (but excluding) NSIG and remove the
+// runtime's handler wherever one was installed.
+void __kmp_remove_signals(void) {
+  KB_TRACE(10, ("__kmp_remove_signals: called\n"));
+
+  int sig = 1;
+  while (sig < NSIG) {
+    __kmp_remove_one_handler(sig);
+    ++sig;
+  }
+} // __kmp_remove_signals
+
+#endif // KMP_HANDLE_SIGNALS
+
+/* Put the calling thread to sleep for `millis` milliseconds using a
+   non-alertable SleepEx(); a non-zero result is treated as a fatal error. */
+void __kmp_thread_sleep(int millis) {
+  if (SleepEx((DWORD)millis, FALSE) != 0) {
+    DWORD error = GetLastError();
+    __kmp_fatal(KMP_MSG(FunctionError, "SleepEx()"), KMP_ERR(error),
+                __kmp_msg_null);
+  }
+}
+
+// Determine whether the given address is mapped into the current address space.
+// Returns 0 when VirtualQuery() cannot describe the address, when the region
+// is reserved or free, or when its protection is PAGE_NOACCESS or
+// PAGE_EXECUTE; returns 1 otherwise.
+int __kmp_is_address_mapped(void *addr) {
+  MEMORY_BASIC_INFORMATION region;
+
+  // VirtualQuery() returns the number of bytes it wrote, 0 on failure. On
+  // failure `region` is left uninitialized and must not be inspected, so
+  // report the address as not mapped.
+  SIZE_T bytes_returned = VirtualQuery(addr, &region, sizeof(region));
+  if (bytes_returned == 0) {
+    return 0;
+  }
+
+  if ((region.State == MEM_RESERVE) || (region.State == MEM_FREE)) {
+    return 0;
+  }
+  if ((region.Protect == PAGE_NOACCESS) || (region.Protect == PAGE_EXECUTE)) {
+    return 0;
+  }
+  return 1;
+}
+
+// Return the raw performance-counter value. The result starts out as 0 and is
+// overwritten in place by QueryPerformanceCounter().
+kmp_uint64 __kmp_hardware_timestamp(void) {
+  kmp_uint64 stamp = 0;
+  LARGE_INTEGER *stamp_as_li = (LARGE_INTEGER *)&stamp;
+
+  QueryPerformanceCounter(stamp_as_li);
+  return stamp;
+}
+
+/* Close `tHandle` and treat a CloseHandle() failure as fatal.
+   Also called with arguments of type HANDLE, so kmp_thread_t is expected to be
+   defined as HANDLE. */
+void __kmp_free_handle(kmp_thread_t tHandle) {
+  if (CloseHandle(tHandle)) {
+    return;
+  }
+
+  DWORD error = GetLastError();
+  __kmp_fatal(KMP_MSG(CantCloseHandle), KMP_ERR(error), __kmp_msg_null);
+}
+
+// Estimate the system load as the number of threads currently in the Running
+// state across all processes except the System Idle Process (pid 0).
+//
+// max: counting stops as soon as this many running threads have been seen;
+//      a value <= 0 means "no limit".
+// Returns the (possibly capped) count, or -1 when the information cannot be
+// obtained (NtQuerySystemInformation unavailable, allocation failure, or a
+// process record that fails the consistency checks).
+// The result is cached: a call made less than __kmp_load_balance_interval
+// after the previous query returns the saved value without querying again.
+int __kmp_get_load_balance(int max) {
+  // Buffer size that was sufficient on the previous call; used as the starting
+  // size for the query below.
+  static ULONG glb_buff_size = 100 * 1024;
+
+  // Saved count of the running threads for the thread balance algorithm
+  static int glb_running_threads = 0;
+  static double glb_call_time = 0; /* Thread balance algorithm call time */
+
+  int running_threads = 0; // Number of running threads in the system.
+  NTSTATUS status = 0;
+  ULONG buff_size = 0;
+  ULONG info_size = 0;
+  void *buffer = NULL;
+  PSYSTEM_PROCESS_INFORMATION spi = NULL;
+  int first_time = 1;
+
+  double call_time = 0.0; // start, finish;
+
+  __kmp_elapsed(&call_time);
+
+  // Reuse the previous result if it is recent enough.
+  if (glb_call_time &&
+      (call_time - glb_call_time < __kmp_load_balance_interval)) {
+    running_threads = glb_running_threads;
+    goto finish;
+  }
+  glb_call_time = call_time;
+
+  // Do not spend time on running algorithm if we have a permanent error.
+  if (NtQuerySystemInformation == NULL) {
+    running_threads = -1;
+    goto finish;
+  }
+
+  if (max <= 0) {
+    max = INT_MAX;
+  }
+
+  // Query the process list, doubling the buffer until it is large enough.
+  do {
+
+    if (first_time) {
+      buff_size = glb_buff_size;
+    } else {
+      buff_size = 2 * buff_size;
+    }
+
+    // Keep the old pointer until the reallocation is known to have succeeded.
+    // KMP_INTERNAL_REALLOC is expected to follow realloc() semantics, where a
+    // failed call leaves the old block allocated; overwriting `buffer` with
+    // NULL would then leak it.
+    void *new_buffer = KMP_INTERNAL_REALLOC(buffer, buff_size);
+    if (new_buffer == NULL) {
+      running_threads = -1;
+      goto finish; // `buffer` still owns the old block and is freed there.
+    }
+    buffer = new_buffer;
+    status = NtQuerySystemInformation(SystemProcessInformation, buffer,
+                                      buff_size, &info_size);
+    first_time = 0;
+
+  } while (status == STATUS_INFO_LENGTH_MISMATCH);
+  glb_buff_size = buff_size;
+
+// Consistency check: asserts in debug builds, otherwise gives up with -1.
+#define CHECK(cond)                                                            \
+  {                                                                            \
+    KMP_DEBUG_ASSERT(cond);                                                    \
+    if (!(cond)) {                                                             \
+      running_threads = -1;                                                    \
+      goto finish;                                                             \
+    }                                                                          \
+  }
+
+  CHECK(buff_size >= info_size);
+  spi = PSYSTEM_PROCESS_INFORMATION(buffer);
+  // Walk the chain of variable-sized process records.
+  for (;;) {
+    ptrdiff_t offset = uintptr_t(spi) - uintptr_t(buffer);
+    CHECK(0 <= offset &&
+          offset + sizeof(SYSTEM_PROCESS_INFORMATION) < info_size);
+    HANDLE pid = spi->ProcessId;
+    ULONG num = spi->NumberOfThreads;
+    CHECK(num >= 1);
+    // The record size below counts one SYSTEM_THREAD inside
+    // SYSTEM_PROCESS_INFORMATION and adds the remaining (num - 1).
+    size_t spi_size =
+        sizeof(SYSTEM_PROCESS_INFORMATION) + sizeof(SYSTEM_THREAD) * (num - 1);
+    CHECK(offset + spi_size <
+          info_size); // Make sure process info record fits the buffer.
+    if (spi->NextEntryOffset != 0) {
+      CHECK(spi_size <=
+            spi->NextEntryOffset); // And do not overlap with the next record.
+    }
+    // pid == 0 corresponds to the System Idle Process. It always has running
+    // threads on all cores. So, we don't consider the running threads of this
+    // process.
+    if (pid != 0) {
+      for (ULONG i = 0; i < num; ++i) {
+        THREAD_STATE state = spi->Threads[i].State;
+        // Count only threads in the Running state; threads in any other
+        // state (including Ready) are not counted.
+        if (state == StateRunning) {
+          ++running_threads;
+          // Stop counting running threads if the number is already greater than
+          // the number of available cores
+          if (running_threads >= max) {
+            goto finish;
+          }
+        }
+      }
+    }
+    if (spi->NextEntryOffset == 0) {
+      break;
+    }
+    spi = PSYSTEM_PROCESS_INFORMATION(uintptr_t(spi) + spi->NextEntryOffset);
+  }
+
+#undef CHECK
+
+finish: // Clean up and exit.
+
+  if (buffer != NULL) {
+    KMP_INTERNAL_FREE(buffer);
+  }
+
+  glb_running_threads = running_threads;
+
+  return running_threads;
+} //__kmp_get_load_balance()
diff --git a/final/runtime/test/CMakeLists.txt b/final/runtime/test/CMakeLists.txt
new file mode 100644
index 0000000..851377f
--- /dev/null
+++ b/final/runtime/test/CMakeLists.txt
@@ -0,0 +1,40 @@
+# CMakeLists.txt file for unit testing OpenMP host runtime library.
+include(CheckFunctionExists)
+include(CheckLibraryExists)
+
+# Some tests use math functions
+# (LIBOMP_HAVE_LIBM records whether a library "m" providing sqrt was found)
+check_library_exists(m sqrt "" LIBOMP_HAVE_LIBM)
+# When using libgcc, -latomic may be needed for atomics
+# (but when using compiler-rt, the atomics will be built-in)
+# Note: we can not check for __atomic_load because clang treats it
+# as special built-in and that breaks CMake checks
+check_function_exists(__atomic_load_1 LIBOMP_HAVE_BUILTIN_ATOMIC)
+if(NOT LIBOMP_HAVE_BUILTIN_ATOMIC)
+  # Only probe libatomic when the symbol is not available without it.
+  check_library_exists(atomic __atomic_load_1 "" LIBOMP_HAVE_LIBATOMIC)
+else()
+  # not needed
+  set(LIBOMP_HAVE_LIBATOMIC 0)
+endif()
+
+# pythonize_bool(<var>)
+# Overwrites the variable named <var> with the literal True or False,
+# depending on how if() evaluates its current value. Because this is a macro,
+# the assignment takes place in the caller's scope.
+macro(pythonize_bool var)
+  if (${var})
+    set(${var} True)
+  else()
+    set(${var} False)
+  endif()
+endmacro()
+
+# Normalize these settings to True/False. Presumably they are substituted
+# into the lit.site.cfg template configured at the end of this file
+# (TODO confirm against lit.site.cfg.in).
+pythonize_bool(LIBOMP_USE_HWLOC)
+pythonize_bool(LIBOMP_OMPT_SUPPORT)
+pythonize_bool(LIBOMP_OMPT_OPTIONAL)
+pythonize_bool(LIBOMP_HAVE_LIBM)
+pythonize_bool(LIBOMP_HAVE_LIBATOMIC)
+
+add_openmp_testsuite(check-libomp "Running libomp tests" ${CMAKE_CURRENT_BINARY_DIR} DEPENDS omp)
+# Add target check-ompt, but make sure to not add the tests twice to check-openmp.
+# NOTE(review): EXCLUDE_FROM_ALL is presumably read by add_openmp_testsuite,
+# which is defined outside this file - confirm there.
+set(EXCLUDE_FROM_ALL True)
+add_openmp_testsuite(check-ompt "Running OMPT tests" ${CMAKE_CURRENT_BINARY_DIR}/ompt DEPENDS omp)
+
+# Configure the lit.site.cfg.in file
+# (@ONLY restricts substitution to @VAR@ references; ${VAR} text is left as is)
+set(AUTO_GEN_COMMENT "## Autogenerated by libomp configuration.\n# Do not edit!")
+configure_file(lit.site.cfg.in lit.site.cfg @ONLY)
diff --git a/final/runtime/test/affinity/bug-nested.c b/final/runtime/test/affinity/bug-nested.c
new file mode 100644
index 0000000..4396182
--- /dev/null
+++ b/final/runtime/test/affinity/bug-nested.c
@@ -0,0 +1,32 @@
+// RUN: %libomp-compile && env KMP_AFFINITY=compact %libomp-run
+
+#include <stdio.h>
+#include <stdint.h>
+#include <omp.h>
+#include "omp_testsuite.h"
+
+// Runs a 2-thread parallel region in which every thread opens a nested
+// 2-thread region with proc_bind(close); each innermost thread atomically
+// increments the shared counter `a`. The counter is never inspected and the
+// function always returns 1, so the test passes as long as the regions run
+// to completion.
+int test_nested_affinity_bug() {
+  int a = 0;
+  // Enable nested parallelism so the inner region can fork its own team.
+  omp_set_nested(1);
+  #pragma omp parallel num_threads(2) shared(a)
+  {
+    #pragma omp parallel num_threads(2) shared(a) proc_bind(close)
+    {
+      #pragma omp atomic
+      a++;
+    }
+  }
+  return 1;
+}
+
+// Runs the scenario REPETITIONS times; the exit status is the number of
+// rounds in which test_nested_affinity_bug() returned 0.
+int main() {
+  int failures = 0;
+  int round = 0;
+
+  while (round < REPETITIONS) {
+    if (test_nested_affinity_bug() == 0)
+      ++failures;
+    ++round;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/affinity/format/affinity_display.1.c b/final/runtime/test/affinity/format/affinity_display.1.c
new file mode 100644
index 0000000..fe357d3
--- /dev/null
+++ b/final/runtime/test/affinity/format/affinity_display.1.c
@@ -0,0 +1,92 @@
+// RUN: %libomp-compile
+// RUN: env OMP_DISPLAY_AFFINITY=TRUE OMP_NUM_THREADS=4 OMP_PLACES='{0,1},{2,3},{4,5},{6,7}' %libomp-run | %python %S/check.py -c 'CHECK' %s
+
+// Affinity Display examples
+#include <stdio.h>
+#include <stdlib.h> // also null is in <stddef.h>
+#include <stddef.h>
+#include <omp.h>
+#include <string.h>
+
+// ENVIRONMENT
+// OMP_DISPLAY_AFFINITY=TRUE
+// OMP_NUM_THREADS=4
+// OMP_PLACES='{0,1},{2,3},{4,5},{6,7}'
+
+// CHECK: num_threads=1 OMP: pid [0-9]+ tid [0-9]+ thread [0-4] bound to OS proc set \{([0-7])|(0,1)|(undefined)\}
+// CHECK: num_threads=4 Thread id [0-3] reporting in
+// CHECK: num_threads=4 OMP: pid [0-9]+ tid [0-9]+ thread [0-4] bound to OS proc set \{([0-7])|([0246],[1357])|(undefined)\}
+// CHECK: num_threads=1 Default Affinity Format is:
+// CHECK: num_threads=1 Affinity Format set to: host=%20H tid=%0.4n binds_to=%A
+// CHECK: num_threads=4 tid=[0-3] affinity:host=[a-zA-Z0-9_.-]+[ ]+tid=000[0-4][ ]+binds_to=(([0-7])|([0246],[1357])|(undefined))
+
+#define FORMAT_STORE 80
+#define BUFFER_STORE 80
+
+// Exercises the affinity display API: shows the initial thread's affinity,
+// runs one parallel region, prints the default affinity format, installs a
+// custom format, and captures every thread's affinity string into a
+// per-thread buffer that is printed afterwards. All heap buffers are released
+// before returning.
+int main(int argc, char** argv) {
+  int i, n, tid, max_req_store = 0;
+  size_t nchars;
+  char default_format[FORMAT_STORE];
+  char my_format[] = "host=%20H tid=%0.4n binds_to=%A";
+  char **buffer;
+
+  // CODE SEGMENT 1 AFFINITY DISPLAY
+  omp_display_affinity(NULL);
+
+  // OMP_DISPLAY_AFFINITY=TRUE,
+  // Affinity reported for 1 parallel region
+  #pragma omp parallel
+  {
+    printf("Thread id %d reporting in.\n", omp_get_thread_num());
+  }
+
+  // Get and Display Default Affinity Format
+  nchars = omp_get_affinity_format(default_format, (size_t)FORMAT_STORE);
+  printf("Default Affinity Format is: %s\n", default_format);
+
+  if (nchars > FORMAT_STORE) {
+    printf("Caution: Reported Format is truncated. Increase\n");
+    printf(" FORMAT_STORE by %d.\n", (int)nchars - FORMAT_STORE);
+  }
+
+  // Set Affinity Format
+  omp_set_affinity_format(my_format);
+  printf("Affinity Format set to: %s\n", my_format);
+
+  // CODE SEGMENT 3 CAPTURE AFFINITY
+  // Set up buffer for affinity of n threads
+  n = omp_get_max_threads();
+  buffer = (char **)malloc(sizeof(char *) * n);
+  for (i = 0; i < n; i++) {
+    buffer[i] = (char *)malloc(sizeof(char) * BUFFER_STORE);
+  }
+
+  // Capture Affinity using Affinity Format set above.
+  // Use critical reduction to check size of buffer areas
+  #pragma omp parallel private(tid, nchars)
+  {
+    tid = omp_get_thread_num();
+    nchars = omp_capture_affinity(buffer[tid], (size_t)BUFFER_STORE, NULL);
+    #pragma omp critical
+    {
+      if (nchars > max_req_store)
+        max_req_store = nchars;
+    }
+  }
+
+  for (i = 0; i < n; i++) {
+    printf("tid=%d affinity:%s:\n", i, buffer[i]);
+  }
+  // for 4 threads with OMP_PLACES='{0,1},{2,3},{4,5},{6,7}'
+  // host=%20H tid=%0.4n binds_to=%A
+  // host=<hostname> tid=0000 binds_to=0,1
+  // host=<hostname> tid=0001 binds_to=2,3
+  // host=<hostname> tid=0002 binds_to=4,5
+  // host=<hostname> tid=0003 binds_to=6,7
+
+  if (max_req_store > BUFFER_STORE) {
+    printf("Caution: Affinity string truncated. Increase\n");
+    printf(" BUFFER_STORE by %d\n", max_req_store - BUFFER_STORE);
+  }
+
+  // Release the per-thread capture buffers and the array that holds them;
+  // nothing is printed after this point, so the output is unaffected.
+  for (i = 0; i < n; i++) {
+    free(buffer[i]);
+  }
+  free(buffer);
+  return 0;
+}
diff --git a/final/runtime/test/affinity/format/affinity_values.c b/final/runtime/test/affinity/format/affinity_values.c
new file mode 100644
index 0000000..37ab210
--- /dev/null
+++ b/final/runtime/test/affinity/format/affinity_values.c
@@ -0,0 +1,135 @@
+// RUN: %libomp-compile
+// RUN: env OMP_PROC_BIND=close OMP_PLACES=threads %libomp-run
+// RUN: env OMP_PROC_BIND=close OMP_PLACES=cores %libomp-run
+// RUN: env OMP_PROC_BIND=close OMP_PLACES=sockets %libomp-run
+// RUN: env KMP_AFFINITY=compact %libomp-run
+// RUN: env KMP_AFFINITY=scatter %libomp-run
+// REQUIRES: affinity
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <omp.h>
+
+#define XSTR(x) #x
+#define STR(x) XSTR(x)
+
+#define streqls(s1, s2) (!strcmp(s1, s2))
+
+#define check(condition)                                                       \
+  if (!(condition)) {                                                          \
+    fprintf(stderr, "error: %s: %d: " STR(condition) "\n", __FILE__,           \
+            __LINE__);                                                         \
+    exit(1);                                                                   \
+  }
+
+#define DEBUG 0
+
+#if DEBUG
+#include <stdarg.h>
+#endif
+
+#define BUFFER_SIZE 1024
+
+char buf[BUFFER_SIZE];
+#pragma omp threadprivate(buf)
+
+// printf-style tracing helper: forwards to vprintf() and returns its result
+// when DEBUG is non-zero; otherwise prints nothing and returns 0.
+static int debug_printf(const char* format, ...) {
+#if DEBUG
+  int printed;
+  va_list ap;
+  va_start(ap, format);
+  printed = vprintf(format, ap);
+  va_end(ap);
+  return printed;
+#else
+  (void)format;
+  return 0;
+#endif
+}
+
+// When DEBUG is non-zero, prints the affinity-related environment variables.
+// A variable that is not set is shown as "(null)" instead of handing a NULL
+// pointer to the %s conversion. Does nothing when DEBUG is 0.
+static void display_affinity_environment() {
+#if DEBUG
+  static const char *const names[3] = {"OMP_PROC_BIND", "OMP_PLACES",
+                                       "KMP_AFFINITY"};
+  int k;
+  printf("Affinity Environment:\n");
+  for (k = 0; k < 3; ++k) {
+    const char *value = getenv(names[k]);
+    printf("  %s=%s\n", names[k], value != NULL ? value : "(null)");
+  }
+#endif
+}
+
+// Reads in a list of integers into ids array (not going past ids_size)
+// e.g., if affinity = "0-4,6,8-10,14,16,17-20,23"
+//       then ids = [0,1,2,3,4,6,8,9,10,14,16,17,18,19,20,23]
+//
+// affinity: comma-separated tokens, each a single integer or an inclusive
+//           "lo-hi" range.
+// ids:      output array; at most ids_size entries are written.
+// Parsing stops at the first token that is neither a number nor a complete
+// range (including an empty token such as the one after a trailing comma).
+// Entries of ids past the parsed values are left untouched. Nothing is
+// written when affinity or ids is NULL, ids_size <= 0, or the working copy
+// of the string cannot be allocated.
+void list_to_ids(const char* affinity, int* ids, int ids_size) {
+  int id, b, e, ids_index;
+  char *aff, *begin, *end, *absolute_end;
+  if (affinity == NULL || ids == NULL || ids_size <= 0)
+    return;
+  // Work on a private copy because tokens are terminated in place.
+  aff = strdup(affinity);
+  if (aff == NULL)
+    return;
+  absolute_end = aff + strlen(aff);
+  ids_index = 0;
+  begin = end = aff;
+  while (end < absolute_end) {
+    // Isolate the next token by overwriting its ',' with a terminator.
+    end = begin;
+    while (*end != '\0' && *end != ',')
+      end++;
+    *end = '\0';
+    if (strchr(begin, '-') != NULL) {
+      // Range
+      if (sscanf(begin, "%d-%d", &b, &e) != 2)
+        break;
+    } else {
+      // Single Number
+      if (sscanf(begin, "%d", &b) != 1)
+        break;
+      e = b;
+    }
+    for (id = b; id <= e; ++id) {
+      ids[ids_index++] = id;
+      if (ids_index >= ids_size) {
+        free(aff);
+        return;
+      }
+    }
+    begin = end + 1;
+  }
+  free(aff);
+}
+
+// For each spelling of the thread-affinity field ("%{thread_affinity}" and
+// "%A"), every thread of a parallel region captures its affinity string,
+// expands it with list_to_ids() and checks that it lists exactly the
+// processors of the thread's place as reported by omp_get_place_proc_ids().
+void check_thread_affinity() {
+  int i;
+  const char *formats[2] = {"%{thread_affinity}", "%A"};
+  for (i = 0; i < (int)(sizeof(formats) / sizeof(formats[0])); ++i) {
+    omp_set_affinity_format(formats[i]);
+    #pragma omp parallel
+    {
+      int j, k;
+      int place = omp_get_place_num();
+      int num_procs = omp_get_place_num_procs(place);
+      int *ids = (int *)malloc(sizeof(int) * num_procs);
+      int *ids2 = (int *)malloc(sizeof(int) * num_procs);
+      // Named differently from the file-scope threadprivate `buf` so that it
+      // does not shadow it.
+      char captured[256];
+      size_t n = omp_capture_affinity(captured, sizeof(captured), NULL);
+      // The returned length excludes the terminating null byte, so a value
+      // equal to the buffer size already means the string was truncated.
+      check(n < sizeof(captured));
+      // Pre-fill with a value that is not a processor id, so that a short or
+      // unparsable list fails the comparison below deterministically instead
+      // of comparing against uninitialized memory.
+      for (j = 0; j < num_procs; ++j)
+        ids2[j] = -1;
+      omp_get_place_proc_ids(place, ids);
+      list_to_ids(captured, ids2, num_procs);
+
+      // The ordered loop makes the threads report one at a time, in order.
+      #pragma omp for schedule(static) ordered
+      for (k = 0; k < omp_get_num_threads(); ++k) {
+        #pragma omp ordered
+        {
+          debug_printf("Thread %d: captured affinity = %s\n",
+                       omp_get_thread_num(), captured);
+          for (j = 0; j < num_procs; ++j) {
+            debug_printf("Thread %d: ids[%d] = %d ids2[%d] = %d\n",
+                         omp_get_thread_num(), j, ids[j], j, ids2[j]);
+            check(ids[j] == ids2[j]);
+          }
+        }
+      }
+
+      free(ids);
+      free(ids2);
+    }
+  }
+}
+
+// Enables nested parallelism, optionally prints the affinity environment
+// (DEBUG builds only) and runs the affinity check. A failing check() exits
+// with status 1; otherwise the program returns 0.
+int main(int argc, char** argv) {
+  omp_set_nested(1);
+  display_affinity_environment();
+  check_thread_affinity();
+  return 0;
+}
diff --git a/final/runtime/test/affinity/format/api.c b/final/runtime/test/affinity/format/api.c
new file mode 100644
index 0000000..08805e7
--- /dev/null
+++ b/final/runtime/test/affinity/format/api.c
@@ -0,0 +1,56 @@
+// RUN: %libomp-compile-and-run
+// RUN: %libomp-run | %python %S/check.py -c 'CHECK' %s
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <omp.h>
+
+#define XSTR(x) #x
+#define STR(x) XSTR(x)
+
+#define streqls(s1, s2) (!strcmp(s1, s2))
+
+#define check(condition)                                                       \
+  if (!(condition)) {                                                          \
+    fprintf(stderr, "error: %s: %d: " STR(condition) "\n", __FILE__,           \
+            __LINE__);                                                         \
+    exit(1);                                                                   \
+  }
+
+#define BUFFER_SIZE 1024
+
+int main(int argc, char** argv) {
+  char buf[BUFFER_SIZE];
+  size_t needed;
+
+  omp_set_affinity_format("0123456789");
+
+  needed = omp_get_affinity_format(buf, BUFFER_SIZE);
+  check(streqls(buf, "0123456789"));
+  check(needed == 10)
+
+  // Check that it is truncated properly
+  omp_get_affinity_format(buf, 5);
+  check(streqls(buf, "0123"));
+
+  #pragma omp parallel
+  {
+    char my_buf[512];
+    size_t needed = omp_capture_affinity(my_buf, 512, NULL);
+    check(streqls(my_buf, "0123456789"));
+    check(needed == 10);
+    // Check that it is truncated properly
+    omp_capture_affinity(my_buf, 5, NULL);
+    check(streqls(my_buf, "0123"));
+  }
+
+  #pragma omp parallel num_threads(4)
+  {
+    omp_display_affinity(NULL);
+  }
+
+  return 0;
+}
+
+// CHECK: num_threads=4 0123456789
diff --git a/final/runtime/test/affinity/format/api2.c b/final/runtime/test/affinity/format/api2.c
new file mode 100644
index 0000000..c32da93
--- /dev/null
+++ b/final/runtime/test/affinity/format/api2.c
@@ -0,0 +1,84 @@
+// RUN: %libomp-compile-and-run
+// RUN: %libomp-run | %python %S/check.py -c 'CHECK' %s
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <omp.h>
+
+#define XSTR(x) #x
+#define STR(x) XSTR(x)
+
+#define streqls(s1, s2) (!strcmp(s1, s2))
+
+#define check(condition)                                                       \
+  if (!(condition)) {                                                          \
+    fprintf(stderr, "error: %s: %d: " STR(condition) "\n", __FILE__,           \
+            __LINE__);                                                         \
+    exit(1);                                                                   \
+  }
+
+#if defined(_WIN32)
+#define snprintf _snprintf
+#endif
+
+#define BUFFER_SIZE 1024
+
+int main(int argc, char** argv) {
+  char buf[BUFFER_SIZE];
+  size_t needed, length;
+  const char* format = "tl:%L tn:%n nt:%N an:%a";
+  const char* second_format = "nesting_level:%{nesting_level} thread_num:%{thread_num} num_threads:%{num_threads} ancestor_tnum:%{ancestor_tnum}";
+
+  length = strlen(format);
+  omp_set_affinity_format(format);
+
+  needed = omp_get_affinity_format(buf, BUFFER_SIZE);
+  check(streqls(buf, format));
+  check(needed == length)
+
+  // Check that it is truncated properly
+  omp_get_affinity_format(buf, 5);
+  check(streqls(buf, "tl:%"));
+
+  #pragma omp parallel
+  {
+    char my_buf[512];
+    char supposed[512];
+    int tl, tn, nt, an;
+    size_t needed, needed2;
+    tl = omp_get_level();
+    tn = omp_get_thread_num();
+    nt = omp_get_num_threads();
+    an = omp_get_ancestor_thread_num(omp_get_level()-1);
+    needed = omp_capture_affinity(my_buf, 512, NULL);
+    needed2 = (size_t)snprintf(supposed, 512, "tl:%d tn:%d nt:%d an:%d", tl, tn, nt, an);
+    check(streqls(my_buf, supposed));
+    check(needed == needed2);
+    // Check that it is truncated properly
+    supposed[4] = '\0';
+    omp_capture_affinity(my_buf, 5, NULL);
+    check(streqls(my_buf, supposed));
+
+    needed = omp_capture_affinity(my_buf, 512, second_format);
+    needed2 = (size_t)snprintf(supposed, 512, "nesting_level:%d thread_num:%d num_threads:%d ancestor_tnum:%d", tl, tn, nt, an);
+    check(streqls(my_buf, supposed));
+    check(needed == needed2);
+
+    // Check that it is truncated properly
+    supposed[25] = '\0';
+    omp_capture_affinity(my_buf, 26, second_format);
+    check(streqls(my_buf, supposed));
+  }
+
+  #pragma omp parallel num_threads(4)
+  {
+    omp_display_affinity(NULL);
+    omp_display_affinity(second_format);
+  }
+
+  return 0;
+}
+
+// CHECK: num_threads=4 tl:[0-9]+ tn:[0-9]+ nt:[0-9]+ an:[0-9]+
+// CHECK: num_threads=4 nesting_level:[0-9]+ thread_num:[0-9]+ num_threads:[0-9]+ ancestor_tnum:[0-9]+
diff --git a/final/runtime/test/affinity/format/check.py b/final/runtime/test/affinity/format/check.py
new file mode 100644
index 0000000..0adddbd
--- /dev/null
+++ b/final/runtime/test/affinity/format/check.py
@@ -0,0 +1,73 @@
+import os
+import sys
+import argparse
+import re
+
+class Checks(object):
+    class CheckError(Exception):
+        pass
+
+    def __init__(self, filename, prefix):
+        self.checks = []
+        self.lines = []
+        self.check_no_output = False
+        self.filename = filename
+        self.prefix = prefix
+    def readStdin(self):
+        self.lines = [l.rstrip('\r\n') for l in sys.stdin.readlines()]
+    def readChecks(self):
+        with open(self.filename) as f:
+            for line in f:
+                match = re.search('{}: NO_OUTPUT'.format(self.prefix), line)
+                if match is not None:
+                    self.check_no_output = True
+                    return
+                match = re.search('{}: num_threads=([0-9]+) (.*)$'.format(self.prefix), line)
+                if match is not None:
+                    num_threads = int(match.group(1))
+                    for i in range(num_threads):
+                        self.checks.append(match.group(2))
+                    continue
+    def check(self):
+        # If no checks at all, then nothing to do
+        if len(self.checks) == 0 and not self.check_no_output:
+            print('Nothing to check for')
+            return
+        # Check if we are expecting no output
+        if self.check_no_output:
+            if len(self.lines) == 0:
+                return
+            else:
+                raise Checks.CheckError('{}: Output was found when expecting none.'.format(self.prefix))
+        # Run through each check line and see if it exists in the output
+        # If it does, then delete the line from output and look for the
+        # next check line.
+        # If you don't find the line then raise Checks.CheckError
+        # If there are extra lines of output then raise Checks.CheckError
+        for c in self.checks:
+            found = False
+            index = -1
+            for idx, line in enumerate(self.lines):
+                if re.search(c, line) is not None:
+                    found = True
+                    index = idx
+                    break
+            if not found:
+                raise Checks.CheckError('{}: Did not find: {}'.format(self.prefix, c))
+            else:
+                del self.lines[index]
+        if len(self.lines) != 0:
+            raise Checks.CheckError('{}: Extra output: {}'.format(self.prefix, self.lines))
+
+# Setup argument parsing
+parser = argparse.ArgumentParser(description='''This script checks output of
+    a program against "CHECK" lines in filename''')
+parser.add_argument('filename', default=None, help='filename to check against')
+parser.add_argument('-c', '--check-prefix', dest='prefix',
+                    default='CHECK', help='check prefix token default: %(default)s')
+command_args = parser.parse_args()
+# Do the checking
+checks = Checks(command_args.filename, command_args.prefix)
+checks.readStdin()
+checks.readChecks()
+checks.check()
diff --git a/final/runtime/test/affinity/format/fields_modifiers.c b/final/runtime/test/affinity/format/fields_modifiers.c
new file mode 100644
index 0000000..c180271
--- /dev/null
+++ b/final/runtime/test/affinity/format/fields_modifiers.c
@@ -0,0 +1,117 @@
+// RUN: %libomp-compile-and-run
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <omp.h>
+
+#define XSTR(x) #x
+#define STR(x) XSTR(x)
+
+#define streqls(s1, s2) (!strcmp(s1, s2))
+
+#define check(condition)                                                       \
+  if (!(condition)) {                                                          \
+    fprintf(stderr, "error: %s: %d: " STR(condition) "\n", __FILE__,           \
+            __LINE__);                                                         \
+    exit(1);                                                                   \
+  }
+
+#define BUFFER_SIZE 1024
+
+char buf[BUFFER_SIZE];
+#pragma omp threadprivate(buf)
+
+// Captures the calling thread's affinity string, using the current affinity
+// format, into the threadprivate `buf` and returns `buf`. The capture must
+// fit in the buffer; when check_needed is non-zero the reported length must
+// also equal it.
+char* get_string(size_t check_needed) {
+  size_t needed = omp_capture_affinity(buf, BUFFER_SIZE, NULL);
+  //printf("buf = %s\n", buf);
+  check(needed < BUFFER_SIZE);
+  if (check_needed == 0)
+    return buf;
+  check(needed == check_needed);
+  return buf;
+}
+
+// "%0.8" field: the thread number must be the last of 8 characters, with the
+// seven characters before it being '0'.
+void check_thread_num_padded_rjustified() {
+  const char* formats[2] = {"%0.8{thread_num}", "%0.8n"};
+  const int num_formats = (int)(sizeof(formats)/sizeof(formats[0]));
+  int f;
+  for (f = 0; f != num_formats; ++f) {
+    omp_set_affinity_format(formats[f]);
+    #pragma omp parallel num_threads(8)
+    {
+      int tid = omp_get_thread_num();
+      char ctid = '0' + (char)tid;
+      char* s = get_string(8);
+      int j = 0;
+      while (j < 7) {
+        check(s[j] == '0');
+        ++j;
+      }
+      // j is now 7, the index of the last character of the field.
+      check(s[j] == ctid);
+    }
+  }
+}
+
+// "%.12" field: the thread number must be the last of 12 characters, with the
+// eleven characters before it being spaces.
+void check_thread_num_rjustified() {
+  const char* formats[2] = {"%.12{thread_num}", "%.12n"};
+  const int num_formats = (int)(sizeof(formats)/sizeof(formats[0]));
+  int f;
+  for (f = 0; f != num_formats; ++f) {
+    omp_set_affinity_format(formats[f]);
+    #pragma omp parallel num_threads(8)
+    {
+      int tid = omp_get_thread_num();
+      char ctid = '0' + (char)tid;
+      char* s = get_string(12);
+      int j = 0;
+      while (j < 11) {
+        check(s[j] == ' ');
+        ++j;
+      }
+      // j is now 11, the index of the last character of the field.
+      check(s[j] == ctid);
+    }
+  }
+}
+
+// "%5" field: the thread number must be the first of 5 characters, followed
+// by four spaces.
+void check_thread_num_ljustified() {
+  const char* formats[2] = {"%5{thread_num}", "%5n"};
+  const int num_formats = (int)(sizeof(formats)/sizeof(formats[0]));
+  int f;
+  for (f = 0; f != num_formats; ++f) {
+    omp_set_affinity_format(formats[f]);
+    #pragma omp parallel num_threads(8)
+    {
+      int tid = omp_get_thread_num();
+      char ctid = '0' + (char)tid;
+      char* s = get_string(5);
+      int j = 1;
+      check(s[0] == ctid);
+      while (j < 5) {
+        check(s[j] == ' ');
+        ++j;
+      }
+    }
+  }
+}
+
+// "%018" field (leading 0 but no '.'): the thread number must be the first of
+// 18 characters, followed by seventeen spaces.
+void check_thread_num_padded_ljustified() {
+  const char* formats[2] = {"%018{thread_num}", "%018n"};
+  const int num_formats = (int)(sizeof(formats)/sizeof(formats[0]));
+  int f;
+  for (f = 0; f != num_formats; ++f) {
+    omp_set_affinity_format(formats[f]);
+    #pragma omp parallel num_threads(8)
+    {
+      int tid = omp_get_thread_num();
+      char ctid = '0' + (char)tid;
+      char* s = get_string(18);
+      int j = 1;
+      check(s[0] == ctid);
+      while (j < 18) {
+        check(s[j] == ' ');
+        ++j;
+      }
+    }
+  }
+}
+
+// Runs the four width/justification checks for the thread_num field. A
+// failing check() exits with status 1; otherwise the program returns 0.
+int main(int argc, char** argv) {
+  check_thread_num_ljustified();
+  check_thread_num_rjustified();
+  check_thread_num_padded_ljustified();
+  check_thread_num_padded_rjustified();
+  return 0;
+}
diff --git a/final/runtime/test/affinity/format/fields_values.c b/final/runtime/test/affinity/format/fields_values.c
new file mode 100644
index 0000000..e56ce27
--- /dev/null
+++ b/final/runtime/test/affinity/format/fields_values.c
@@ -0,0 +1,152 @@
+// RUN: %libomp-compile-and-run
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <omp.h>
+
+#define XSTR(x) #x
+#define STR(x) XSTR(x)
+
+#define streqls(s1, s2) (!strcmp(s1, s2))
+
+#define check(condition)                                                       \
+  if (!(condition)) {                                                          \
+    fprintf(stderr, "error: %s: %d: " STR(condition) "\n", __FILE__,           \
+            __LINE__);                                                         \
+    exit(1);                                                                   \
+  }
+
+// Platform shims so the checks below can use one spelling on every OS.
+#if defined(_WIN32)
+#include <windows.h>
+#define getpid _getpid
+typedef int pid_t;
+// gettid is only referenced by the commented-out native-thread-id check.
+#define gettid GetCurrentThreadId
+// sz must be an lvalue here because its address is taken.
+// NOTE(review): GetComputerNameA is documented to take a DWORD pointer while
+// the caller in this file passes an int variable - confirm this is intended.
+#define my_gethostname(buf, sz) GetComputerNameA(buf, &(sz))
+#else
+#include <unistd.h>
+#include <sys/types.h>
+#define my_gethostname(buf, sz) gethostname(buf, sz)
+#endif
+
+#define BUFFER_SIZE 256
+
+// Captures the calling thread's affinity string with the current affinity
+// format and returns the integer at its start. Exits through check() when
+// the capture does not fit in the buffer or no integer can be parsed.
+int get_integer() {
+  char buf[BUFFER_SIZE];
+  int retval;
+  size_t needed = omp_capture_affinity(buf, BUFFER_SIZE, NULL);
+  check(needed < BUFFER_SIZE);
+  int n = sscanf(buf, "%d", &retval);
+  check(n == 1);
+  return retval;
+}
+
+// Captures the calling thread's affinity string with the current affinity
+// format and returns a heap-allocated copy of it; the caller must free() the
+// result. Exits through check() when the capture does not fit in the buffer
+// or the copy cannot be allocated.
+char* get_string() {
+  char buf[BUFFER_SIZE];
+  char* copy;
+  size_t needed = omp_capture_affinity(buf, BUFFER_SIZE, NULL);
+  check(needed < BUFFER_SIZE);
+  copy = strdup(buf);
+  // Callers pass the result straight to strcmp(), so never hand back NULL.
+  check(copy != NULL);
+  return copy;
+}
+
+// For both format spellings, checks that the integer captured through the
+// affinity format equals func(): in an 8-thread region, inside a nested
+// 3-thread region, and again in the outer region after the nested one ends.
+void check_integer(const char* formats[2], int(*func)()) {
+  int fmt = 0;
+  while (fmt < 2) {
+    omp_set_affinity_format(formats[fmt]);
+    #pragma omp parallel num_threads(8)
+    {
+      check(get_integer() == func());
+      #pragma omp parallel num_threads(3)
+      {
+        check(get_integer() == func());
+      }
+      check(get_integer() == func());
+    }
+    ++fmt;
+  }
+}
+
+// Verifies the nesting-level field against omp_get_level() via check_integer().
+void check_nesting_level() {
+  // Check %{nesting_level} and %L
+  const char* formats[2] = {"%{nesting_level}", "%L"};
+  check_integer(formats, omp_get_level);
+}
+
+// Verifies the thread-number field against omp_get_thread_num() via
+// check_integer().
+// NOTE(review): main() in this file does not call this function - confirm
+// whether that is intentional.
+void check_thread_num() {
+  // Check %{thread_num} and %n
+  const char* formats[2] = {"%{thread_num}", "%n"};
+  check_integer(formats, omp_get_thread_num);
+}
+
+// Verifies the team-size field against omp_get_num_threads() via
+// check_integer().
+void check_num_threads() {
+  // Check %{num_threads} and %N
+  const char* formats[2] = {"%{num_threads}", "%N"};
+  check_integer(formats, omp_get_num_threads);
+}
+
+// Thread number of the calling thread's ancestor one nesting level above the
+// current level; the argument-less shape matches what check_integer() expects.
+int ancestor_helper() {
+  return omp_get_ancestor_thread_num(omp_get_level() - 1);
+}
+// Verifies the ancestor-thread-number field against ancestor_helper().
+void check_ancestor_tnum() {
+  // Check %{ancestor_tnum} and %a
+  const char* formats[2] = {"%{ancestor_tnum}", "%a"};
+  check_integer(formats, ancestor_helper);
+}
+
+// getpid() converted to int so it fits the int(*)() callback used by
+// check_integer().
+int my_get_pid() { return (int)getpid(); }
+// Verifies the process-id field against my_get_pid().
+void check_process_id() {
+  // Check %{process_id} and %P
+  const char* formats[2] = {"%{process_id}", "%P"};
+  check_integer(formats, my_get_pid);
+}
+
+/*
+int my_get_tid() { return (int)gettid(); }
+void check_native_thread_id() {
+  // Check %{native_thread_id} and %i
+  const char* formats[2] = {"%{native_thread_id}", "%i"};
+  check_integer(formats, my_get_tid);
+}
+*/
+
+// Verifies that both spellings of the host field yield the host name obtained
+// through my_gethostname(), on each thread of an 8-thread region.
+void check_host() {
+  const char* formats[2] = {"%{host}", "%H"};
+  char hostname[256];
+  // Kept in a variable because the Windows shim takes its address.
+  int buffer_size = 256;
+  int fmt = 0;
+  my_gethostname(hostname, buffer_size);
+  while (fmt < 2) {
+    omp_set_affinity_format(formats[fmt]);
+    #pragma omp parallel num_threads(8)
+    {
+      char* host = get_string();
+      check(streqls(host, hostname));
+      free(host);
+    }
+    ++fmt;
+  }
+}
+
+// Verifies that unknown field names, long ("%{foobar}") and short ("%X"),
+// are rendered as the literal text "undefined" on every thread.
+void check_undefined() {
+  const char* formats[2] = {"%{foobar}", "%X"};
+  int fmt = 0;
+  while (fmt < 2) {
+    omp_set_affinity_format(formats[fmt]);
+    #pragma omp parallel num_threads(8)
+    {
+      char* undef = get_string();
+      check(streqls(undef, "undefined"));
+      free(undef);
+    }
+    ++fmt;
+  }
+}
+
+// Enables nested parallelism (check_integer() uses nested regions) and runs
+// the field-value checks. A failing check() exits with status 1.
+// NOTE(review): check_thread_num() is defined in this file but is not called
+// here - confirm whether it was left out on purpose.
+int main(int argc, char** argv) {
+  omp_set_nested(1);
+  check_nesting_level();
+  check_num_threads();
+  check_ancestor_tnum();
+  check_process_id();
+  //check_native_thread_id();
+  check_host();
+  check_undefined();
+  return 0;
+}
diff --git a/final/runtime/test/affinity/format/increase.c b/final/runtime/test/affinity/format/increase.c
new file mode 100644
index 0000000..b3942db
--- /dev/null
+++ b/final/runtime/test/affinity/format/increase.c
@@ -0,0 +1,36 @@
+// RUN: %libomp-compile && env OMP_DISPLAY_AFFINITY=true %libomp-run | %python %S/check.py -c 'CHECK' %s
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+// Runs five empty parallel regions with team sizes 4, 8, 6, 9 and 2 after
+// installing a custom affinity format. The program prints nothing itself; the
+// RUN line sets OMP_DISPLAY_AFFINITY=true and pipes the output to check.py,
+// which matches it against the CHECK lines at the end of this file.
+int main(int argc, char** argv) {
+  omp_set_affinity_format("TESTER: tl:%L tn:%n nt:%N");
+  // should print all for first parallel
+  omp_set_num_threads(4);
+  #pragma omp parallel
+  { }
+  // should print all because of new threads
+  omp_set_num_threads(8);
+  #pragma omp parallel
+  { }
+  // the CHECK lines expect all 6 threads to print here as well
+  // (presumably because the team size changed)
+  omp_set_num_threads(6);
+  #pragma omp parallel
+  { }
+  // should print all because of new thread
+  omp_set_num_threads(9);
+  #pragma omp parallel
+  { }
+  // the CHECK lines expect both threads to print here as well
+  // (presumably because the team size changed)
+  omp_set_num_threads(2);
+  #pragma omp parallel
+  { }
+  return 0;
+}
+
+// CHECK: num_threads=4 TESTER: tl:1 tn:[0-3] nt:4
+// CHECK: num_threads=8 TESTER: tl:1 tn:[0-7] nt:8
+// CHECK: num_threads=6 TESTER: tl:1 tn:[0-5] nt:6
+// CHECK: num_threads=9 TESTER: tl:1 tn:[0-8] nt:9
+// CHECK: num_threads=2 TESTER: tl:1 tn:[01] nt:2
diff --git a/final/runtime/test/affinity/format/lit.local.cfg b/final/runtime/test/affinity/format/lit.local.cfg
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/final/runtime/test/affinity/format/lit.local.cfg
diff --git a/final/runtime/test/affinity/format/nested.c b/final/runtime/test/affinity/format/nested.c
new file mode 100644
index 0000000..2ecc918
--- /dev/null
+++ b/final/runtime/test/affinity/format/nested.c
@@ -0,0 +1,23 @@
+// RUN: %libomp-compile && env OMP_DISPLAY_AFFINITY=true OMP_PLACES=threads OMP_PROC_BIND=spread,close %libomp-run | %python %S/check.py -c 'CHECK' %s
+// REQUIRES: affinity
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+// Installs a custom affinity format, enables nested parallelism and runs a
+// 4-thread region in which every thread opens an empty nested 3-thread
+// region. The program prints nothing itself; the RUN line sets
+// OMP_DISPLAY_AFFINITY=true and pipes the output to check.py, which matches
+// it against the CHECK lines at the end of this file (one outer team of 4
+// and four inner teams of 3).
+int main(int argc, char** argv) {
+  omp_set_affinity_format("TESTER: tl:%L at:%a tn:%n nt:%N");
+  omp_set_nested(1);
+  #pragma omp parallel num_threads(4)
+  {
+    #pragma omp parallel num_threads(3)
+    { }
+  }
+  return 0;
+}
+
+// CHECK: num_threads=4 TESTER: tl:1 at:0 tn:[0-3] nt:4
+// CHECK: num_threads=3 TESTER: tl:2 at:[0-3] tn:[0-2] nt:3
+// CHECK: num_threads=3 TESTER: tl:2 at:[0-3] tn:[0-2] nt:3
+// CHECK: num_threads=3 TESTER: tl:2 at:[0-3] tn:[0-2] nt:3
+// CHECK: num_threads=3 TESTER: tl:2 at:[0-3] tn:[0-2] nt:3
diff --git a/final/runtime/test/affinity/format/nested2.c b/final/runtime/test/affinity/format/nested2.c
new file mode 100644
index 0000000..4b54912
--- /dev/null
+++ b/final/runtime/test/affinity/format/nested2.c
@@ -0,0 +1,29 @@
+// RUN: %libomp-compile && env OMP_DISPLAY_AFFINITY=true OMP_PLACES=threads OMP_PROC_BIND=spread,close KMP_HOT_TEAMS_MAX_LEVEL=2 %libomp-run | %python %S/check.py -c 'CHECK' %s
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+// Currently, KMP_HOT_TEAMS_MAX_LEVEL has to be equal to the
+// nest depth for intuitive behavior
+int main(int argc, char** argv) { // the same nested shape is run twice, then one top-level region
+  omp_set_affinity_format("TESTER: tl:%L tn:%n nt:%N");
+  omp_set_nested(1); // enable nested parallelism
+  #pragma omp parallel num_threads(4)
+  {
+    #pragma omp parallel num_threads(3)
+    { } // first inner region; the CHECK list has four tl:2 nt:3 patterns
+    #pragma omp parallel num_threads(3)
+    { } // identical inner region again; the CHECK list has no extra tl:2 patterns for it
+  }
+  #pragma omp parallel num_threads(4)
+  { } // NOTE(review): presumably what the trailing tl:1 nt:4 CHECK pattern matches -- confirm
+  return 0;
+}
+
+// CHECK: num_threads=4 TESTER: tl:1 tn:[0-3] nt:4
+// CHECK: num_threads=3 TESTER: tl:2 tn:[0-2] nt:3
+// CHECK: num_threads=3 TESTER: tl:2 tn:[0-2] nt:3
+// CHECK: num_threads=3 TESTER: tl:2 tn:[0-2] nt:3
+// CHECK: num_threads=3 TESTER: tl:2 tn:[0-2] nt:3
+// CHECK: num_threads=4 TESTER: tl:1 tn:[0-3] nt:4
diff --git a/final/runtime/test/affinity/format/nested_mixed.c b/final/runtime/test/affinity/format/nested_mixed.c
new file mode 100644
index 0000000..1e4c753
--- /dev/null
+++ b/final/runtime/test/affinity/format/nested_mixed.c
@@ -0,0 +1,46 @@
+// RUN: %libomp-compile && env OMP_DISPLAY_AFFINITY=true %libomp-run | %python %S/check.py -c 'CHECK' %s
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+int main(int argc, char** argv) { // mixes 1-thread and 2-thread regions, nested up to four levels
+  omp_set_affinity_format("TESTER: tl:%L at:%a tn:%n nt:%N");
+  omp_set_nested(1); // enable nested parallelism
+  #pragma omp parallel num_threads(1)
+  { // level 1, one thread
+    #pragma omp parallel num_threads(2)
+    { } // level 2, two threads
+    #pragma omp parallel num_threads(2)
+    { // level 2 again, two threads
+      #pragma omp parallel num_threads(1)
+      { // level 3, one thread, entered by each level-2 thread
+        #pragma omp parallel num_threads(2)
+        { } // level 4, two threads
+      }
+    }
+    #pragma omp parallel num_threads(1)
+    { } // level 2, one thread
+  }
+  #pragma omp parallel num_threads(2)
+  { } // back at level 1, two threads
+  #pragma omp parallel num_threads(1)
+  { } // level 1, one thread
+  return 0;
+}
+
+// CHECK: num_threads=1 TESTER: tl:1 at:0 tn:0 nt:1
+
+// CHECK: num_threads=2 TESTER: tl:2 at:[0-9] tn:[01] nt:2
+
+// CHECK: num_threads=1 TESTER: tl:3 at:[0-9] tn:0 nt:1
+// CHECK: num_threads=1 TESTER: tl:3 at:[0-9] tn:0 nt:1
+
+// CHECK: num_threads=2 TESTER: tl:4 at:[0-9] tn:[01] nt:2
+// CHECK: num_threads=2 TESTER: tl:4 at:[0-9] tn:[01] nt:2
+
+// CHECK: num_threads=1 TESTER: tl:2 at:[0-9] tn:0 nt:1
+
+// CHECK: num_threads=2 TESTER: tl:1 at:[0-9] tn:[01] nt:2
+
+// CHECK: num_threads=1 TESTER: tl:1 at:[0-9] tn:0 nt:1
diff --git a/final/runtime/test/affinity/format/nested_serial.c b/final/runtime/test/affinity/format/nested_serial.c
new file mode 100644
index 0000000..8b84ba6
--- /dev/null
+++ b/final/runtime/test/affinity/format/nested_serial.c
@@ -0,0 +1,35 @@
+// RUN: %libomp-compile && env OMP_DISPLAY_AFFINITY=true %libomp-run | %python %S/check.py -c 'CHECK' %s
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+int main(int argc, char** argv) { // every region requests exactly one thread, nested up to three levels
+  omp_set_affinity_format("TESTER: tl:%L at:%a tn:%n nt:%N");
+  omp_set_nested(1); // enable nested parallelism
+  #pragma omp parallel num_threads(1)
+  { // level 1
+    #pragma omp parallel num_threads(1)
+    { } // level 2
+    #pragma omp parallel num_threads(1)
+    { } // level 2, repeated
+    #pragma omp parallel num_threads(1)
+    { // level 2, this time with a child region
+      #pragma omp parallel num_threads(1)
+      { } // level 3
+    }
+    #pragma omp parallel num_threads(1)
+    { } // level 2 once more, after the deeper nest
+  }
+  #pragma omp parallel num_threads(1)
+  { } // level 1 again
+  #pragma omp parallel num_threads(1)
+  { } // level 1, repeated
+  return 0;
+}
+
+// CHECK: num_threads=1 TESTER: tl:1 at:0 tn:0 nt:1
+// CHECK: num_threads=1 TESTER: tl:2 at:0 tn:0 nt:1
+// CHECK: num_threads=1 TESTER: tl:3 at:0 tn:0 nt:1
+// CHECK: num_threads=1 TESTER: tl:2 at:0 tn:0 nt:1
+// CHECK: num_threads=1 TESTER: tl:1 at:0 tn:0 nt:1
diff --git a/final/runtime/test/affinity/format/proc_bind.c b/final/runtime/test/affinity/format/proc_bind.c
new file mode 100644
index 0000000..765c3ce
--- /dev/null
+++ b/final/runtime/test/affinity/format/proc_bind.c
@@ -0,0 +1,31 @@
+// RUN: %libomp-compile && env OMP_DISPLAY_AFFINITY=true OMP_PLACES='{0},{0,1},{0},{0,1},{0},{0,1},{0},{0,1},{0},{0,1},{0}' %libomp-run | %python %S/check.py -c 'CHECK' %s
+// REQUIRES: affinity
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+int main(int argc, char** argv) { // two regions per proc_bind policy: spread, close, master
+  omp_set_affinity_format("TESTER: tl:%L tn:%n nt:%N aff:{%A}");
+  omp_set_num_threads(8); // every region below uses this team size request
+  // Initial parallel
+  #pragma omp parallel proc_bind(spread)
+  { }
+  #pragma omp parallel proc_bind(spread)
+  { } // same policy as the region before it
+  // Affinity changes here
+  #pragma omp parallel proc_bind(close)
+  { }
+  #pragma omp parallel proc_bind(close)
+  { } // same policy as the region before it
+  // Affinity changes here
+  #pragma omp parallel proc_bind(master)
+  { }
+  #pragma omp parallel proc_bind(master)
+  { } // NOTE(review): the CHECK list has three nt:8 patterns, presumably one per policy -- confirm
+  return 0;
+}
+
+// CHECK: num_threads=8 TESTER: tl:1 tn:[0-7] nt:8 aff:
+// CHECK: num_threads=8 TESTER: tl:1 tn:[0-7] nt:8 aff:
+// CHECK: num_threads=8 TESTER: tl:1 tn:[0-7] nt:8 aff:
diff --git a/final/runtime/test/affinity/format/simple.c b/final/runtime/test/affinity/format/simple.c
new file mode 100644
index 0000000..701c207
--- /dev/null
+++ b/final/runtime/test/affinity/format/simple.c
@@ -0,0 +1,27 @@
+// RUN: %libomp-compile
+// RUN: env OMP_DISPLAY_AFFINITY=false %libomp-run | %python %S/check.py -c 'NOTHING' %s
+// RUN: env OMP_DISPLAY_AFFINITY=true OMP_NUM_THREADS=1 %libomp-run | %python %S/check.py -c 'CHECK' %s
+// RUN: env OMP_DISPLAY_AFFINITY=true OMP_NUM_THREADS=2 %libomp-run | %python %S/check.py -c 'CHECK-2' %s
+// RUN: env OMP_DISPLAY_AFFINITY=true OMP_NUM_THREADS=3 %libomp-run | %python %S/check.py -c 'CHECK-3' %s
+// RUN: env OMP_DISPLAY_AFFINITY=true OMP_NUM_THREADS=4 %libomp-run | %python %S/check.py -c 'CHECK-4' %s
+// RUN: env OMP_DISPLAY_AFFINITY=true OMP_NUM_THREADS=8 %libomp-run | %python %S/check.py -c 'CHECK-8' %s
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+int main(int argc, char** argv) { // two default-sized regions; the RUN lines vary OMP_NUM_THREADS
+  omp_set_affinity_format("TESTER: tl:%L tn:%n nt:%N");
+  #pragma omp parallel
+  { }
+  #pragma omp parallel
+  { } // NOTE(review): each CHECK-N prefix lists a single pattern, so this repeat presumably adds no output -- confirm
+  return 0;
+}
+
+// NOTHING: NO_OUTPUT
+// CHECK: num_threads=1 TESTER: tl:1 tn:0 nt:1
+// CHECK-2: num_threads=2 TESTER: tl:1 tn:[01] nt:2
+// CHECK-3: num_threads=3 TESTER: tl:1 tn:[0-2] nt:3
+// CHECK-4: num_threads=4 TESTER: tl:1 tn:[0-3] nt:4
+// CHECK-8: num_threads=8 TESTER: tl:1 tn:[0-7] nt:8
diff --git a/final/runtime/test/affinity/format/simple_env.c b/final/runtime/test/affinity/format/simple_env.c
new file mode 100644
index 0000000..ad0a265
--- /dev/null
+++ b/final/runtime/test/affinity/format/simple_env.c
@@ -0,0 +1,16 @@
+// RUN: %libomp-compile
+// RUN: env OMP_DISPLAY_AFFINITY=true OMP_AFFINITY_FORMAT='TESTER-ENV: tl:%L tn:%n nt:%N' OMP_NUM_THREADS=8 %libomp-run | %python %S/check.py -c 'CHECK-8' %s
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+int main(int argc, char** argv) { // no format call here: the RUN line supplies OMP_AFFINITY_FORMAT
+  #pragma omp parallel
+  { }
+  #pragma omp parallel
+  { } // second, identical region
+  return 0;
+}
+
+// CHECK-8: num_threads=8 TESTER-ENV: tl:1 tn:[0-7] nt:8
diff --git a/final/runtime/test/api/has_openmp.c b/final/runtime/test/api/has_openmp.c
new file mode 100644
index 0000000..da95f59
--- /dev/null
+++ b/final/runtime/test/api/has_openmp.c
@@ -0,0 +1,23 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+
+int test_has_openmp()
+{ /* 1 when the compiler defines _OPENMP, otherwise 0 */
+#if defined(_OPENMP)
+  return 1;
+#else
+  return 0;
+#endif
+}
+
+int main()
+{
+  /* exit status is the number of failed checks: 0 or 1 for this test */
+  int num_failed=0;
+  if(!test_has_openmp()) {
+    num_failed++;
+  }
+  return num_failed;
+}
diff --git a/final/runtime/test/api/kmp_aligned_malloc.c b/final/runtime/test/api/kmp_aligned_malloc.c
new file mode 100644
index 0000000..5302fec
--- /dev/null
+++ b/final/runtime/test/api/kmp_aligned_malloc.c
@@ -0,0 +1,62 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <stdint.h>
+#include <omp.h>
+#include "omp_testsuite.h"
+
+int alignments[] = {64, 128, 256, 512, 1024, 2048, 4096};
+
+unsigned aligned_by(uint64_t addr) {
+    /* Largest power of two dividing addr. Returns 0 for addr == 0, where the
+       bit-by-bit search used previously never terminated. */
+    if (addr == 0)
+        return 0;
+    return (unsigned)(addr & (~addr + 1)); /* isolate the lowest set bit */
+}
+
+int test_kmp_aligned_malloc()
+{
+  int err = 0;
+  #pragma omp parallel shared(err)
+  {
+    size_t i;
+    int* ptr;
+    uint64_t addr;
+    int tid = omp_get_thread_num();
+    // every thread checks each entry of alignments[]
+    for(i = 0; i < sizeof(alignments)/sizeof(alignments[0]); i++) {
+      int alignment = alignments[i];
+      // allocate 64 bytes with 64-byte alignment
+      // allocate 128 bytes with 128-byte alignment, etc.
+      ptr = (int*)kmp_aligned_malloc(alignment, alignment);
+      addr = (uint64_t)(uintptr_t)ptr;
+      if(addr & (alignment-1)) {
+        printf("thread %d: addr = %p (aligned to %u bytes) but expected "
+               " alignment = %d\n", tid, (void*)ptr, aligned_by(addr), alignment);
+        err = 1;
+      }
+      kmp_free(ptr);
+    }
+    // an alignment that is not a power of two must be rejected with NULL
+    ptr = kmp_aligned_malloc(128, 127);
+    if (ptr != NULL) {
+      printf("thread %d: kmp_aligned_malloc() didn't return NULL when "
+             "alignment was not power of 2\n", tid);
+      kmp_free(ptr); err = 1; // release the unexpected block instead of leaking it
+    }
+  } /* end of parallel */
+  return !err; // 1 = every thread saw correct behavior
+}
+
+int main()
+{
+  /* exit status = number of repetitions in which the check failed */
+  int failures = 0;
+  int rep = 0;
+  while (rep < REPETITIONS) {
+    if (test_kmp_aligned_malloc() == 0)
+      failures++;
+    rep++;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/api/kmp_set_defaults_lock_bug.c b/final/runtime/test/api/kmp_set_defaults_lock_bug.c
new file mode 100644
index 0000000..73a7afb
--- /dev/null
+++ b/final/runtime/test/api/kmp_set_defaults_lock_bug.c
@@ -0,0 +1,53 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+/* The bug occurs if the lock table is reallocated after
+   kmp_set_defaults() is called.  If the table is reallocated,
+   then the lock will not point to a valid lock object after the
+   kmp_set_defaults() call.*/
+omp_lock_t lock;
+
+int test_kmp_set_defaults_lock_bug()
+{
+  /* counts the threads of two parallel regions under the global lock, with a
+     kmp_set_defaults() call in between that the lock must survive */
+  int nthreads_lib;
+  int nthreads = 0;
+
+  nthreads_lib = -1;
+
+  #pragma omp parallel
+  {
+    omp_set_lock(&lock);
+    nthreads++;
+    omp_unset_lock(&lock);
+    #pragma omp single
+    {
+      nthreads_lib = omp_get_num_threads ();
+    }  /* end of single */
+  } /* end of parallel */
+  kmp_set_defaults("OMP_NUM_THREADS"); // NOTE(review): presumably the call that can reallocate the lock table (see file header) -- confirm
+  #pragma omp parallel
+  {
+    omp_set_lock(&lock);
+    nthreads++;
+    omp_unset_lock(&lock);
+  } /* end of parallel */
+
+  return (nthreads == 2*nthreads_lib); // total must be twice the first region's team size
+}
+
+int main()
+{
+  /* the shared lock lives for the whole run; exit status = failed repetitions */
+  int failures = 0;
+  int rep;
+
+  omp_init_lock(&lock);
+  for (rep = 0; rep < REPETITIONS; ++rep) {
+    if (test_kmp_set_defaults_lock_bug() == 0)
+      failures++;
+  }
+  omp_destroy_lock(&lock);
+  return failures;
+}
diff --git a/final/runtime/test/api/omp_alloc_def_fb.c b/final/runtime/test/api/omp_alloc_def_fb.c
new file mode 100644
index 0000000..3795f09
--- /dev/null
+++ b/final/runtime/test/api/omp_alloc_def_fb.c
@@ -0,0 +1,32 @@
+// RUN: %libomp-compile-and-run
+
+#include <stdio.h>
+#include <omp.h>
+
+int main() {
+  omp_alloctrait_t at[2];
+  omp_allocator_handle_t a;
+  void *p[2] = {NULL, NULL}; // a slot no thread writes now reads as NULL, not garbage
+  at[0].key = OMP_ATK_POOL_SIZE;
+  at[0].value = 2 * 1024 * 1024; // 2 MiB pool
+  at[1].key = OMP_ATK_FALLBACK;
+  at[1].value = OMP_ATV_DEFAULT_MEM_FB; // default-memory fallback trait
+  a = omp_init_allocator(omp_large_cap_mem_space, 2, at);
+  printf("allocator large created: %p\n", a);
+  #pragma omp parallel num_threads(2)
+  {
+    int i = omp_get_thread_num();
+    p[i] = omp_alloc(1024 * 1024, a); // each thread asks for 1 MiB, half the pool size
+    #pragma omp barrier
+    printf("th %d, ptr %p\n", i, p[i]);
+    omp_free(p[i], a); // after the barrier, so both requests were made before any free
+  }
+  // Both pointers should be non-NULL
+  if (p[0] != NULL && p[1] != NULL) {
+    printf("passed\n");
+    return 0;
+  } else {
+    printf("failed: pointers %p %p\n", p[0], p[1]);
+    return 1;
+  }
+}
diff --git a/final/runtime/test/api/omp_alloc_hbw.c b/final/runtime/test/api/omp_alloc_hbw.c
new file mode 100644
index 0000000..e944548
--- /dev/null
+++ b/final/runtime/test/api/omp_alloc_hbw.c
@@ -0,0 +1,45 @@
+// RUN: %libomp-compile-and-run
+
+#include <stdio.h>
+#include <omp.h>
+
+int main() {
+  omp_alloctrait_t at[2];
+  omp_allocator_handle_t a;
+  void *p[2]; // NOTE(review): uninitialized; both slots are written only if the region below really runs 2 threads
+  at[0].key = OMP_ATK_POOL_SIZE;
+  at[0].value = 2 * 1024 * 1024; // 2 MiB pool
+  at[1].key = OMP_ATK_FALLBACK;
+  at[1].value = OMP_ATV_NULL_FB; // NULL fallback trait, see the pass condition below
+  a = omp_init_allocator(omp_high_bw_mem_space, 2, at);
+  printf("allocator hbw created: %p\n", a);
+  #pragma omp parallel num_threads(2)
+  {
+    int i = omp_get_thread_num();
+    p[i] = omp_alloc(1024 * 1024, a); // each thread asks for 1 MiB, half the pool size
+    #pragma omp barrier
+    printf("th %d, ptr %p\n", i, p[i]);
+    omp_free(p[i], a); // after the barrier, so both requests were made before any free
+  }
+  if (a != omp_null_allocator) {
+    // As an allocator has some small memory overhead
+    // exactly one of the two pointers should be NULL
+    // because of NULL fallback requested
+    if ((p[0] == NULL && p[1] != NULL) || (p[0] != NULL && p[1] == NULL)) {
+      printf("passed\n");
+      return 0;
+    } else {
+      printf("failed: pointers %p %p\n", p[0], p[1]);
+      return 1;
+    }
+  } else { // allocator creation returned omp_null_allocator
+    // NULL allocator should cause default allocations
+    if (p[0] != NULL && p[1] != NULL) {
+      printf("passed\n");
+      return 0;
+    } else {
+      printf("failed: pointers %p %p\n", p[0], p[1]);
+      return 1;
+    }
+  }
+}
diff --git a/final/runtime/test/api/omp_alloc_null_fb.c b/final/runtime/test/api/omp_alloc_null_fb.c
new file mode 100644
index 0000000..9528c46
--- /dev/null
+++ b/final/runtime/test/api/omp_alloc_null_fb.c
@@ -0,0 +1,35 @@
+// RUN: %libomp-compile-and-run
+
+#include <stdio.h>
+#include <omp.h>
+
+int main() {
+  omp_alloctrait_t at[2];
+  omp_allocator_handle_t a;
+  void *p[2]; // NOTE(review): uninitialized; both slots are written only if the region below really runs 2 threads
+  at[0].key = OMP_ATK_POOL_SIZE;
+  at[0].value = 2 * 1024 * 1024; // 2 MiB pool
+  at[1].key = OMP_ATK_FALLBACK;
+  at[1].value = OMP_ATV_NULL_FB; // NULL fallback trait, see the pass condition below
+  a = omp_init_allocator(omp_large_cap_mem_space, 2, at);
+  printf("allocator large created: %p\n", a);
+  #pragma omp parallel num_threads(2)
+  {
+    int i = omp_get_thread_num();
+    #pragma omp barrier
+    p[i] = omp_alloc(1024 * 1024, a); // NOTE(review): the barrier above presumably lines both requests up -- confirm
+    #pragma omp barrier
+    printf("th %d, ptr %p\n", i, p[i]);
+    omp_free(p[i], a); // after the second barrier, so both requests were made before any free
+  }
+  // As an allocator has some small memory overhead
+  // exactly one of the two pointers should be NULL
+  // because of NULL fallback requested
+  if ((p[0] == NULL && p[1] != NULL) || (p[0] != NULL && p[1] == NULL)) {
+    printf("passed\n");
+    return 0;
+  } else {
+    printf("failed: pointers %p %p\n", p[0], p[1]);
+    return 1;
+  }
+}
diff --git a/final/runtime/test/api/omp_get_num_devices.c b/final/runtime/test/api/omp_get_num_devices.c
new file mode 100644
index 0000000..d534fa3
--- /dev/null
+++ b/final/runtime/test/api/omp_get_num_devices.c
@@ -0,0 +1,24 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int test_omp_get_num_devices()
+{
+  /* passes only when omp_get_num_devices() reports zero devices */
+  if (omp_get_num_devices() != 0)
+    return 0;
+  return 1;
+}
+
+int main()
+{
+  /* exit status = number of repetitions in which the check failed */
+  int failures = 0;
+  int rep = 0;
+  while (rep < REPETITIONS) {
+    if (test_omp_get_num_devices() == 0)
+      failures++;
+    rep++;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/api/omp_get_num_threads.c b/final/runtime/test/api/omp_get_num_threads.c
new file mode 100644
index 0000000..daf286d
--- /dev/null
+++ b/final/runtime/test/api/omp_get_num_threads.c
@@ -0,0 +1,39 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int test_omp_get_num_threads()
+{
+  /* Each team member bumps a counter inside a critical section while one
+     thread records omp_get_num_threads(); the two values must agree. */
+  int reported = -1;
+  int counted = 0;
+
+  #pragma omp parallel
+  {
+    #pragma omp critical
+    {
+      counted += 1;
+    } /* end of critical */
+    #pragma omp single
+    {
+      reported = omp_get_num_threads ();
+    }  /* end of single */
+  } /* end of parallel */
+  if (counted != reported)
+    return 0;
+  return 1;
+}
+
+int main()
+{
+  /* exit status = number of repetitions in which the check failed */
+  int failures = 0;
+  int rep = 0;
+  while (rep < REPETITIONS) {
+    if (test_omp_get_num_threads() == 0)
+      failures++;
+    rep++;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/api/omp_get_wtick.c b/final/runtime/test/api/omp_get_wtick.c
new file mode 100644
index 0000000..11a320f
--- /dev/null
+++ b/final/runtime/test/api/omp_get_wtick.c
@@ -0,0 +1,24 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int test_omp_get_wtick()
+{
+  /* omp_get_wtick() must be positive and no larger than 0.01 */
+  double resolution = omp_get_wtick ();
+  if (resolution <= 0.0)
+    return 0;
+  return (resolution <= 0.01);
+}
+
+int main()
+{
+  /* exit status = number of repetitions in which the check failed */
+  int failures = 0;
+  int rep = 0;
+  while (rep < REPETITIONS) {
+    if (test_omp_get_wtick() == 0)
+      failures++;
+    rep++;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/api/omp_get_wtime.c b/final/runtime/test/api/omp_get_wtime.c
new file mode 100644
index 0000000..b309440
--- /dev/null
+++ b/final/runtime/test/api/omp_get_wtime.c
@@ -0,0 +1,33 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+int test_omp_get_wtime()
+{
+  /* Sleep for a known interval and require the omp_get_wtime() delta to lie
+     strictly between 97% and 103% of it. */
+  double wait_time = 5.0;
+  double t_begin;
+  double elapsed;
+  t_begin = omp_get_wtime();
+  my_sleep (wait_time);
+  elapsed = omp_get_wtime() - t_begin;
+  if (elapsed <= 0.97 * wait_time)
+    return 0;
+  return (elapsed < 1.03 * wait_time);
+}
+
+int main()
+{
+  /* exit status = number of repetitions in which the check failed */
+  int failures = 0;
+  int rep = 0;
+  while (rep < REPETITIONS) {
+    if (test_omp_get_wtime() == 0)
+      failures++;
+    rep++;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/api/omp_in_parallel.c b/final/runtime/test/api/omp_in_parallel.c
new file mode 100644
index 0000000..5e9e635
--- /dev/null
+++ b/final/runtime/test/api/omp_in_parallel.c
@@ -0,0 +1,44 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+/*
+ * Checks that false is returned when called from serial region
+ * and true is returned when called within parallel region.
+ */
+int test_omp_in_parallel()
+{
+  /* sample omp_in_parallel() once outside and once inside a parallel region */
+  int outside = omp_in_parallel();
+  int inside = 0;
+
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      inside = omp_in_parallel();
+    }
+  }
+
+  if (outside)
+    return 0;
+  return (inside != 0);
+}
+
+int main()
+{
+  int failures = 0;
+  int rep;
+
+  // the test requires more than 1 thread to pass
+  omp_set_dynamic(0); // disable dynamic adjustment of threads
+  if (omp_get_max_threads() == 1) {
+    omp_set_num_threads(2); // set 2 threads if no HW resources available
+  }
+
+  for (rep = 0; rep < REPETITIONS; ++rep) {
+    if (test_omp_in_parallel() == 0)
+      failures++;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/api/omp_pause_resource.c b/final/runtime/test/api/omp_pause_resource.c
new file mode 100644
index 0000000..32c1120
--- /dev/null
+++ b/final/runtime/test/api/omp_pause_resource.c
@@ -0,0 +1,58 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int test_omp_pause_resource() {
+  int fails, nthreads, my_dev;
+  /* Each pause call must return 0 and a later parallel region must still run. */
+  fails = 0;
+  nthreads = 0;
+  my_dev = omp_get_initial_device();
+
+#pragma omp parallel
+#pragma omp single
+  nthreads = omp_get_num_threads();
+
+  if (omp_pause_resource(omp_pause_soft, my_dev))
+    fails++;
+  nthreads = 0; /* reset so the next check cannot pass on the stale value */
+#pragma omp parallel shared(nthreads)
+#pragma omp single
+  nthreads = omp_get_num_threads();
+
+  if (nthreads == 0)
+    fails++;
+  if (omp_pause_resource(omp_pause_hard, my_dev))
+    fails++;
+  nthreads = 0;
+
+#pragma omp parallel shared(nthreads)
+#pragma omp single
+  nthreads = omp_get_num_threads();
+
+  if (nthreads == 0)
+    fails++;
+  if (omp_pause_resource_all(omp_pause_soft))
+    fails++;
+  nthreads = 0;
+
+#pragma omp parallel shared(nthreads)
+#pragma omp single
+  nthreads = omp_get_num_threads();
+
+  if (nthreads == 0)
+    fails++;
+  return fails == 0; /* 1 = every pause and every later region succeeded */
+}
+
+int main() {
+  /* exit status = number of repetitions in which the check failed */
+  int failures = 0;
+  int rep = 0;
+  while (rep < REPETITIONS) {
+    if (test_omp_pause_resource() == 0)
+      failures++;
+    rep++;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/atomic/omp_atomic.c b/final/runtime/test/atomic/omp_atomic.c
new file mode 100644
index 0000000..7cdd30d
--- /dev/null
+++ b/final/runtime/test/atomic/omp_atomic.c
@@ -0,0 +1,366 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+#define DOUBLE_DIGITS 20  /* dt^DOUBLE_DIGITS */
+#define MAX_FACTOR 10
+#define KNOWN_PRODUCT 3628800  /* 10! */
+
+int test_omp_atomic()
+{
+  /* Applies each "omp atomic" update form below from a worksharing loop and
+     compares the result with its serial value; returns 1 if all agree. */
+  int sum;
+  int diff;
+  double dsum = 0;
+  double dt = 0.5;  /* base of geometric row for + and - test*/
+  double ddiff;
+  int product;
+  int x;
+  int *logics;
+  int bit_and = 1;
+  int bit_or = 0;
+  int exclusiv_bit_or = 0;
+  int j;
+  int known_sum;
+  int known_diff;
+  int known_product;
+  int result = 0;
+  double dknown_sum;
+  double rounding_error = 1.E-9;
+  double dpt, div;
+  int logicsArray[LOOPCOUNT];
+  logics = logicsArray;
+
+  sum = 0;
+  diff = 0;
+  product = 1;
+
+  // sum of integers test
+  #pragma omp parallel
+  {
+    int i;
+    #pragma omp for
+    for (i = 1; i <= LOOPCOUNT; i++) {
+      #pragma omp atomic
+      sum += i;
+    }
+
+  }
+  known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+  if (known_sum != sum)
+  {
+    fprintf(stderr,
+      "Error in sum with integers: Result was %d instead of %d.\n",
+      sum, known_sum);
+    result++;
+  }
+
+  // difference of integers test
+  #pragma omp parallel
+  {
+    int i;
+    #pragma omp for
+    for (i = 0; i < LOOPCOUNT; i++) {
+      #pragma omp atomic
+      diff -= i;
+    }
+  }
+  known_diff = ((LOOPCOUNT - 1) * LOOPCOUNT) / 2 * -1;
+  if (diff != known_diff)
+  {
+    fprintf (stderr,
+      "Error in difference with integers: Result was %d instead of %d.\n",
+      diff, known_diff);
+    result++;
+  }
+
+  // sum of doubles test
+  dsum = 0;
+  dpt = 1;
+  for (j = 0; j < DOUBLE_DIGITS; ++j) {
+    dpt *= dt;
+  }
+  dknown_sum = (1 - dpt) / (1 -dt);
+  #pragma omp parallel
+  {
+    int i;
+    #pragma omp for
+    for (i = 0; i < DOUBLE_DIGITS; ++i) {
+      #pragma omp atomic
+      dsum += pow (dt, i);
+    }
+  }
+  if (dsum != dknown_sum && (fabs (dsum - dknown_sum) > rounding_error)) {
+    fprintf (stderr, "Error in sum with doubles: Result was %f"
+      " instead of: %f (Difference: %E)\n",
+      dsum, dknown_sum, dsum - dknown_sum);
+    result++;
+  }
+
+  // difference of doubles test
+  dpt = 1;
+  for (j = 0; j < DOUBLE_DIGITS; ++j) {
+    dpt *= dt;
+  }
+  ddiff = (1 - dpt) / (1 - dt);
+  #pragma omp parallel
+  {
+    int i;
+    #pragma omp for
+    for (i = 0; i < DOUBLE_DIGITS; ++i) {
+      #pragma omp atomic
+      ddiff -= pow (dt, i);
+    }
+  }
+  if (fabs (ddiff) > rounding_error) {
+    fprintf (stderr,
+      "Error in difference with doubles: Result was %E instead of 0.0\n",
+      ddiff);
+    result++;
+  }
+
+  // product of integers test
+  #pragma omp parallel
+  {
+    int i;
+    #pragma omp for
+    for (i = 1; i <= MAX_FACTOR; i++) {
+      #pragma omp atomic
+      product *= i;
+    }
+  }
+  known_product = KNOWN_PRODUCT;
+  if (known_product != product) {
+    fprintf (stderr,
+      "Error in product with integers: Result was %d instead of %d\n",
+      product, known_product);
+    result++;
+  }
+
+  // division of integers test
+  product = KNOWN_PRODUCT;
+  #pragma omp parallel
+  {
+     int i;
+    #pragma omp for
+    for (i = 1; i <= MAX_FACTOR; ++i) {
+      #pragma omp atomic
+      product /= i;
+    }
+  }
+  if (product != 1) {
+    fprintf (stderr,
+      "Error in product division with integers: Result was %d"
+      " instead of 1\n",
+      product);
+    result++;
+  }
+
+  // division of doubles test
+  div = 5.0E+5;
+  #pragma omp parallel
+  {
+    int i;
+    #pragma omp for
+    for (i = 1; i <= MAX_FACTOR; i++) {
+      #pragma omp atomic
+      div /= i;
+    }
+  }
+  if (fabs(div-0.137787) >= 1.0E-4 ) {
+    result++;
+    fprintf (stderr, "Error in division with double: Result was %f"
+      " instead of 0.137787\n", div);
+  }
+
+  // ++ test
+  x = 0;
+  #pragma omp parallel
+  {
+    int i;
+    #pragma omp for
+    for (i = 0; i < LOOPCOUNT; ++i) {
+      #pragma omp atomic
+      x++;
+    }
+  }
+  if (x != LOOPCOUNT) {
+    result++;
+    fprintf (stderr, "Error in ++\n");
+  }
+
+  // -- test (continues from the value of x left by the ++ test)
+  #pragma omp parallel
+  {
+    int i;
+    #pragma omp for
+    for (i = 0; i < LOOPCOUNT; ++i) {
+      #pragma omp atomic
+      x--;
+    }
+  }
+  if (x != 0) {
+    result++;
+    fprintf (stderr, "Error in --\n");
+  }
+
+  // bit-and test part 1
+  for (j = 0; j < LOOPCOUNT; ++j) {
+    logics[j] = 1;
+  }
+  bit_and = 1;
+  #pragma omp parallel
+  {
+     int i;
+    #pragma omp for
+    for (i = 0; i < LOOPCOUNT; ++i) {
+      #pragma omp atomic
+      bit_and &= logics[i];
+    }
+  }
+  if (!bit_and) {
+    result++;
+    fprintf (stderr, "Error in BIT AND part 1\n");
+  }
+
+  // bit-and test part 2
+  bit_and = 1;
+  logics[LOOPCOUNT / 2] = 0;
+  #pragma omp parallel
+  {
+    int i;
+    #pragma omp for
+    for (i = 0; i < LOOPCOUNT; ++i) {
+      #pragma omp atomic
+      bit_and &= logics[i];
+    }
+  }
+  if (bit_and) {
+    result++;
+    fprintf (stderr, "Error in BIT AND part 2\n");
+  }
+
+  // bit-or test part 1
+  for (j = 0; j < LOOPCOUNT; j++) {
+    logics[j] = 0;
+  }
+  bit_or = 0;
+  #pragma omp parallel
+  {
+    int i;
+    #pragma omp for
+    for (i = 0; i < LOOPCOUNT; ++i) {
+      #pragma omp atomic
+      bit_or |= logics[i];
+    }
+  }
+  if (bit_or) {
+    result++;
+    fprintf (stderr, "Error in BIT OR part 1\n");
+  }
+
+  // bit-or test part 2
+  bit_or = 0;
+  logics[LOOPCOUNT / 2] = 1;
+  #pragma omp parallel
+  {
+
+    int i;
+    #pragma omp for
+    for (i = 0; i < LOOPCOUNT; ++i) {
+      #pragma omp atomic
+      bit_or |= logics[i];
+    }
+  }
+  if (!bit_or) {
+    result++;
+    fprintf (stderr, "Error in BIT OR part 2\n");
+  }
+
+  // bit-xor test part 1
+  for (j = 0; j < LOOPCOUNT; j++) {
+    logics[j] = 0;
+  }
+  exclusiv_bit_or = 0;
+  #pragma omp parallel
+  {
+    int i;
+    #pragma omp for
+    for (i = 0; i < LOOPCOUNT; ++i) {
+      #pragma omp atomic
+      exclusiv_bit_or ^= logics[i];
+    }
+  }
+  if (exclusiv_bit_or) {
+    result++;
+    fprintf (stderr, "Error in EXCLUSIV BIT OR part 1\n");
+  }
+
+  // bit-xor test part 2
+  exclusiv_bit_or = 0;
+  logics[LOOPCOUNT / 2] = 1;
+  #pragma omp parallel
+  {
+    int i;
+    #pragma omp for
+    for (i = 0; i < LOOPCOUNT; ++i) {
+      #pragma omp atomic
+      exclusiv_bit_or ^= logics[i];
+    }
+
+  }
+  if (!exclusiv_bit_or) {
+    result++;
+    fprintf (stderr, "Error in EXCLUSIV BIT OR part 2\n");
+  }
+
+  // left shift test
+  x = 1;
+  #pragma omp parallel
+  {
+    int i;
+    #pragma omp for
+    for (i = 0; i < 10; ++i) {
+      #pragma omp atomic
+      x <<= 1;
+    }
+
+  }
+  if ( x != 1024) {
+    result++;
+    fprintf (stderr, "Error in <<\n");
+    x = 1024; // restore the expected value so the right shift test starts from it
+  }
+
+  // right shift test
+  #pragma omp parallel
+  {
+    int i;
+    #pragma omp for
+    for (i = 0; i < 10; ++i) {
+      #pragma omp atomic
+      x >>= 1;
+    }
+  }
+  if (x != 1) {
+    result++;
+    fprintf (stderr, "Error in >>\n");
+  }
+
+  return (result == 0);
+} // test_omp_atomic()
+
+int main()
+{
+  /* exit status = number of repetitions in which the check failed */
+  int failures = 0;
+  int rep = 0;
+  while (rep < REPETITIONS) {
+    if (test_omp_atomic() == 0)
+      failures++;
+    rep++;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/barrier/omp_barrier.c b/final/runtime/test/barrier/omp_barrier.c
new file mode 100644
index 0000000..a3fb060
--- /dev/null
+++ b/final/runtime/test/barrier/omp_barrier.c
@@ -0,0 +1,44 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+int test_omp_barrier()
+{
+  /* Thread 1 stores 3 after sleeping; thread 2 copies that variable after the
+     barrier, so the copy is 3 only if thread 2 waited for thread 1. */
+  int copied = 0;
+  int published = 0;
+
+  #pragma omp parallel
+  {
+    int tid = omp_get_thread_num ();
+    if (tid == 1) {
+      my_sleep(((double)SLEEPTIME)/REPETITIONS); // give 1 sec to whole test
+      published = 3;
+    }
+    #pragma omp barrier
+    if (tid == 2) {
+      copied = published;
+    }
+  }
+
+  return (copied == 3);
+}
+
+int main()
+{
+  /* without _OPENMP nothing is run and the exit status stays 0 */
+  int failures = 0;
+#ifdef _OPENMP
+  int rep;
+
+  omp_set_dynamic(0); // prevent runtime to change number of threads
+  omp_set_num_threads(4); // the test expects at least 3 threads
+  for (rep = 0; rep < REPETITIONS; ++rep) {
+    if (test_omp_barrier() == 0)
+      failures++;
+  }
+#endif
+  return failures;
+}
diff --git a/final/runtime/test/critical/omp_critical.c b/final/runtime/test/critical/omp_critical.c
new file mode 100644
index 0000000..e07dbcb
--- /dev/null
+++ b/final/runtime/test/critical/omp_critical.c
@@ -0,0 +1,37 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int test_omp_critical()
+{
+  /* Threads build private partial sums of 0..999 and merge them into the
+     shared total inside a critical section. */
+  int total = 0;
+  int expected = 999 * 1000 / 2;
+
+  #pragma omp parallel
+  {
+    int partial = 0;
+    int k;
+    #pragma omp for
+    for (k = 0; k < 1000; k++)
+      partial += k;
+
+    #pragma omp critical
+    total += partial;
+  }
+  return (expected == total);
+}
+
+int main()
+{
+  /* exit status = number of repetitions in which the check failed */
+  int failures = 0;
+  int rep = 0;
+  while (rep < REPETITIONS) {
+    if (test_omp_critical() == 0)
+      failures++;
+    rep++;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/env/kmp_aff_disable_hwloc.c b/final/runtime/test/env/kmp_aff_disable_hwloc.c
new file mode 100644
index 0000000..5f848ac
--- /dev/null
+++ b/final/runtime/test/env/kmp_aff_disable_hwloc.c
@@ -0,0 +1,21 @@
+// RUN: %libomp-compile && env KMP_AFFINITY=disabled KMP_TOPOLOGY_METHOD=hwloc %libomp-run
+// REQUIRES: hwloc
+#include <stdio.h>
+#include <stdlib.h>
+
+// Test will assert() without fix
+int test_affinity_disabled_plus_hwloc() { // runs one empty parallel region under the RUN line's settings
+  #pragma omp parallel
+  {} // no work: the region only has to start and finish
+  return 1; // always reports success; a failure would show up as the assert() mentioned above
+}
+
+int main(int argc, char **argv) {
+  // exit status: 0 when the check returned nonzero, 1 otherwise
+  int failed = 0;
+
+  if (!test_affinity_disabled_plus_hwloc()) {
+    failed = 1;
+  }
+  return failed;
+}
diff --git a/final/runtime/test/env/kmp_set_dispatch_buf.c b/final/runtime/test/env/kmp_set_dispatch_buf.c
new file mode 100644
index 0000000..49eb7b5
--- /dev/null
+++ b/final/runtime/test/env/kmp_set_dispatch_buf.c
@@ -0,0 +1,76 @@
+// RUN: %libomp-compile && env KMP_DISP_NUM_BUFFERS=0 %libomp-run
+// RUN: env KMP_DISP_NUM_BUFFERS=1 %libomp-run && env KMP_DISP_NUM_BUFFERS=3 %libomp-run
+// RUN: env KMP_DISP_NUM_BUFFERS=4 %libomp-run && env KMP_DISP_NUM_BUFFERS=7 %libomp-run
+// RUN: %libomp-compile -DMY_SCHEDULE=guided && env KMP_DISP_NUM_BUFFERS=1 %libomp-run
+// RUN: env KMP_DISP_NUM_BUFFERS=3 %libomp-run && env KMP_DISP_NUM_BUFFERS=4 %libomp-run
+// RUN: env KMP_DISP_NUM_BUFFERS=7 %libomp-run
+#include <stdio.h>
+#include <omp.h>
+#include <stdlib.h>
+#include <limits.h>
+#include "omp_testsuite.h"
+
+#define INCR 7
+#define MY_MAX 200
+#define MY_MIN -200
+#define NUM_LOOPS 100
+#ifndef MY_SCHEDULE
+# define MY_SCHEDULE dynamic
+#endif
+
+int a, b, a_known_value, b_known_value;
+
+int test_kmp_set_disp_num_buffers()
+{
+  int success = 1; // cleared when either counter misses its known value
+  a = 0;
+  b = 0;
+  // run many small dynamic loops to stress the dispatch buffer system
+  #pragma omp parallel
+  {
+    int i,j;
+    for (j = 0; j < NUM_LOOPS; j++) { // every thread runs the whole outer loop
+      #pragma omp for schedule(MY_SCHEDULE) nowait
+      for (i = MY_MIN; i < MY_MAX; i+=INCR) { // ascending loop, counted in a
+        #pragma omp atomic
+        a++;
+      }
+      #pragma omp for schedule(MY_SCHEDULE) nowait
+      for (i = MY_MAX; i >= MY_MIN; i-=INCR) { // descending loop, counted in b
+        #pragma omp atomic
+        b++;
+      }
+    }
+  }
+  // detect failure
+  if (a != a_known_value || b != b_known_value) {
+    success = 0;
+    printf("a = %d (should be %d), b = %d (should be %d)\n", a, a_known_value,
+           b, b_known_value);
+  }
+  return success; // 1 = both counters matched
+}
+
+int main(int argc, char** argv)
+{
+  int failures = 0;
+  int rep, k;
+  // figure out the known values to compare with calculated result
+  a_known_value = 0;
+  b_known_value = 0;
+  for (rep = 0; rep < NUM_LOOPS; ++rep) {
+    k = MY_MIN;
+    while (k < MY_MAX) {
+      a_known_value += 1;
+      k += INCR;
+    }
+    for (k = MY_MAX; k >= MY_MIN; k -= INCR)
+      b_known_value += 1;
+  }
+
+  for (rep = 0; rep < REPETITIONS; ++rep) {
+    if (test_kmp_set_disp_num_buffers() == 0)
+      failures++;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/env/omp_target_offload.c b/final/runtime/test/env/omp_target_offload.c
new file mode 100644
index 0000000..91ce108
--- /dev/null
+++ b/final/runtime/test/env/omp_target_offload.c
@@ -0,0 +1,62 @@
+// RUN: %libomp-compile-and-run
+#include <string.h>
+#include <stdlib.h>
+
+enum kmp_target_offload_kind { // compared with __kmpc_get_target_offload()
+  tgt_disabled = 0,
+  tgt_default = 1,
+  tgt_mandatory = 2
+};
+
+extern int __kmpc_get_target_offload();
+
+const char *disabled_examples[] = {
+    // Allowed inputs (mixed case is expected to be accepted)
+    "disabled", "DISABLED", "Disabled", "dIsAbLeD", "DiSaBlEd"};
+
+const char *default_examples[] = {
+    // Allowed inputs (mixed case is expected to be accepted)
+    "default", "DEFAULT", "Default", "deFAulT", "DEfaULt",
+    // These should be changed to default (failed match)
+    "mandatry", "defaults", "disable", "enabled", "mandatorynot"};
+
+const char *mandatory_examples[] = {
+    // Allowed inputs (mixed case is expected to be accepted)
+    "mandatory", "MANDATORY", "Mandatory", "manDatoRy", "MANdATOry"};
+
+// Return target-offload-var ICV as reported by __kmpc_get_target_offload()
+int get_target_offload_icv() {
+#pragma omp parallel
+  {} // empty region; presumably lets the runtime apply new settings - confirm
+  return __kmpc_get_target_offload();
+}
+
+int main() {
+  int i;
+  const char *omp_target_offload = "OMP_TARGET_OFFLOAD=";
+  char buf[80];
+
+  for (i = 0; i < sizeof(disabled_examples) / sizeof(char *); ++i) {
+    strcpy(buf, omp_target_offload);
+    strcat(buf, disabled_examples[i]);
+    kmp_set_defaults(buf);
+    if (tgt_disabled != get_target_offload_icv())
+      return EXIT_FAILURE;
+  }
+  for (i = 0; i < sizeof(default_examples) / sizeof(char *); ++i) {
+    strcpy(buf, omp_target_offload);
+    strcat(buf, default_examples[i]);
+    kmp_set_defaults(buf);
+    if (tgt_default != get_target_offload_icv())
+      return EXIT_FAILURE;
+  }
+  for (i = 0; i < sizeof(mandatory_examples) / sizeof(char *); ++i) {
+    strcpy(buf, omp_target_offload);
+    strcat(buf, mandatory_examples[i]);
+    kmp_set_defaults(buf);
+    if (tgt_mandatory != get_target_offload_icv())
+      return EXIT_FAILURE;
+  }
+
+  return EXIT_SUCCESS;
+}
diff --git a/final/runtime/test/env/omp_thread_limit.c b/final/runtime/test/env/omp_thread_limit.c
new file mode 100644
index 0000000..800edc4
--- /dev/null
+++ b/final/runtime/test/env/omp_thread_limit.c
@@ -0,0 +1,82 @@
+// RUN: %libomp-compile && env OMP_THREAD_LIMIT=4 %libomp-run 4
+// RUN: %libomp-compile && env OMP_THREAD_LIMIT=7 %libomp-run 7
+//
+// OMP_THREAD_LIMIT=N should imply that no more than N threads are active in
+// a contention group
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+#include "omp_testsuite.h"
+
+int failed = 0;
+
+void usage() { // print the expected command line to stderr
+    fputs("usage: omp_thread_limit <n>\n", stderr);
+}
+
+void verify(const char* file_name, int line_number, int team_size) {
+  int actual = omp_get_num_threads(); // size of the calling thread's team
+  if (actual == team_size)
+    return;
+#pragma omp critical(A)
+  {
+    char where[256]; // "<file>:<line>" of the failing verify() call
+    snprintf(where, sizeof(where), "%s:%d", file_name, line_number);
+    failed = 1;
+    printf("failed: %s: team_size(%d) != omp_get_num_threads(%d)\n",
+           where, team_size, actual);
+  }
+}
+
+int main(int argc, char** argv)
+{
+  int cl_thread_limit; // thread limit the RUN line passes as argv[1]
+
+  if (argc != 2) {
+    usage();
+    return 1;
+  }
+  cl_thread_limit = atoi(argv[1]);
+
+  omp_set_dynamic(0);
+  if (omp_get_thread_limit() != cl_thread_limit) {
+    fprintf(stderr, "omp_get_thread_limit failed with %d, should be %d\n",
+            omp_get_thread_limit(), cl_thread_limit);
+    return 1;
+  }
+  else if (omp_get_max_threads() > cl_thread_limit) {
+#if _OPENMP
+    int team_size = cl_thread_limit; // teams are expected to be capped here
+#else
+    int team_size = 1;
+#endif
+    omp_set_num_threads(19); // more than either RUN-line limit (4 and 7)
+    verify(__FILE__, __LINE__, 1);
+#pragma omp parallel
+    {
+      verify(__FILE__, __LINE__, team_size);
+      verify(__FILE__, __LINE__, team_size);
+    }
+    verify(__FILE__, __LINE__, 1);
+
+    omp_set_nested(1);
+#pragma omp parallel num_threads(3)
+    {
+      verify(__FILE__, __LINE__, 3);
+#pragma omp master
+#pragma omp parallel num_threads(21)
+      {
+        // 3 threads are already active, so the master's team gets limit-2
+        verify(__FILE__, __LINE__, team_size-2);
+      }
+    }
+    verify(__FILE__, __LINE__, 1);
+
+    return failed; // 0 unless some verify() call reported a mismatch
+  } else {
+    fprintf(stderr, "This test is not applicable for max num_threads='%d'\n",
+            omp_get_max_threads());
+    return 0;
+  }
+
+}
diff --git a/final/runtime/test/env/omp_wait_policy.c b/final/runtime/test/env/omp_wait_policy.c
new file mode 100644
index 0000000..b260ce4
--- /dev/null
+++ b/final/runtime/test/env/omp_wait_policy.c
@@ -0,0 +1,40 @@
+// RUN: %libomp-compile && env OMP_WAIT_POLICY=active %libomp-run active
+// RUN: %libomp-compile && env OMP_WAIT_POLICY=passive %libomp-run passive
+//
+// OMP_WAIT_POLICY=active should imply blocktime == INT_MAX
+// i.e., threads spin-wait forever
+// OMP_WAIT_POLICY=passive should imply blocktime == 0
+// i.e., threads immediately sleep
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+#include "omp_testsuite.h"
+
+void usage() { // print the expected command line to stderr
+    fputs("usage: omp_wait_policy active|passive\n", stderr);
+}
+
+int main(int argc, char** argv)
+{
+  const char* policy;
+  int blocktime, expected;
+
+  if (argc != 2) {
+    usage();
+    return 1;
+  }
+
+  // Query the runtime first, then map the policy name to its blocktime.
+  blocktime = kmp_get_blocktime();
+  policy = argv[1];
+  if (strcmp(policy, "active") == 0) {
+    expected = INT_MAX;
+  } else if (strcmp(policy, "passive") == 0) {
+    expected = 0;
+  } else {
+    usage();
+    return 1;
+  }
+
+  return blocktime != expected; // 0 (success) only when they agree
+}
diff --git a/final/runtime/test/flush/omp_flush.c b/final/runtime/test/flush/omp_flush.c
new file mode 100644
index 0000000..95a406d
--- /dev/null
+++ b/final/runtime/test/flush/omp_flush.c
@@ -0,0 +1,50 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+int test_omp_flush() // returns 1 on pass, 0 on failure
+{
+  int result1;
+  int result2;
+  int dummy = 0; // initialized: it is read below even if no thread has rank 1
+
+  result1 = 0;
+  result2 = 0;
+
+  #pragma omp parallel
+  {
+    int rank;
+    rank = omp_get_thread_num ();
+    #pragma omp barrier
+    if (rank == 1) { // writer: store result2, flush it, read it back
+      result2 = 3;
+      #pragma omp flush (result2)
+      dummy = result2;
+    }
+    if (rank == 0) { // reader: wait, flush, then copy what thread 1 wrote
+      my_sleep(SLEEPTIME);
+      #pragma omp flush (result2)
+      result1 = result2;
+    }
+  }  /* end of parallel */
+  return ((result1 == result2) && (result2 == dummy) && (result2 == 3));
+}
+
+int main()
+{
+  int failures = 0;
+  int rep;
+
+  // At least two threads are needed, so pin the team size: turn off dynamic
+  // adjustment and request 2 threads when the default would be only 1.
+  omp_set_dynamic(0);
+  if (omp_get_max_threads() == 1)
+    omp_set_num_threads(2);
+
+  for (rep = 0; rep < REPETITIONS; ++rep) {
+    if (test_omp_flush() == 0)
+      ++failures;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/lit.cfg b/final/runtime/test/lit.cfg
new file mode 100644
index 0000000..22be385
--- /dev/null
+++ b/final/runtime/test/lit.cfg
@@ -0,0 +1,137 @@
+# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79:
+# Configuration file for the 'lit' test runner.
+
+import os
+import re
+import subprocess
+import lit.formats
+
+# Tell pylint that we know config and lit_config exist somewhere.
+if 'PYLINT_IMPORT' in os.environ:
+    config = object()
+    lit_config = object()
+
+def append_dynamic_library_path(path):
+    if config.operating_system == 'Windows':
+        name = 'PATH'
+        sep = ';'
+    elif config.operating_system == 'Darwin':
+        name = 'DYLD_LIBRARY_PATH'
+        sep = ':'
+    else:
+        name = 'LD_LIBRARY_PATH'
+        sep = ':'
+    if name in config.environment:
+        config.environment[name] = path + sep + config.environment[name]
+    else:
+        config.environment[name] = path
+
+# name: The name of this test suite.
+config.name = 'libomp'
+
+# suffixes: A list of file extensions to treat as test files.
+config.suffixes = ['.c', '.cpp']
+
+# test_source_root: The root path where tests are located.
+config.test_source_root = os.path.dirname(__file__)
+
+# test_exec_root: The root object directory where output is placed
+config.test_exec_root = config.libomp_obj_root
+
+# test format: tests are driven by lit's ShTest format
+config.test_format = lit.formats.ShTest()
+
+# compiler flags: include paths (test sources, OpenMP headers), library path
+config.test_flags = " -I " + config.test_source_root + \
+    " -I " + config.omp_header_directory + \
+    " -L " + config.library_dir + \
+    " " + config.test_extra_flags
+
+# extra libraries: appended to the compile substitutions later in this file
+libs = ""
+if config.has_libm:
+    libs += " -lm"
+if config.has_libatomic:
+    libs += " -latomic"
+
+# Allow REQUIRES / UNSUPPORTED / XFAIL to work
+config.target_triple = [ ]
+for feature in config.test_compiler_features:
+    config.available_features.add(feature)
+
+# Setup environment to find dynamic library at runtime
+append_dynamic_library_path(config.library_dir)
+if config.using_hwloc:
+    append_dynamic_library_path(config.hwloc_library_dir)
+    config.available_features.add('hwloc')
+
+# Rpath modifications for Darwin
+if config.operating_system == 'Darwin':
+    config.test_flags += " -Wl,-rpath," + config.library_dir
+    if config.using_hwloc:
+        config.test_flags += " -Wl,-rpath," + config.hwloc_library_dir
+
+# Find the SDK on Darwin
+if config.operating_system == 'Darwin':
+  cmd = subprocess.Popen(['xcrun', '--show-sdk-path'],
+                         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+  out, err = cmd.communicate()
+  out = out.strip()
+  res = cmd.wait()
+  if res == 0 and out:
+    config.test_flags += " -isysroot " + out
+
+# Disable OMPT tests if FileCheck was not found
+if config.has_ompt and config.test_filecheck == "":
+    lit_config.note("Not testing OMPT because FileCheck was not found")
+    config.has_ompt = False
+
+if config.has_ompt:
+    config.available_features.add("ompt")
+    # for callback.h
+    config.test_flags += " -I " + config.test_source_root + "/ompt"
+
+if 'Linux' in config.operating_system:  # substring match, unlike the tests below
+    config.available_features.add("linux")
+
+if config.operating_system == 'NetBSD':
+    config.available_features.add("netbsd")
+
+if config.operating_system in ['Linux', 'Windows']:
+    config.available_features.add('affinity')
+
+# to run with icc INTEL_LICENSE_FILE must be set, so forward it to the tests
+if 'INTEL_LICENSE_FILE' in os.environ:
+    config.environment['INTEL_LICENSE_FILE'] = os.environ['INTEL_LICENSE_FILE']
+
+
+# substitutions
+config.substitutions.append(("%libomp-compile-and-run", \
+    "%libomp-compile && %libomp-run"))
+config.substitutions.append(("%libomp-cxx-compile-and-run", \
+    "%libomp-cxx-compile && %libomp-run"))
+config.substitutions.append(("%libomp-cxx-compile", \
+    "%clangXX %openmp_flags %flags -std=c++11 %s -o %t" + libs))
+config.substitutions.append(("%libomp-compile", \
+    "%clang %openmp_flags %flags %s -o %t" + libs))
+config.substitutions.append(("%libomp-run", "%t"))
+config.substitutions.append(("%clangXX", config.test_cxx_compiler))
+config.substitutions.append(("%clang", config.test_c_compiler))
+config.substitutions.append(("%openmp_flags", config.test_openmp_flags))
+config.substitutions.append(("%flags", config.test_flags))
+config.substitutions.append(("%python", '"%s"' % (sys.executable)))
+
+if config.has_ompt:
+    config.substitutions.append(("FileCheck", "tee %%t.out | %s" % config.test_filecheck))
+    config.substitutions.append(("%sort-threads", "sort -n -s"))
+    if config.operating_system == 'Windows':
+        # No such environment variable on Windows.
+        config.substitutions.append(("%preload-tool", "true ||"))
+        config.substitutions.append(("%no-as-needed-flag", "-Wl,--no-as-needed"))
+    elif config.operating_system == 'Darwin':
+        config.substitutions.append(("%preload-tool", "env DYLD_INSERT_LIBRARIES=%T/tool.so"))
+        # No such linker flag on Darwin.
+        config.substitutions.append(("%no-as-needed-flag", ""))
+    else:
+        config.substitutions.append(("%preload-tool", "env LD_PRELOAD=%T/tool.so"))
+        config.substitutions.append(("%no-as-needed-flag", "-Wl,--no-as-needed"))
diff --git a/final/runtime/test/lit.site.cfg.in b/final/runtime/test/lit.site.cfg.in
new file mode 100644
index 0000000..c2825ee
--- /dev/null
+++ b/final/runtime/test/lit.site.cfg.in
@@ -0,0 +1,20 @@
+@AUTO_GEN_COMMENT@
+
+config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@"
+config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@"
+config.test_compiler_features = @OPENMP_TEST_COMPILER_FEATURES@  # unquoted: must expand to a Python expression
+config.test_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@"  # lit.cfg treats "" as "not found"
+config.test_openmp_flags = "@OPENMP_TEST_OPENMP_FLAGS@"
+config.test_extra_flags = "@OPENMP_TEST_FLAGS@"
+config.libomp_obj_root = "@CMAKE_CURRENT_BINARY_DIR@"
+config.library_dir = "@LIBOMP_LIBRARY_DIR@"
+config.omp_header_directory = "@LIBOMP_BINARY_DIR@/src"
+config.operating_system = "@CMAKE_SYSTEM_NAME@"
+config.hwloc_library_dir = "@LIBOMP_HWLOC_LIBRARY_DIR@"
+config.using_hwloc = @LIBOMP_USE_HWLOC@  # unquoted: must expand to a Python expression
+config.has_ompt = @LIBOMP_OMPT_SUPPORT@ and @LIBOMP_OMPT_OPTIONAL@  # both must be true
+config.has_libm = @LIBOMP_HAVE_LIBM@
+config.has_libatomic = @LIBOMP_HAVE_LIBATOMIC@
+
+# Let the main config do the real work.
+lit_config.load_config(config, "@LIBOMP_BASE_DIR@/test/lit.cfg")
diff --git a/final/runtime/test/lock/omp_init_lock.c b/final/runtime/test/lock/omp_init_lock.c
new file mode 100644
index 0000000..24b60d1
--- /dev/null
+++ b/final/runtime/test/lock/omp_init_lock.c
@@ -0,0 +1,42 @@
+// RUN: %libomp-compile-and-run
+#include "omp_testsuite.h"
+#include <stdio.h>
+
+// This should be slightly less than KMP_I_LOCK_CHUNK, which is 1024
+#define LOCKS_PER_ITER 1000
+#define ITERATIONS (REPETITIONS + 1)
+
+// This tests concurrently using locks on one thread while initializing new
+// ones on another thread.  This exercises the global lock pool.
+int test_omp_init_lock() {
+  int i;
+  omp_lock_t lcks[ITERATIONS * LOCKS_PER_ITER];
+#pragma omp parallel for schedule(static) num_threads(NUM_TASKS)
+  for (i = 0; i < ITERATIONS; i++) {
+    int j;
+    omp_lock_t *my_lcks = &lcks[i * LOCKS_PER_ITER]; // this iteration's slice
+    for (j = 0; j < LOCKS_PER_ITER; j++) {
+      omp_init_lock(&my_lcks[j]);
+    }
+    for (j = 0; j < LOCKS_PER_ITER * 100; j++) { // cycle over the slice 100 times
+      omp_set_lock(&my_lcks[j % LOCKS_PER_ITER]);
+      omp_unset_lock(&my_lcks[j % LOCKS_PER_ITER]);
+    }
+  }
+  // Wait until all repetitions are done.  The test is exercising growth of
+  // the global lock pool, which does not shrink when no locks are allocated.
+  {
+    int j;
+    for (j = 0; j < ITERATIONS * LOCKS_PER_ITER; j++) {
+      omp_destroy_lock(&lcks[j]);
+    }
+  }
+
+  return 0; // always reports success; the test passes by running to completion
+}
+
+int main() {
+  // No use repeating this test, since it's exercising a private global pool
+  // which is not reset between test iterations.
+  return test_omp_init_lock(); // exit status is the test's return value
+}
diff --git a/final/runtime/test/lock/omp_lock.c b/final/runtime/test/lock/omp_lock.c
new file mode 100644
index 0000000..1301f27
--- /dev/null
+++ b/final/runtime/test/lock/omp_lock.c
@@ -0,0 +1,47 @@
+// RUN: %libomp-compile-and-run
+// RUN: env KMP_LOCK_KIND=tas KMP_SPIN_BACKOFF_PARAMS=2048,200 %libomp-run
+// RUN: env KMP_LOCK_KIND=futex %libomp-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+omp_lock_t lck;
+
+int test_omp_lock() // returns 1 on pass, 0 on failure
+{
+  int nr_threads_in_single = 0; // threads currently inside the locked region
+  int result = 0; // sum of nr_threads_in_single seen on exit; must stay 0
+  int nr_iterations = 0; // iterations executed under the lock
+  int i;
+
+  omp_init_lock(&lck);
+  #pragma omp parallel shared(lck)
+  {
+    #pragma omp for
+    for(i = 0; i < LOOPCOUNT; i++) {
+      omp_set_lock(&lck);
+      #pragma omp flush
+      nr_threads_in_single++;
+      #pragma omp flush
+      nr_iterations++;
+      nr_threads_in_single--;
+      result = result + nr_threads_in_single; // adds 0 unless another thread got in
+      omp_unset_lock(&lck);
+    }
+  }
+  omp_destroy_lock(&lck);
+
+  return ((result == 0) && (nr_iterations == LOOPCOUNT));
+}
+
+int main()
+{
+  int failures = 0; // number of repetitions whose check did not pass
+  int rep;
+
+  // Repeat the test REPETITIONS times; the exit status is the failure count.
+  for (rep = 0; rep < REPETITIONS; ++rep) {
+    if (test_omp_lock() == 0)
+      ++failures;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/lock/omp_nest_lock.c b/final/runtime/test/lock/omp_nest_lock.c
new file mode 100644
index 0000000..33d7c6a
--- /dev/null
+++ b/final/runtime/test/lock/omp_nest_lock.c
@@ -0,0 +1,45 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+omp_nest_lock_t lck;
+
+int test_omp_nest_lock() // returns 1 on pass, 0 on failure
+{
+  int nr_threads_in_single = 0; // threads currently inside the locked region
+  int result = 0; // sum of nr_threads_in_single seen on exit; must stay 0
+  int nr_iterations = 0; // iterations executed under the lock
+  int i;
+
+  omp_init_nest_lock(&lck);
+  #pragma omp parallel shared(lck)
+  {
+    #pragma omp for
+    for(i = 0; i < LOOPCOUNT; i++) {
+      omp_set_nest_lock(&lck); // taken once per iteration; never re-acquired here
+      #pragma omp flush
+      nr_threads_in_single++;
+      #pragma omp flush
+      nr_iterations++;
+      nr_threads_in_single--;
+      result = result + nr_threads_in_single; // adds 0 unless another thread got in
+      omp_unset_nest_lock(&lck);
+    }
+  }
+  omp_destroy_nest_lock(&lck);
+
+  return ((result == 0) && (nr_iterations == LOOPCOUNT));
+}
+
+int main()
+{
+  int failures = 0; // number of repetitions whose check did not pass
+  int rep;
+
+  // Repeat the test REPETITIONS times; the exit status is the failure count.
+  for (rep = 0; rep < REPETITIONS; ++rep) {
+    if (test_omp_nest_lock() == 0)
+      ++failures;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/lock/omp_test_lock.c b/final/runtime/test/lock/omp_test_lock.c
new file mode 100644
index 0000000..c512055
--- /dev/null
+++ b/final/runtime/test/lock/omp_test_lock.c
@@ -0,0 +1,47 @@
+// RUN: %libomp-compile-and-run
+// RUN: env KMP_LOCK_KIND=tas %libomp-run
+// RUN: env KMP_LOCK_KIND=futex %libomp-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+omp_lock_t lck;
+
+int test_omp_test_lock() // returns 1 on pass, 0 on failure
+{
+  int nr_threads_in_single = 0; // threads currently inside the locked region
+  int result = 0; // sum of nr_threads_in_single seen on exit; must stay 0
+  int nr_iterations = 0; // iterations executed under the lock
+  int i;
+
+  omp_init_lock (&lck);
+  #pragma omp parallel shared(lck)
+  {
+    #pragma omp for
+    for (i = 0; i < LOOPCOUNT; i++) {
+      while (!omp_test_lock (&lck)) // spin until the non-blocking acquire succeeds
+      {};
+      #pragma omp flush
+      nr_threads_in_single++;
+      #pragma omp flush
+      nr_iterations++;
+      nr_threads_in_single--;
+      result = result + nr_threads_in_single; // adds 0 unless another thread got in
+      omp_unset_lock (&lck);
+    }
+  }
+  omp_destroy_lock(&lck);
+  return ((result == 0) && (nr_iterations == LOOPCOUNT));
+}
+
+int main()
+{
+  int failures = 0; // number of repetitions whose check did not pass
+  int rep;
+
+  // Repeat the test REPETITIONS times; the exit status is the failure count.
+  for (rep = 0; rep < REPETITIONS; ++rep) {
+    if (test_omp_test_lock() == 0)
+      ++failures;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/lock/omp_test_nest_lock.c b/final/runtime/test/lock/omp_test_nest_lock.c
new file mode 100644
index 0000000..2fa6fd2
--- /dev/null
+++ b/final/runtime/test/lock/omp_test_nest_lock.c
@@ -0,0 +1,47 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+static omp_nest_lock_t lck;
+
+int test_omp_test_nest_lock() // returns 1 on pass, 0 on failure
+{
+  int nr_threads_in_single = 0; // threads currently inside the locked region
+  int result = 0; // sum of nr_threads_in_single seen on exit; must stay 0
+  int nr_iterations = 0; // iterations executed under the lock
+  int i;
+
+  omp_init_nest_lock (&lck);
+  #pragma omp parallel shared(lck)
+  {
+    #pragma omp for
+    for (i = 0; i < LOOPCOUNT; i++)
+    {
+      /*omp_set_lock(&lck);*/
+      while(!omp_test_nest_lock (&lck)) // spin until the non-blocking acquire succeeds
+      {};
+      #pragma omp flush
+      nr_threads_in_single++;
+      #pragma omp flush
+      nr_iterations++;
+      nr_threads_in_single--;
+      result = result + nr_threads_in_single; // adds 0 unless another thread got in
+      omp_unset_nest_lock (&lck);
+    }
+  }
+  omp_destroy_nest_lock (&lck);
+  return ((result == 0) && (nr_iterations == LOOPCOUNT));
+}
+
+int main()
+{
+  int failures = 0; // number of repetitions whose check did not pass
+  int rep;
+
+  // Repeat the test REPETITIONS times; the exit status is the failure count.
+  for (rep = 0; rep < REPETITIONS; ++rep) {
+    if (test_omp_test_nest_lock() == 0)
+      ++failures;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/master/omp_master.c b/final/runtime/test/master/omp_master.c
new file mode 100644
index 0000000..1cc7f9e
--- /dev/null
+++ b/final/runtime/test/master/omp_master.c
@@ -0,0 +1,38 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int test_omp_master() // returns 1 on pass, 0 on failure
+{
+  int nthreads; // how many threads executed the master block
+  int executing_thread; // thread number recorded inside the master block
+
+  nthreads = 0;
+  executing_thread = -1; // -1 means the master block never ran
+
+  #pragma omp parallel
+  {
+    #pragma omp master
+    {
+      #pragma omp critical
+      {
+        nthreads++;
+      }
+      executing_thread = omp_get_thread_num();
+    } /* end of master*/
+  } /* end of parallel*/
+  return ((nthreads == 1) && (executing_thread == 0));
+}
+
+int main()
+{
+  int failures = 0; // number of repetitions whose check did not pass
+  int rep;
+
+  // Repeat the test REPETITIONS times; the exit status is the failure count.
+  for (rep = 0; rep < REPETITIONS; ++rep) {
+    if (test_omp_master() == 0)
+      ++failures;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/master/omp_master_3.c b/final/runtime/test/master/omp_master_3.c
new file mode 100644
index 0000000..2e9fdf8
--- /dev/null
+++ b/final/runtime/test/master/omp_master_3.c
@@ -0,0 +1,44 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int test_omp_master_3() // returns 1 on pass, 0 on failure
+{
+  int nthreads; // how many threads executed the master block
+  int executing_thread; // thread number recorded inside the master block
+  int tid_result = 0; /* counts up the number of wrong thread no. for
+               the master thread. (Must be 0) */
+  nthreads = 0;
+  executing_thread = -1; // -1 means the master block never ran
+
+  #pragma omp parallel
+  {
+    #pragma omp master
+    {
+      int tid = omp_get_thread_num();
+      if (tid != 0) { // a thread other than thread 0 entered the master block
+        #pragma omp critical
+        { tid_result++; }
+      }
+      #pragma omp critical
+      {
+        nthreads++;
+      }
+      executing_thread = omp_get_thread_num ();
+    } /* end of master*/
+  } /* end of parallel*/
+  return ((nthreads == 1) && (executing_thread == 0) && (tid_result == 0));
+}
+
+int main()
+{
+  int failures = 0; // number of repetitions whose check did not pass
+  int rep;
+
+  // Repeat the test REPETITIONS times; the exit status is the failure count.
+  for (rep = 0; rep < REPETITIONS; ++rep) {
+    if (test_omp_master_3() == 0)
+      ++failures;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/misc_bugs/cancellation_for_sections.c b/final/runtime/test/misc_bugs/cancellation_for_sections.c
new file mode 100644
index 0000000..7cdaa1f
--- /dev/null
+++ b/final/runtime/test/misc_bugs/cancellation_for_sections.c
@@ -0,0 +1,63 @@
+// RUN: %libomp-compile && env OMP_CANCELLATION=true %libomp-run
+// Clang had a bug until version 4.0.1 which resulted in a hang.
+// UNSUPPORTED: clang-3, clang-4.0.0
+
+// Regression test for a bug in cancellation to cover effect of `#pragma omp cancel`
+// in a loop construct, on sections construct.
+// Pass condition: Cancellation status from `for` does not persist
+// to `sections`.
+
+#include <stdio.h>
+#include <omp.h>
+
+int result[2] = {0, 0};
+
+void cq416850_for_sections() {
+
+    unsigned i;
+     // 1) loop: cancelled right after the first store to result[0]
+    #pragma omp for
+    for (i = 0; i < 1; i++) {
+        result[0] = 1;
+        #pragma omp cancel for
+        result[0] = 2; // main() expects this store to be skipped
+    }
+
+//        printf("thread %d: result[0] = %d, result[1] = %d \n",  omp_get_thread_num(), result[0], result[1]);
+
+
+    // 2) sections: no cancel is requested here, so the point must fall through
+    #pragma omp sections
+    {
+        #pragma omp section
+        {
+            result[1] = 1;
+            #pragma omp cancellation point sections
+            result[1] = 2; // main() expects this store to happen
+        }
+    }
+}
+
+int main(void) {
+    // The test is meaningless unless cancellation is switched on.
+    if (omp_get_cancellation() == 0) {
+        printf("Cancellation not enabled!\n");
+        return 2;
+    }
+
+    #pragma omp parallel num_threads(4)
+    cq416850_for_sections();
+
+    // Pass: the loop was cancelled (1) and the sections ran to the end (2).
+    if (result[0] == 1 && result[1] == 2) {
+        printf("PASSED\n");
+        return 0;
+    }
+
+    printf("Incorrect values. "
+           "result[0] = %d (expected 1), "
+           "result[1] = %d (expected 2).\n",
+           result[0], result[1]);
+    printf("FAILED\n");
+    return 1;
+}
diff --git a/final/runtime/test/misc_bugs/many-microtask-args.c b/final/runtime/test/misc_bugs/many-microtask-args.c
new file mode 100644
index 0000000..d644515
--- /dev/null
+++ b/final/runtime/test/misc_bugs/many-microtask-args.c
@@ -0,0 +1,39 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+
+int main()
+{
+
+  int i;
+  int i1 = 0;
+  int i2 = 1;
+  int i3 = 2;
+  int i4 = 3;
+  int i5 = 4;
+  int i6 = 6; // note: 5 is skipped, so i1..i16 add up to 131
+  int i7 = 7;
+  int i8 = 8;
+  int i9 = 9;
+  int i10 = 10;
+  int i11 = 11;
+  int i12 = 12;
+  int i13 = 13;
+  int i14 = 14;
+  int i15 = 15;
+  int i16 = 16;
+ 
+  int r = 0; 
+  #pragma omp parallel for firstprivate(i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, i16) reduction(+:r)
+  for (i = 0; i < i16; i++) {
+    r += i + i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9 + i10 + i11 + i12 + i13 + i14 + i15 + i16;
+  }
+
+  int rf = 2216; // 16 iterations * 131 + (0 + 1 + ... + 15) = 2096 + 120
+  if (r != rf) {
+    fprintf(stderr, "r should be %d but instead equals %d\n", rf, r);
+    return 1;
+  }
+
+  return 0;
+}
+
diff --git a/final/runtime/test/misc_bugs/omp_foreign_thread_team_reuse.c b/final/runtime/test/misc_bugs/omp_foreign_thread_team_reuse.c
new file mode 100644
index 0000000..a8400e4
--- /dev/null
+++ b/final/runtime/test/misc_bugs/omp_foreign_thread_team_reuse.c
@@ -0,0 +1,81 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+#define NUM_THREADS 10
+
+/*
+ After hot teams were enabled by default, the library started using levels
+ kept in the team structure.  The levels are broken in case foreign thread
+ exits and puts its team into the pool which is then re-used by another foreign
+ thread. The broken behavior observed is when printing the levels for each
+ new team, one gets 1, 2, 1, 2, 1, 2, etc.  This makes the library believe that
+ every other team is nested which is incorrect.  What is wanted is for the
+ levels to be 1, 1, 1, etc.
+*/
+
+int a = 0;
+int level;
+
+typedef struct thread_arg_t { // argument handed to thread_function()
+  int iterations; // trip count of the thread's omp for loop
+} thread_arg_t;
+
+// Foreign-thread body: record omp_get_level() and add 'iterations' to 'a'.
+void* thread_function(void* arg) {
+  int i;
+  int iterations = ((thread_arg_t*)arg)->iterations;
+  #pragma omp parallel private(i)
+  {
+    // level should always be 1
+    #pragma omp single
+    level = omp_get_level();
+    #pragma omp for
+    for(i = 0; i < iterations; i++) {
+      #pragma omp atomic
+      a++;
+    }
+  }
+  return NULL; // callers ignore it, but a void* function must return a value
+}
+
+int test_omp_team_reuse()
+{
+  pthread_t workers[NUM_THREADS];
+  thread_arg_t args[NUM_THREADS];
+  // thread k contributes k + 1 increments, so 'a' must end at 1 + 2 + ... + N
+  const int expected_sum = (NUM_THREADS * (NUM_THREADS+1)) / 2;
+  int k, passed = 1;
+
+  // Run the foreign threads strictly one after another: each is joined
+  // before the next one is created.
+  for (k = 0; k < NUM_THREADS; ++k) {
+    args[k].iterations = k + 1;
+    pthread_create(&workers[k], NULL, thread_function, &args[k]);
+    pthread_join(workers[k], NULL);
+    if (level != 1) { // nesting level seen inside the thread's parallel region
+      fprintf(stderr, "error: for pthread %d level should be 1 but "
+                      "instead equals %d\n", k, level);
+      passed = 0;
+    }
+  }
+  if (a != expected_sum) {
+    fprintf(stderr, "a should be %d but instead equals %d\n", expected_sum, a);
+    passed = 0;
+  }
+  return passed;
+}
+
+int main()
+{
+  int failures = 0;
+  int rep;
+
+  for (rep = 0; rep < REPETITIONS; ++rep) {
+    a = 0; // the shared counter must start from zero in every repetition
+    if (test_omp_team_reuse() == 0)
+      ++failures;
+  }
+
+  return failures;
+}
diff --git a/final/runtime/test/misc_bugs/stack-propagate.c b/final/runtime/test/misc_bugs/stack-propagate.c
new file mode 100644
index 0000000..ac289b5
--- /dev/null
+++ b/final/runtime/test/misc_bugs/stack-propagate.c
@@ -0,0 +1,65 @@
+// RUN: %libomp-compile-and-run
+
+// https://bugs.llvm.org/show_bug.cgi?id=26540 requested
+// stack size to be propagated from master to workers.
+// Library implements propagation of not too big stack
+// for Linux x86_64 platform (skipped Windows for now).
+//
+// The test checks that workers can use more than 4MB
+// of stack (4MB - was historical default for
+// stack size of worker thread in runtime library).
+
+#include <stdio.h>
+#include <omp.h>
+#if !defined(_WIN32)
+#include <sys/resource.h> // getrlimit
+#endif
+
+#define STK 4800000
+
+double foo(int n, int th) // th is not used by the body
+{
+  double arr[n]; // variable-length array: n doubles on the calling thread's stack
+  int i;
+  double res = 0.0;
+  for (i = 0; i < n; ++i) { // fill the whole array so the stack is really touched
+    arr[i] = (double)i / (n + 2);
+  }
+  for (i = 0; i < n; ++i) { // average the stored values
+    res += arr[i] / n;
+  }
+  return res;
+}
+
+int main(int argc, char *argv[])
+{
+#if defined(_WIN32)
+  // don't test Windows
+  printf("stack propagation not implemented, skipping test...\n");
+  return 0;
+#else
+  int status;
+  double val = 0.0;
+  int m = STK / 8; // 600000 doubles: 4800000 bytes of array with 8-byte double
+  // read the current (soft) stack size limit of the process
+  struct rlimit rlim;
+  status = getrlimit(RLIMIT_STACK, &rlim);
+  if (sizeof(void *) > 4 &&                 // do not test 32-bit systems,
+      status == 0 && rlim.rlim_cur > STK) { // or small initial stack size
+#pragma omp parallel reduction(+:val)
+    {
+      val += foo(m, omp_get_thread_num()); // every thread needs the big frame
+    }
+  } else {
+    printf("too small stack size limit (needs about 8MB), skipping test...\n");
+    return 0;
+  }
+  if (val > 0.1) { // each foo() call returns a positive average, so val > 0.1
+    printf("passed\n");
+    return 0;
+  } else {
+    printf("failed, val = %f\n", val);
+    return 1;
+  }
+#endif // _WIN32
+}
diff --git a/final/runtime/test/misc_bugs/teams-no-par.c b/final/runtime/test/misc_bugs/teams-no-par.c
new file mode 100644
index 0000000..0ef8d9a
--- /dev/null
+++ b/final/runtime/test/misc_bugs/teams-no-par.c
@@ -0,0 +1,64 @@
+// RUN: %libomp-compile-and-run
+//
+// The test checks the teams construct pseudocode executed on host
+//
+
+#include <stdio.h>
+#include <omp.h>
+
+#ifndef N_TEAMS
+#define N_TEAMS 4
+#endif
+#ifndef N_THR
+#define N_THR 3
+#endif
+
+static int err = 0;
+
+// Internal library stuff to emulate compiler's code generation:
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct { // local stand-in for the runtime's source-location record
+  int reserved_1;
+  int flags;
+  int reserved_2;
+  int reserved_3;
+  char *psource; // semicolon-separated location string
+} ident_t;
+
+static ident_t dummy_loc = {0, 2, 0, 0, ";dummyFile;dummyFunc;0;0;;"}; // flags = 2: meaning not shown here - confirm in kmp.h
+
+int __kmpc_global_thread_num(void*);
+void __kmpc_push_num_teams(ident_t const*, int, int, int);
+void __kmpc_fork_teams(ident_t const*, int argc, void *microtask, ...);
+
+#ifdef __cplusplus
+}
+#endif
+
+// Outlined entry point: nt is the single shared argument passed by main()
+void foo(int *gtid, int *tid, int *nt)
+{ // start "serial" execution by master threads of each team
+  if ( nt ) {
+    printf(" team %d, param %d\n", omp_get_team_num(), *nt);
+  } else {
+    // a null nt means the argument was lost; %p needs a void * argument
+    printf("ERROR: teams before parallel: gtid, tid: %d %d, bad pointer: %p\n", *gtid, *tid, (void *)nt);
+    err++;
+  }
+}
+
+int main()
+{
+  int nt = 4; // value every team prints through foo()
+  int th = __kmpc_global_thread_num(NULL); // registers initial thread
+  __kmpc_push_num_teams(&dummy_loc, th, N_TEAMS, N_THR); // N_TEAMS and N_THR are the requested sizes
+  __kmpc_fork_teams(&dummy_loc, 1, &foo, &nt); // pass 1 shared parameter "nt"
+  if (err)
+    printf("failed with %d errors\n",err);
+  else
+    printf("passed\n");
+  return err; // exit status is the number of errors counted by foo()
+}
diff --git a/final/runtime/test/misc_bugs/teams-reduction.c b/final/runtime/test/misc_bugs/teams-reduction.c
new file mode 100644
index 0000000..6d7cd11
--- /dev/null
+++ b/final/runtime/test/misc_bugs/teams-reduction.c
@@ -0,0 +1,68 @@
+// RUN: %libomp-compile-and-run
+//
+// The test checks the teams construct with reduction executed on the host.
+//
+
+#include <stdio.h>
+#include <omp.h>
+
+#include <stdint.h>
+
+#ifndef N_TEAMS
+#define N_TEAMS 4
+#endif
+#ifndef N_THR
+#define N_THR 3
+#endif
+
+// Internal library stuff to emulate compiler's code generation:
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct { // local stand-in for the runtime's source-location record
+  int32_t reserved_1;
+  int32_t flags;
+  int32_t reserved_2;
+  int32_t reserved_3;
+  char const *psource; // semicolon-separated location string
+} ident_t;
+
+static ident_t dummy_loc = {0, 2, 0, 0, ";dummyFile;dummyFunc;0;0;;"}; // flags = 2: meaning not shown here - confirm in kmp.h
+
+typedef union {
+  // The global will be used as pointer, so we need to make sure that the
+  // compiler correctly aligns the global...
+  void *ptr;
+  int32_t data[8];
+} kmp_critical_name;
+kmp_critical_name crit; // lock argument passed to __kmpc_reduce / __kmpc_end_reduce
+
+int32_t __kmpc_global_thread_num(ident_t *);
+void __kmpc_push_num_teams(ident_t *, int32_t global_tid, int32_t num_teams,
+                           int32_t num_threads);
+void __kmpc_fork_teams(ident_t *, int32_t argc, void *microtask, ...);
+int32_t __kmpc_reduce(ident_t *, int32_t global_tid, int32_t num_vars,
+                      size_t reduce_size, void *reduce_data, void *reduce_func,
+                      kmp_critical_name *lck);
+void __kmpc_end_reduce(ident_t *, int32_t global_tid, kmp_critical_name *lck);
+
+#ifdef __cplusplus
+}
+#endif
+
+// Outlined entry point: an empty reduction (0 variables, no data, no function)
+void outlined(int32_t *gtid, int32_t *tid) {
+  int32_t ret = __kmpc_reduce(&dummy_loc, *gtid, 0, 0, NULL, NULL, &crit); // ret is not inspected
+  __kmpc_end_reduce(&dummy_loc, *gtid, &crit);
+}
+
+int main() {
+  int32_t th = __kmpc_global_thread_num(NULL); // registers initial thread
+  __kmpc_push_num_teams(&dummy_loc, th, N_TEAMS, N_THR); // num_teams, num_threads
+  __kmpc_fork_teams(&dummy_loc, 0, &outlined); // argc 0: no shared arguments
+
+  // Test did not hang -> passed!
+  printf("passed\n");
+  return 0;
+}
diff --git a/final/runtime/test/omp_my_sleep.h b/final/runtime/test/omp_my_sleep.h
new file mode 100644
index 0000000..138d930
--- /dev/null
+++ b/final/runtime/test/omp_my_sleep.h
@@ -0,0 +1,33 @@
+#ifndef MY_SLEEP_H
+#define MY_SLEEP_H
+
+/*! Utility function to have a sleep function with better resolution and
+ *  which only stops one thread. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <time.h>
+
+#if defined(_WIN32)
+# include <windows.h>
+// Windows flavour of my_sleep(): sleeptime is given in seconds, Sleep() wants
+// milliseconds, so the value is scaled by 1000 and truncated to a DWORD.
+static void my_sleep(double sleeptime) {
+  Sleep((DWORD)(sleeptime * 1000.0));
+}
+
+
+#else // _WIN32
+
+// Unices version of my_sleep(): sleeps sleeptime seconds, resuming after EINTR
+static void my_sleep(double sleeptime) {
+  struct timespec ts;
+  ts.tv_sec = (time_t)sleeptime;
+  ts.tv_nsec = (long)((sleeptime - (double)ts.tv_sec) * 1E9);
+  while (nanosleep(&ts, &ts) == -1 && errno == EINTR) {} // ts holds the rest
+}
+
+#endif // _WIN32
+
+#endif // MY_SLEEP_H
diff --git a/final/runtime/test/omp_testsuite.h b/final/runtime/test/omp_testsuite.h
new file mode 100644
index 0000000..eef5470
--- /dev/null
+++ b/final/runtime/test/omp_testsuite.h
@@ -0,0 +1,79 @@
+/* Global headerfile of the OpenMP Testsuite */
+
+#ifndef OMP_TESTSUITE_H
+#define OMP_TESTSUITE_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+/* General                                                */
+/**********************************************************/
+#define LOOPCOUNT 1000 /* Number of iterations to split amongst threads */
+#define REPETITIONS 10 /* Number of times to run each test */
+
+/* following times are in seconds */
+#define SLEEPTIME 1
+
+/* Definitions for tasks                                  */
+/**********************************************************/
+#define NUM_TASKS 25
+#define MAX_TASKS_PER_THREAD 5
+
+#ifdef  _WIN32
+// Windows versions of pthread_create() and pthread_join()
+# include <windows.h>
+typedef HANDLE pthread_t;
+
+// encapsulates the information about a pthread-callable function
+struct thread_func_info_t {
+  void* (*start_routine)(void*); // pthread-style entry point to invoke
+  void* arg;                     // argument handed to start_routine
+};
+
+// CreateThread() entry point (hence WINAPI): calls the stored pthread-style
+// routine, discards its result, and frees the descriptor it was handed.
+static DWORD WINAPI __thread_func_wrapper(LPVOID lpParameter) {
+  struct thread_func_info_t* info = (struct thread_func_info_t*)lpParameter;
+  info->start_routine(info->arg);
+  free(info);
+  return 0;
+}
+
+// Minimal pthread_create() on top of CreateThread(). attr is ignored; always
+// returns 0 because every failure prints a message and exits the process.
+static int pthread_create(pthread_t *thread, void *attr,
+                          void *(*start_routine) (void *), void *arg) {
+  struct thread_func_info_t* info =
+      (struct thread_func_info_t*)malloc(sizeof(struct thread_func_info_t));
+  if (info == NULL) { fprintf(stderr, "malloc() failed.\n"); exit(1); }
+  info->start_routine = start_routine;
+  info->arg = arg; // info is freed by __thread_func_wrapper after the call
+  *thread = CreateThread(NULL, 0, __thread_func_wrapper, info, 0, NULL);
+  if (*thread == NULL) {
+    fprintf(stderr, "CreateThread() failed: Error #%u.\n", GetLastError());
+    exit(1);
+  }
+  return 0;
+}
+// Minimal pthread_join() for a handle made by the pthread_create() above:
+// waits for the thread, then closes its handle. retval is ignored for now.
+// Always returns 0; a failing Win32 call prints its error code and exits.
+static int pthread_join(pthread_t thread, void **retval) {
+  if (WaitForSingleObject(thread, INFINITE) == WAIT_FAILED) {
+    fprintf(stderr, "WaitForSingleObject() failed: Error #%u.\n",
+            GetLastError());
+    exit(1);
+  }
+  // The wait is over, so the handle is no longer needed.
+  if (CloseHandle(thread) == 0) {
+    fprintf(stderr, "CloseHandle() failed: Error #%u.\n", GetLastError());
+    exit(1);
+  }
+  return 0;
+}
+#else
+# include <pthread.h>
+#endif
+
+#endif
diff --git a/final/runtime/test/ompt/callback.h b/final/runtime/test/ompt/callback.h
new file mode 100755
index 0000000..dfb8a17
--- /dev/null
+++ b/final/runtime/test/ompt/callback.h
@@ -0,0 +1,794 @@
+#ifndef _BSD_SOURCE
+#define _BSD_SOURCE
+#endif
+#define _DEFAULT_SOURCE
+#include <stdio.h>
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+#include <inttypes.h>
+#include <omp.h>
+#include <omp-tools.h>
+#include "ompt-signal.h"
+
+// Used to detect architecture
+#include "../../src/kmp_platform.h"
+
+static const char* ompt_thread_t_values[] = { // NOTE(review): presumably indexed by ompt_thread_t value - confirm
+  NULL,
+  "ompt_thread_initial",
+  "ompt_thread_worker",
+  "ompt_thread_other"
+};
+// Indexed with prior_task_status by on_ompt_callback_task_schedule.
+static const char* ompt_task_status_t_values[] = {
+  NULL,
+  "ompt_task_complete",       // 1
+  "ompt_task_yield",          // 2
+  "ompt_task_cancel",         // 3
+  "ompt_task_detach",         // 4
+  "ompt_task_early_fulfill",  // 5
+  "ompt_task_late_fulfill",   // 6
+  "ompt_task_switch"          // 7
+};
+static const char* ompt_cancel_flag_t_values[] = { // slots picked by on_ompt_callback_cancel
+  "ompt_cancel_parallel",       // [0] cancelled construct
+  "ompt_cancel_sections",       // [1]
+  "ompt_cancel_loop",           // [2]
+  "ompt_cancel_taskgroup",      // [3]
+  "ompt_cancel_activated",      // [4] cancellation state
+  "ompt_cancel_detected",       // [5]
+  "ompt_cancel_discarded_task"  // [6]
+};
+
+// Writes the names of the task-type bits set in `type` into `buffer`, in table
+// order; the only separator is the '|' that is part of the modifier names.
+static void format_task_type(int type, char *buffer) {
+  static const struct { unsigned int bit; const char *name; } known[] = {
+    {ompt_task_initial, "ompt_task_initial"},
+    {ompt_task_implicit, "ompt_task_implicit"},
+    {ompt_task_explicit, "ompt_task_explicit"},
+    {ompt_task_target, "ompt_task_target"},
+    {ompt_task_undeferred, "|ompt_task_undeferred"},
+    {ompt_task_untied, "|ompt_task_untied"},
+    {ompt_task_final, "|ompt_task_final"},
+    {ompt_task_mergeable, "|ompt_task_mergeable"},
+    {ompt_task_merged, "|ompt_task_merged"}
+  };
+  char *progress = buffer;
+  size_t i;
+  *progress = '\0'; // callers print buffer even if no bit below matches
+  for (i = 0; i < sizeof(known) / sizeof(known[0]); ++i)
+    if (type & known[i].bit)
+      progress += sprintf(progress, "%s", known[i].name);
+}
+
+static ompt_set_callback_t ompt_set_callback; // NOTE(review): these pointers are presumably assigned during tool initialization, outside this chunk - confirm
+static ompt_get_callback_t ompt_get_callback;
+static ompt_get_state_t ompt_get_state;
+static ompt_get_task_info_t ompt_get_task_info;
+static ompt_get_task_memory_t ompt_get_task_memory;
+static ompt_get_thread_data_t ompt_get_thread_data;
+static ompt_get_parallel_info_t ompt_get_parallel_info;
+static ompt_get_unique_id_t ompt_get_unique_id;
+static ompt_finalize_tool_t ompt_finalize_tool;
+static ompt_get_num_procs_t ompt_get_num_procs;
+static ompt_get_num_places_t ompt_get_num_places;
+static ompt_get_place_proc_ids_t ompt_get_place_proc_ids;
+static ompt_get_place_num_t ompt_get_place_num;
+static ompt_get_partition_place_nums_t ompt_get_partition_place_nums;
+static ompt_get_proc_id_t ompt_get_proc_id;
+static ompt_enumerate_states_t ompt_enumerate_states;
+static ompt_enumerate_mutex_impls_t ompt_enumerate_mutex_impls;
+
+// Prints parallel id, task id, frame pointers and task type for the task that
+// ompt_get_task_info() reports at `level`; prints nothing without a frame.
+static void print_ids(int level) {
+  int task_type = 0, thread_num = 0; // defaults in case the query writes nothing
+  ompt_frame_t *frame = NULL;
+  ompt_data_t *task_parallel_data = NULL, *task_data = NULL;
+  int exists_task = ompt_get_task_info(level, &task_type, &task_data, &frame,
+                                       &task_parallel_data, &thread_num);
+  char buffer[2048] = ""; // a valid string whatever format_task_type() writes
+  format_task_type(task_type, buffer);
+  if (frame)
+    printf("%" PRIu64 ": task level %d: parallel_id=%" PRIu64
+           ", task_id=%" PRIu64 ", exit_frame=%p, reenter_frame=%p, "
+           "task_type=%s=%d, thread_num=%d\n",
+           ompt_get_thread_data()->value, level,
+           exists_task ? task_parallel_data->value : 0,
+           exists_task ? task_data->value : 0, frame->exit_frame.ptr,
+           frame->enter_frame.ptr, buffer, task_type, thread_num);
+}
+
+#define get_frame_address(level) __builtin_frame_address(level)
+
+#define print_frame(level)                                                     \
+  printf("%" PRIu64 ": __builtin_frame_address(%d)=%p\n",                      \
+         ompt_get_thread_data()->value, level, get_frame_address(level))
+
+// clang (version 5.0 and above) adds an intermediate function call with debug flag (-g)
+#if defined(TEST_NEED_PRINT_FRAME_FROM_OUTLINED_FN)
+  #if defined(DEBUG) && defined(__clang__) && __clang_major__ >= 5
+    #define print_frame_from_outlined_fn(level) print_frame(level+1)
+  #else
+    #define print_frame_from_outlined_fn(level) print_frame(level)
+  #endif
+
+  #if defined(__clang__) && __clang_major__ >= 5
+    #warning "Clang 5.0 and later add an additional wrapper for outlined functions when compiling with debug information."
+    #warning "Please define -DDEBUG iff you manually pass in -g to make the tests succeed!"
+  #endif
+#endif
+
+// This macro helps to define a label at the current position that can be used
+// to get the current address in the code.
+//
+// For print_current_address():
+//   To reliably determine the offset between the address of the label and the
+//   actual return address, we insert a NOP instruction as a jump target as the
+//   compiler would otherwise insert an instruction that we can't control. The
+//   instruction length is target dependent and is explained below.
+//
+// (The empty block between "#pragma omp ..." and the __asm__ statement is a
+// workaround for a bug in the Intel Compiler.)
+#define define_ompt_label(id) \
+  {} \
+  __asm__("nop"); \
+ompt_label_##id:
+
+// This macro helps to get the address of a label that is inserted by the above
+// macro define_ompt_label(). The address is obtained with a GNU extension
+// (&&label) that has been tested with gcc, clang and icc.
+#define get_ompt_label_address(id) (&& ompt_label_##id)
+
+// This macro prints the exact address that a previously called runtime function
+// returns to.
+#define print_current_address(id) \
+  define_ompt_label(id) \
+  print_possible_return_addresses(get_ompt_label_address(id))
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+// On X86 the NOP instruction is 1 byte long. In addition, the compiler inserts
+// a MOV instruction for non-void runtime functions which is 3 bytes long.
+#define print_possible_return_addresses(addr) \
+  printf("%" PRIu64 ": current_address=%p or %p for non-void functions\n", \
+         ompt_get_thread_data()->value, ((char *)addr) - 1, ((char *)addr) - 4)
+#elif KMP_ARCH_PPC64
+// On Power the NOP instruction is 4 bytes long. In addition, the compiler
+// inserts a second NOP instruction (another 4 bytes). For non-void runtime
+// functions Clang inserts a STW instruction (but only if compiling under
+// -fno-PIC which will be the default with Clang 8.0, another 4 bytes).
+#define print_possible_return_addresses(addr) \
+  printf("%" PRIu64 ": current_address=%p or %p\n", ompt_get_thread_data()->value, \
+         ((char *)addr) - 8, ((char *)addr) - 12)
+#elif KMP_ARCH_AARCH64
+// On AArch64 the NOP instruction is 4 bytes long, can be followed by inserted
+// store instruction (another 4 bytes long).
+#define print_possible_return_addresses(addr) \
+  printf("%" PRIu64 ": current_address=%p or %p\n", ompt_get_thread_data()->value, \
+         ((char *)addr) - 4, ((char *)addr) - 8)
+#else
+#error Unsupported target architecture, cannot determine address offset!
+#endif
+
+
+// This macro performs a somewhat similar job to print_current_address(), except
+// that it discards a certain number of nibbles from the address and only prints
+// the most significant bits / nibbles. This can be used for cases where the
+// return address can only be approximated.
+//
+// To account for overflows (ie the most significant bits / nibbles have just
+// changed as we are a few bytes above the relevant power of two) the addresses
+// of the "current" and of the "previous block" are printed.
+#define print_fuzzy_address(id) \
+  define_ompt_label(id) \
+  print_fuzzy_address_blocks(get_ompt_label_address(id))
+
+// If you change this define you need to adapt all capture patterns in the tests
+// to include or discard the new number of nibbles!
+#define FUZZY_ADDRESS_DISCARD_NIBBLES 2
+#define FUZZY_ADDRESS_DISCARD_BYTES (1 << ((FUZZY_ADDRESS_DISCARD_NIBBLES) * 4))
+#define print_fuzzy_address_blocks(addr)                                       \
+  printf("%" PRIu64 ": fuzzy_address=0x%" PRIx64 " or 0x%" PRIx64              \
+         " or 0x%" PRIx64 " or 0x%" PRIx64 " (%p)\n",                          \
+         ompt_get_thread_data()->value,                                        \
+         ((uint64_t)addr) / FUZZY_ADDRESS_DISCARD_BYTES - 1,                   \
+         ((uint64_t)addr) / FUZZY_ADDRESS_DISCARD_BYTES,                       \
+         ((uint64_t)addr) / FUZZY_ADDRESS_DISCARD_BYTES + 1,                   \
+         ((uint64_t)addr) / FUZZY_ADDRESS_DISCARD_BYTES + 2, addr)
+
+#define register_callback_t(name, type)                                        \
+  do {                                                                         \
+    type f_##name = &on_##name;                                                \
+    if (ompt_set_callback(name, (ompt_callback_t)f_##name) == ompt_set_never)  \
+      printf("0: Could not register callback '" #name "'\n");                  \
+  } while (0)
+
+#define register_callback(name) register_callback_t(name, name##_t)
+
+#ifndef USE_PRIVATE_TOOL
+static void
+on_ompt_callback_mutex_acquire(
+  ompt_mutex_t kind,
+  unsigned int hint,
+  unsigned int impl,
+  ompt_wait_id_t wait_id,
+  const void *codeptr_ra)
+{ // Prints one ompt_event_wait_* line (wait_id, hint, impl, codeptr_ra) per mutex kind.
+  switch(kind)
+  {
+    case ompt_mutex_lock:
+      printf("%" PRIu64 ": ompt_event_wait_lock: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra);
+      break;
+    case ompt_mutex_nest_lock:
+      printf("%" PRIu64 ": ompt_event_wait_nest_lock: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra);
+      break;
+    case ompt_mutex_critical:
+      printf("%" PRIu64 ": ompt_event_wait_critical: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra);
+      break;
+    case ompt_mutex_atomic:
+      printf("%" PRIu64 ": ompt_event_wait_atomic: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra);
+      break;
+    case ompt_mutex_ordered:
+      printf("%" PRIu64 ": ompt_event_wait_ordered: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra);
+      break;
+    default: // any other mutex kind prints nothing
+      break;
+  }
+}
+
+static void
+on_ompt_callback_mutex_acquired(
+  ompt_mutex_t kind,
+  ompt_wait_id_t wait_id,
+  const void *codeptr_ra)
+{ // Prints one ompt_event_acquired_* line (wait_id, codeptr_ra) per mutex kind.
+  switch(kind)
+  {
+    case ompt_mutex_lock:
+      printf("%" PRIu64 ": ompt_event_acquired_lock: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    case ompt_mutex_nest_lock: // reported with the "_first" suffix
+      printf("%" PRIu64 ": ompt_event_acquired_nest_lock_first: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    case ompt_mutex_critical:
+      printf("%" PRIu64 ": ompt_event_acquired_critical: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    case ompt_mutex_atomic:
+      printf("%" PRIu64 ": ompt_event_acquired_atomic: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    case ompt_mutex_ordered:
+      printf("%" PRIu64 ": ompt_event_acquired_ordered: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    default: // any other mutex kind prints nothing
+      break;
+  }
+}
+
+static void
+on_ompt_callback_mutex_released(
+  ompt_mutex_t kind,
+  ompt_wait_id_t wait_id,
+  const void *codeptr_ra)
+{ // Prints one ompt_event_release_* line (wait_id, codeptr_ra) per mutex kind.
+  switch(kind)
+  {
+    case ompt_mutex_lock:
+      printf("%" PRIu64 ": ompt_event_release_lock: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    case ompt_mutex_nest_lock: // reported with the "_last" suffix
+      printf("%" PRIu64 ": ompt_event_release_nest_lock_last: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    case ompt_mutex_critical:
+      printf("%" PRIu64 ": ompt_event_release_critical: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    case ompt_mutex_atomic:
+      printf("%" PRIu64 ": ompt_event_release_atomic: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    case ompt_mutex_ordered:
+      printf("%" PRIu64 ": ompt_event_release_ordered: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    default: // any other mutex kind prints nothing
+      break;
+  }
+}
+
+static void
+on_ompt_callback_nest_lock(
+    ompt_scope_endpoint_t endpoint,
+    ompt_wait_id_t wait_id,
+    const void *codeptr_ra)
+{ // Prints "acquired_nest_lock_next" on scope begin and "release_nest_lock_prev" on scope end.
+  switch(endpoint)
+  {
+    case ompt_scope_begin:
+      printf("%" PRIu64 ": ompt_event_acquired_nest_lock_next: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    case ompt_scope_end:
+      printf("%" PRIu64 ": ompt_event_release_nest_lock_prev: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+  }
+}
+
+static void
+on_ompt_callback_sync_region(
+  ompt_sync_region_t kind,
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  const void *codeptr_ra)
+{ // Prints a *_begin / *_end line for barrier, taskwait and taskgroup regions.
+  switch(endpoint)
+  {
+    case ompt_scope_begin: // parallel_data is dereferenced unconditionally here
+      switch(kind)
+      {
+        case ompt_sync_region_barrier:
+        case ompt_sync_region_barrier_implicit:
+        case ompt_sync_region_barrier_explicit:
+        case ompt_sync_region_barrier_implementation:
+          printf("%" PRIu64 ": ompt_event_barrier_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
+          print_ids(0); // barrier begin additionally dumps the level-0 task ids
+          break;
+        case ompt_sync_region_taskwait:
+          printf("%" PRIu64 ": ompt_event_taskwait_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
+          break;
+        case ompt_sync_region_taskgroup:
+          printf("%" PRIu64 ": ompt_event_taskgroup_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
+          break;
+        case ompt_sync_region_reduction: // reduction regions print nothing
+          break;
+      }
+      break;
+    case ompt_scope_end: // a NULL parallel_data is printed as parallel_id=0
+      switch(kind)
+      {
+        case ompt_sync_region_barrier:
+        case ompt_sync_region_barrier_implicit:
+        case ompt_sync_region_barrier_explicit:
+        case ompt_sync_region_barrier_implementation:
+          printf("%" PRIu64 ": ompt_event_barrier_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra);
+          break;
+        case ompt_sync_region_taskwait:
+          printf("%" PRIu64 ": ompt_event_taskwait_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra);
+          break;
+        case ompt_sync_region_taskgroup:
+          printf("%" PRIu64 ": ompt_event_taskgroup_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra);
+          break;
+        case ompt_sync_region_reduction: // reduction regions print nothing
+          break;
+      }
+      break;
+  }
+}
+
+static void
+on_ompt_callback_sync_region_wait(
+  ompt_sync_region_t kind,
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  const void *codeptr_ra)
+{ // Prints a wait_*_begin / wait_*_end line for barrier, taskwait and taskgroup regions.
+  switch(endpoint)
+  {
+    case ompt_scope_begin: // parallel_data is dereferenced unconditionally here
+      switch(kind)
+      {
+        case ompt_sync_region_barrier:
+        case ompt_sync_region_barrier_implicit:
+        case ompt_sync_region_barrier_explicit:
+        case ompt_sync_region_barrier_implementation:
+          printf("%" PRIu64 ": ompt_event_wait_barrier_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
+          break;
+        case ompt_sync_region_taskwait:
+          printf("%" PRIu64 ": ompt_event_wait_taskwait_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
+          break;
+        case ompt_sync_region_taskgroup:
+          printf("%" PRIu64 ": ompt_event_wait_taskgroup_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
+          break;
+        case ompt_sync_region_reduction: // reduction regions print nothing
+          break;
+      }
+      break;
+    case ompt_scope_end: // a NULL parallel_data is printed as parallel_id=0
+      switch(kind)
+      {
+        case ompt_sync_region_barrier:
+        case ompt_sync_region_barrier_implicit:
+        case ompt_sync_region_barrier_explicit:
+        case ompt_sync_region_barrier_implementation:
+          printf("%" PRIu64 ": ompt_event_wait_barrier_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra);
+          break;
+        case ompt_sync_region_taskwait:
+          printf("%" PRIu64 ": ompt_event_wait_taskwait_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra);
+          break;
+        case ompt_sync_region_taskgroup:
+          printf("%" PRIu64 ": ompt_event_wait_taskgroup_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra);
+          break;
+        case ompt_sync_region_reduction: // reduction regions print nothing
+          break;
+      }
+      break;
+  }
+}
+
+static void
+on_ompt_callback_flush(
+    ompt_data_t *thread_data,
+    const void *codeptr_ra)
+{ // Prints an ompt_event_flush line; the id comes from the thread_data argument, not ompt_get_thread_data().
+  printf("%" PRIu64 ": ompt_event_flush: codeptr_ra=%p\n", thread_data->value, codeptr_ra);
+}
+
+// Prints an ompt_event_cancel line. flags is decoded into the cancelled
+// construct and the cancellation state; the first matching bit of each wins.
+static void on_ompt_callback_cancel(
+    ompt_data_t *task_data,
+    int flags,
+    const void *codeptr_ra)
+{
+  const char* first_flag_value = "(unknown)"; // printed if no bit matches
+  const char* second_flag_value = "(unknown)";
+  if(flags & ompt_cancel_parallel)
+    first_flag_value = ompt_cancel_flag_t_values[0];
+  else if(flags & ompt_cancel_sections)
+    first_flag_value = ompt_cancel_flag_t_values[1];
+  else if(flags & ompt_cancel_loop)
+    first_flag_value = ompt_cancel_flag_t_values[2];
+  else if(flags & ompt_cancel_taskgroup)
+    first_flag_value = ompt_cancel_flag_t_values[3];
+
+  if(flags & ompt_cancel_activated)
+    second_flag_value = ompt_cancel_flag_t_values[4];
+  else if(flags & ompt_cancel_detected)
+    second_flag_value = ompt_cancel_flag_t_values[5];
+  else if(flags & ompt_cancel_discarded_task)
+    second_flag_value = ompt_cancel_flag_t_values[6];
+  printf("%" PRIu64 ": ompt_event_cancel: task_data=%" PRIu64 ", flags=%s|%s=%" PRIu32 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, task_data->value, first_flag_value, second_flag_value, flags,  codeptr_ra);
+}
+
+static void
+on_ompt_callback_implicit_task(
+    ompt_scope_endpoint_t endpoint,
+    ompt_data_t *parallel_data,
+    ompt_data_t *task_data,
+    unsigned int team_size,
+    unsigned int thread_num,
+    int flags)
+{ // Assigns ids on scope begin and prints initial/implicit task begin and end lines.
+  switch(endpoint)
+  {
+    case ompt_scope_begin:
+      if(task_data->ptr)
+        printf("%s\n", "0: task_data initially not null");
+      task_data->value = ompt_get_unique_id();
+
+      // There is no parallel_begin callback for the implicit parallel region,
+      // so its parallel_data is initialized here, in the initial task.
+      if(flags & ompt_task_initial)
+      {
+        char buffer[2048];
+        // NOTE(review): buffer is filled below but not used by the printf.
+        format_task_type(flags, buffer);
+        if(parallel_data->ptr)
+          printf("%s\n", "0: parallel_data initially not null");
+        parallel_data->value = ompt_get_unique_id();
+        printf("%" PRIu64 ": ompt_event_initial_task_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", actual_parallelism=%" PRIu32 ", index=%" PRIu32 ", flags=%" PRIu32 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, team_size, thread_num, flags);
+      } else {
+        printf("%" PRIu64 ": ompt_event_implicit_task_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", team_size=%" PRIu32 ", thread_num=%" PRIu32 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, team_size, thread_num);
+      }
+
+      break;
+    case ompt_scope_end: // a NULL parallel_data is printed as parallel_id=0
+      if(flags & ompt_task_initial){
+        printf("%" PRIu64 ": ompt_event_initial_task_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", team_size=%" PRIu32 ", thread_num=%" PRIu32 "\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, team_size, thread_num);
+      } else {
+        printf("%" PRIu64 ": ompt_event_implicit_task_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", team_size=%" PRIu32 ", thread_num=%" PRIu32 "\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, team_size, thread_num);
+      }
+      break;
+  }
+}
+
+static void
+on_ompt_callback_lock_init(
+  ompt_mutex_t kind,
+  unsigned int hint,
+  unsigned int impl,
+  ompt_wait_id_t wait_id,
+  const void *codeptr_ra)
+{ // Prints an ompt_event_init_lock / ompt_event_init_nest_lock line.
+  switch(kind)
+  {
+    case ompt_mutex_lock:
+      printf("%" PRIu64 ": ompt_event_init_lock: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra);
+      break;
+    case ompt_mutex_nest_lock:
+      printf("%" PRIu64 ": ompt_event_init_nest_lock: wait_id=%" PRIu64 ", hint=%" PRIu32 ", impl=%" PRIu32 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, hint, impl, codeptr_ra);
+      break;
+    default: // any other mutex kind prints nothing
+      break;
+  }
+}
+
+static void
+on_ompt_callback_lock_destroy(
+  ompt_mutex_t kind,
+  ompt_wait_id_t wait_id,
+  const void *codeptr_ra)
+{ // Prints an ompt_event_destroy_lock / ompt_event_destroy_nest_lock line.
+  switch(kind)
+  {
+    case ompt_mutex_lock:
+      printf("%" PRIu64 ": ompt_event_destroy_lock: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    case ompt_mutex_nest_lock:
+      printf("%" PRIu64 ": ompt_event_destroy_nest_lock: wait_id=%" PRIu64 ", codeptr_ra=%p \n", ompt_get_thread_data()->value, wait_id, codeptr_ra);
+      break;
+    default: // any other mutex kind prints nothing
+      break;
+  }
+}
+
+static void
+on_ompt_callback_work(
+  ompt_work_t wstype,
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  uint64_t count,
+  const void *codeptr_ra)
+{ // Prints a *_begin / *_end line for each worksharing type except workshare.
+  switch(endpoint)
+  {
+    case ompt_scope_begin:
+      switch(wstype)
+      {
+        case ompt_work_loop:
+          printf("%" PRIu64 ": ompt_event_loop_begin: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count);
+          break;
+        case ompt_work_sections:
+          printf("%" PRIu64 ": ompt_event_sections_begin: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count);
+          break;
+        case ompt_work_single_executor:
+          printf("%" PRIu64 ": ompt_event_single_in_block_begin: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count);
+          break;
+        case ompt_work_single_other:
+          printf("%" PRIu64 ": ompt_event_single_others_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count);
+          break;
+        case ompt_work_workshare:
+          // nothing is printed for workshare
+          break;
+        case ompt_work_distribute:
+          printf("%" PRIu64 ": ompt_event_distribute_begin: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count);
+          break;
+        case ompt_work_taskloop:
+          // taskloop is reported like the other worksharing types
+          printf("%" PRIu64 ": ompt_event_taskloop_begin: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count);
+          break;
+      }
+      break;
+    case ompt_scope_end:
+      switch(wstype)
+      {
+        case ompt_work_loop:
+          printf("%" PRIu64 ": ompt_event_loop_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count);
+          break;
+        case ompt_work_sections:
+          printf("%" PRIu64 ": ompt_event_sections_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count);
+          break;
+        case ompt_work_single_executor:
+          printf("%" PRIu64 ": ompt_event_single_in_block_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count);
+          break;
+        case ompt_work_single_other:
+          printf("%" PRIu64 ": ompt_event_single_others_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count);
+          break;
+        case ompt_work_workshare:
+          // nothing is printed for workshare
+          break;
+        case ompt_work_distribute:
+          printf("%" PRIu64 ": ompt_event_distribute_end: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count);
+          break;
+        case ompt_work_taskloop:
+          // taskloop is reported like the other worksharing types
+          printf("%" PRIu64 ": ompt_event_taskloop_end: parallel_id=%" PRIu64 ", parent_task_id=%" PRIu64 ", codeptr_ra=%p, count=%" PRIu64 "\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra, count);
+          break;
+      }
+      break;
+  }
+}
+
+static void
+on_ompt_callback_master(
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  const void *codeptr_ra)
+{ // Prints an ompt_event_master_begin or ompt_event_master_end line.
+  switch(endpoint)
+  {
+    case ompt_scope_begin:
+      printf("%" PRIu64 ": ompt_event_master_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
+      break;
+    case ompt_scope_end:
+      printf("%" PRIu64 ": ompt_event_master_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
+      break;
+  }
+}
+
+static void on_ompt_callback_parallel_begin(
+    ompt_data_t *encountering_task_data,
+    const ompt_frame_t *encountering_task_frame, ompt_data_t *parallel_data,
+    uint32_t requested_team_size, int flag, const void *codeptr_ra) { // tags the new region with a unique id and prints it
+  if(parallel_data->ptr) // the id slot is expected to be empty on entry
+    printf("0: parallel_data initially not null\n");
+  parallel_data->value = ompt_get_unique_id();
+  printf("%" PRIu64 ": ompt_event_parallel_begin: parent_task_id=%" PRIu64
+         ", parent_task_frame.exit=%p, parent_task_frame.reenter=%p, "
+         "parallel_id=%" PRIu64 ", requested_team_size=%" PRIu32
+         ", codeptr_ra=%p, invoker=%d\n",
+         ompt_get_thread_data()->value, encountering_task_data->value,
+         encountering_task_frame->exit_frame.ptr,
+         encountering_task_frame->enter_frame.ptr, parallel_data->value,
+         requested_team_size, codeptr_ra, flag);
+}
+
+// Traces the end of a parallel region: region id, encountering task id, flag and return address.
+static void
+on_ompt_callback_parallel_end(ompt_data_t *parallel_data, ompt_data_t *encountering_task_data, int flag, const void *codeptr_ra)
+{
+  const uint64_t thread_id = ompt_get_thread_data()->value;
+  const uint64_t region_id = parallel_data->value, task_id = encountering_task_data->value;
+  printf("%" PRIu64 ": ompt_event_parallel_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", invoker=%d, codeptr_ra=%p\n", thread_id, region_id, task_id, flag, codeptr_ra);
+}
+
+// Assigns a unique id to a newly created task and traces it together with its formatted type flags.
+static void on_ompt_callback_task_create(
+    ompt_data_t *encountering_task_data,
+    const ompt_frame_t *encountering_task_frame, ompt_data_t *new_task_data,
+    int type, int has_dependences, const void *codeptr_ra) {
+  char type_text[2048];
+
+  if (new_task_data->ptr != NULL)
+    printf("0: new_task_data initially not null\n");
+  new_task_data->value = ompt_get_unique_id();
+  format_task_type(type, type_text);
+
+  // Parent data and parent frame may be null pointers; 0 / NULL are printed in their place.
+  uint64_t parent_id = encountering_task_data ? encountering_task_data->value : 0;
+  void *parent_exit = encountering_task_frame ? encountering_task_frame->exit_frame.ptr : NULL;
+  void *parent_reenter = encountering_task_frame ? encountering_task_frame->enter_frame.ptr : NULL;
+  printf("%" PRIu64 ": ompt_event_task_create: parent_task_id=%" PRIu64 ", parent_task_frame.exit=%p, parent_task_frame.reenter=%p, new_task_id=%" PRIu64 ", codeptr_ra=%p, task_type=%s=%d, has_dependences=%s\n", ompt_get_thread_data()->value, parent_id, parent_exit, parent_reenter, new_task_data->value, codeptr_ra, type_text, type, has_dependences ? "yes" : "no");
+}
+
+// Traces a task switch; when the first task completed, an additional task_end line is printed.
+static void on_ompt_callback_task_schedule(ompt_data_t *first_task_data,
+                                           ompt_task_status_t prior_task_status,
+                                           ompt_data_t *second_task_data) {
+  printf("%" PRIu64 ": ompt_event_task_schedule: first_task_id=%" PRIu64
+         ", second_task_id=%" PRIu64 ", prior_task_status=%s=%d\n",
+         ompt_get_thread_data()->value, first_task_data->value, second_task_data->value,
+         ompt_task_status_t_values[prior_task_status], prior_task_status);
+  if (prior_task_status != ompt_task_complete)
+    return;
+  printf("%" PRIu64 ": ompt_event_task_end: task_id=%" PRIu64 "\n", ompt_get_thread_data()->value, first_task_data->value);
+}
+
+// Traces the dependence list attached to a task: only the array address and its length are printed.
+static void on_ompt_callback_dependences(ompt_data_t *task_data,
+                                         const ompt_dependence_t *deps,
+                                         int ndeps) {
+  printf("%" PRIu64 ": ompt_event_task_dependences: task_id=%" PRIu64
+         ", deps=%p, ndeps=%d\n",
+         ompt_get_thread_data()->value, task_data->value, (void *)deps, ndeps);
+}
+
+// Traces one dependence pair: the ids stored in the first and second task's data.
+static void on_ompt_callback_task_dependence(ompt_data_t *first_task_data,
+                                             ompt_data_t *second_task_data) {
+  printf("%" PRIu64 ": ompt_event_task_dependence_pair: first_task_id=%" PRIu64
+         ", second_task_id=%" PRIu64 "\n",
+         ompt_get_thread_data()->value, first_task_data->value, second_task_data->value);
+}
+
+// Gives a starting thread its unique id and traces the thread type by name and by number.
+static void on_ompt_callback_thread_begin(ompt_thread_t thread_type,
+                                          ompt_data_t *thread_data) {
+  if (thread_data->ptr != NULL)
+    printf("%s\n", "0: thread_data initially not null");
+  thread_data->value = ompt_get_unique_id();
+  printf("%" PRIu64 ": ompt_event_thread_begin: thread_type=%s=%d, thread_id=%" PRIu64 "\n",
+         ompt_get_thread_data()->value, ompt_thread_t_values[thread_type],
+         thread_type, thread_data->value);
+}
+
+// Traces the end of a thread using the id stored in the thread data passed in.
+static void on_ompt_callback_thread_end(ompt_data_t *thread_data) {
+  const uint64_t current_thread_id = ompt_get_thread_data()->value;
+  const uint64_t ending_thread_id = thread_data->value;
+  printf("%" PRIu64 ": ompt_event_thread_end: thread_id=%" PRIu64 "\n", current_thread_id, ending_thread_id);
+}
+
+// Traces an omp_control_tool request together with the frame pointers reported for the task at level 0.
+static int on_ompt_callback_control_tool(uint64_t command, uint64_t modifier,
+                                         void *arg, const void *codeptr_ra) {
+  ompt_frame_t *current_frame;
+  // Only the frame output is requested; every other output argument is passed as NULL.
+  ompt_get_task_info(0, NULL, (ompt_data_t **)NULL, &current_frame, NULL, NULL);
+  printf("%" PRIu64 ": ompt_event_control_tool: command=%" PRIu64 ", modifier=%" PRIu64
+         ", arg=%p, codeptr_ra=%p, current_task_frame.exit=%p, current_task_frame.reenter=%p \n",
+         ompt_get_thread_data()->value, command, modifier, arg, codeptr_ra,
+         current_frame->exit_frame.ptr, current_frame->enter_frame.ptr);
+  return 0; // success
+}
+
+int ompt_initialize( // Tool initializer: resolves OMPT entry points via lookup, registers the callbacks listed below, returns 1.
+  ompt_function_lookup_t lookup,
+  int initial_device_num,
+  ompt_data_t *tool_data)
+{
+  ompt_set_callback = (ompt_set_callback_t) lookup("ompt_set_callback");
+  ompt_get_callback = (ompt_get_callback_t) lookup("ompt_get_callback");
+  ompt_get_state = (ompt_get_state_t) lookup("ompt_get_state");
+  ompt_get_task_info = (ompt_get_task_info_t) lookup("ompt_get_task_info");
+  ompt_get_task_memory = (ompt_get_task_memory_t)lookup("ompt_get_task_memory");
+  ompt_get_thread_data = (ompt_get_thread_data_t) lookup("ompt_get_thread_data");
+  ompt_get_parallel_info = (ompt_get_parallel_info_t) lookup("ompt_get_parallel_info");
+  ompt_get_unique_id = (ompt_get_unique_id_t) lookup("ompt_get_unique_id");
+  ompt_finalize_tool = (ompt_finalize_tool_t)lookup("ompt_finalize_tool");
+  // Entry points for processors and places, and for enumerating states and mutex implementations.
+  ompt_get_num_procs = (ompt_get_num_procs_t) lookup("ompt_get_num_procs");
+  ompt_get_num_places = (ompt_get_num_places_t) lookup("ompt_get_num_places");
+  ompt_get_place_proc_ids = (ompt_get_place_proc_ids_t) lookup("ompt_get_place_proc_ids");
+  ompt_get_place_num = (ompt_get_place_num_t) lookup("ompt_get_place_num");
+  ompt_get_partition_place_nums = (ompt_get_partition_place_nums_t) lookup("ompt_get_partition_place_nums");
+  ompt_get_proc_id = (ompt_get_proc_id_t) lookup("ompt_get_proc_id");
+  ompt_enumerate_states = (ompt_enumerate_states_t) lookup("ompt_enumerate_states");
+  ompt_enumerate_mutex_impls = (ompt_enumerate_mutex_impls_t) lookup("ompt_enumerate_mutex_impls");
+  // NOTE(review): register_callback / register_callback_t are defined outside this view; the _t form presumably names the callback type explicitly - confirm.
+  register_callback(ompt_callback_mutex_acquire);
+  register_callback_t(ompt_callback_mutex_acquired, ompt_callback_mutex_t);
+  register_callback_t(ompt_callback_mutex_released, ompt_callback_mutex_t);
+  register_callback(ompt_callback_nest_lock);
+  register_callback(ompt_callback_sync_region);
+  register_callback_t(ompt_callback_sync_region_wait, ompt_callback_sync_region_t);
+  register_callback(ompt_callback_control_tool);
+  register_callback(ompt_callback_flush);
+  register_callback(ompt_callback_cancel);
+  register_callback(ompt_callback_implicit_task);
+  register_callback_t(ompt_callback_lock_init, ompt_callback_mutex_acquire_t);
+  register_callback_t(ompt_callback_lock_destroy, ompt_callback_mutex_t);
+  register_callback(ompt_callback_work);
+  register_callback(ompt_callback_master);
+  register_callback(ompt_callback_parallel_begin);
+  register_callback(ompt_callback_parallel_end);
+  register_callback(ompt_callback_task_create);
+  register_callback(ompt_callback_task_schedule);
+  register_callback(ompt_callback_dependences);
+  register_callback(ompt_callback_task_dependence);
+  register_callback(ompt_callback_thread_begin);
+  register_callback(ompt_callback_thread_end);
+  printf("0: NULL_POINTER=%p\n", (void*)NULL); // records how this platform prints a null pointer
+  return 1; //success
+}
+
+// Finalizer stored in the start-tool descriptor: emits the shutdown marker line; tool_data is unused.
+void ompt_finalize(ompt_data_t *tool_data) {
+  fputs("0: ompt_event_runtime_shutdown\n", stdout);
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// OMPT tool entry point: hands back the descriptor that names this header's initializer and finalizer.
+ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
+                                          const char *runtime_version) {
+  // Static storage: only the address is returned, so the descriptor must outlive this call.
+  static ompt_start_tool_result_t tool_descriptor = {&ompt_initialize, &ompt_finalize, 0};
+  return &tool_descriptor;
+}
+#ifdef __cplusplus
+}
+#endif
+#endif // ifndef USE_PRIVATE_TOOL
diff --git a/final/runtime/test/ompt/cancel/cancel_parallel.c b/final/runtime/test/ompt/cancel/cancel_parallel.c
new file mode 100644
index 0000000..9456b67
--- /dev/null
+++ b/final/runtime/test/ompt/cancel/cancel_parallel.c
@@ -0,0 +1,40 @@
+// RUN: %libomp-compile && env OMP_CANCELLATION=true %libomp-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// Current GOMP interface implementation does not support cancellation
+// XFAIL: gcc
+
+#include "callback.h"
+#include "omp.h"
+
+int main() { // Thread 0 cancels the parallel region; the other thread reaches a cancellation point after a delay.
+  #pragma omp parallel num_threads(2)
+  {
+    if (omp_get_thread_num() == 0) { // this thread issues the cancel
+      print_fuzzy_address_blocks(get_ompt_label_address(1));
+      #pragma omp cancel parallel
+      define_ompt_label(1);
+      // We cannot print at this location because the parallel region is cancelled!
+    } else {
+      delay(100); // delay() comes from callback.h; presumably lets thread 0 cancel first - confirm
+      print_fuzzy_address_blocks(get_ompt_label_address(2));
+      #pragma omp cancellation point parallel
+      define_ompt_label(2);
+      // We cannot print at this location because the parallel region is cancelled!
+    }
+  }
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_cancel'
+
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_initial_task_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, actual_parallelism=1, index=1, flags=1 
+  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_cancel: task_data=[[TASK_ID:[0-9]+]], flags=ompt_cancel_parallel|ompt_cancel_activated=17, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK-DAG: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_cancel: task_data=[[TASK_ID:[0-9]+]], flags=ompt_cancel_parallel|ompt_cancel_detected=33, codeptr_ra=[[OTHER_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[OTHER_RETURN_ADDRESS]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/cancel/cancel_taskgroup.c b/final/runtime/test/ompt/cancel/cancel_taskgroup.c
new file mode 100644
index 0000000..fce39c9
--- /dev/null
+++ b/final/runtime/test/ompt/cancel/cancel_taskgroup.c
@@ -0,0 +1,89 @@
+// RUN:  %libomp-compile && env OMP_CANCELLATION=true %libomp-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// UNSUPPORTED: clang-3, clang-4.0.0
+// Current GOMP interface implementation does not support cancellation; icc 16 has a bug
+// XFAIL: gcc, icc-16
+
+#include "callback.h"
+#include <unistd.h>  
+#include <stdio.h>
+
+int main() // Three deferred tasks wait inside a taskgroup while an undeferred fourth task cancels the group.
+{
+  int condition=0; // shared counter used by OMPT_SIGNAL / OMPT_WAIT (macros from callback.h; exact semantics not visible here)
+  #pragma omp parallel num_threads(2)
+  {} // empty region; presumably creates the thread team before the traced region - confirm
+
+  print_frame(0);
+  #pragma omp parallel num_threads(2)
+  {
+    #pragma omp master
+    {
+      #pragma omp taskgroup
+      {
+        #pragma omp task shared(condition)
+        {
+          printf("start execute task 1\n");
+          OMPT_SIGNAL(condition);
+          OMPT_WAIT(condition,2);
+          #pragma omp cancellation point taskgroup
+          printf("end execute task 1\n");
+        }
+        #pragma omp task shared(condition)
+        {
+          printf("start execute task 2\n");
+          OMPT_SIGNAL(condition);
+          OMPT_WAIT(condition,2);
+          #pragma omp cancellation point taskgroup
+          printf("end execute task 2\n");
+        }
+      #pragma omp task shared(condition)
+        {
+          printf("start execute task 3\n");
+          OMPT_SIGNAL(condition);
+          OMPT_WAIT(condition,2);
+          #pragma omp cancellation point taskgroup
+          printf("end execute task 3\n");
+        }
+      #pragma omp task if(0) shared(condition)
+        { // if(0): the expected trace lists this task as undeferred and switches to it immediately
+          printf("start execute task 4\n");
+          OMPT_WAIT(condition,1);
+          #pragma omp cancel taskgroup
+          printf("end execute task 4\n");
+        }
+        OMPT_SIGNAL(condition);
+      }
+    }
+    #pragma omp barrier
+  }
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_master'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_cancel'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_begin'
+
+
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_master_begin: parallel_id=[[PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]*}}
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[PARENT_TASK_ID]], parent_task_frame.exit={{0x[0-f]*}}, parent_task_frame.reenter={{0x[0-f]*}}, new_task_id=[[FIRST_TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]*}}, task_type=ompt_task_explicit=4, has_dependences=no
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[PARENT_TASK_ID]], parent_task_frame.exit={{0x[0-f]*}}, parent_task_frame.reenter={{0x[0-f]*}}, new_task_id=[[SECOND_TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]*}}, task_type=ompt_task_explicit=4, has_dependences=no
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[PARENT_TASK_ID]], parent_task_frame.exit={{0x[0-f]*}}, parent_task_frame.reenter={{0x[0-f]*}}, new_task_id=[[THIRD_TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]*}}, task_type=ompt_task_explicit=4, has_dependences=no
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[PARENT_TASK_ID]], parent_task_frame.exit={{0x[0-f]*}}, parent_task_frame.reenter={{0x[0-f]*}}, new_task_id=[[CANCEL_TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]*}}, task_type=ompt_task_explicit|ompt_task_undeferred=134217732, has_dependences=no
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[PARENT_TASK_ID]], second_task_id=[[CANCEL_TASK_ID]], prior_task_status=ompt_task_switch=7
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_cancel: task_data=[[CANCEL_TASK_ID]], flags=ompt_cancel_taskgroup|ompt_cancel_activated=24, codeptr_ra={{0x[0-f]*}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[CANCEL_TASK_ID]], second_task_id=[[PARENT_TASK_ID]], prior_task_status=ompt_task_cancel=3
+
+  // CHECK-DAG: {{^}}{{[0-9]+}}: ompt_event_cancel: task_data={{[0-9]+}}, flags=ompt_cancel_taskgroup|ompt_cancel_discarded_task=72, codeptr_ra=[[NULL]]
+  // CHECK-DAG: {{^}}{{[0-9]+}}: ompt_event_cancel: task_data={{[0-9]+}}, flags=ompt_cancel_taskgroup|ompt_cancel_discarded_task=72, codeptr_ra=[[NULL]]
+  
+  // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]]
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_cancel: task_data={{[0-9]+}}, flags=ompt_cancel_taskgroup|ompt_cancel_detected=40, codeptr_ra={{0x[0-f]*}}
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/cancel/cancel_worksharing.c b/final/runtime/test/ompt/cancel/cancel_worksharing.c
new file mode 100644
index 0000000..8576f96
--- /dev/null
+++ b/final/runtime/test/ompt/cancel/cancel_worksharing.c
@@ -0,0 +1,67 @@
+// RUN: %libomp-compile && env OMP_CANCELLATION=true %libomp-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// Current GOMP interface implementation does not support cancellation; icc 16 does not distinguish between sections and loops
+// XFAIL: icc-16
+
+#include "callback.h"
+#include <unistd.h>
+
+int main() // Cancels a worksharing loop and then a sections construct, each inside its own two-thread region.
+{
+  int condition=0; // shared counter used by OMPT_SIGNAL / OMPT_WAIT (macros from callback.h; exact semantics not visible here)
+  #pragma omp parallel num_threads(2)
+  {
+    int x = 0; // only ever incremented below, never read
+    int i;
+    #pragma omp for
+    for(i = 0; i < 2; i++)
+    {
+      if(i == 0) // iteration 0 signals and cancels the loop
+      {
+        x++;
+        OMPT_SIGNAL(condition);
+        #pragma omp cancel for
+      }
+      else // the other iteration waits for the signal, then meets the cancellation point
+      {
+        x++;
+        OMPT_WAIT(condition,1);
+        delay(10000);
+        #pragma omp cancellation point for
+      }
+    }
+  }
+  #pragma omp parallel num_threads(2)
+  {
+    #pragma omp sections
+    {
+      #pragma omp section
+      {
+        OMPT_SIGNAL(condition);
+        #pragma omp cancel sections
+      }
+      #pragma omp section
+      {
+        OMPT_WAIT(condition,2); // threshold is 2 here because the loop above already signalled once
+        delay(10000);
+        #pragma omp cancellation point sections
+      }
+    }
+  }
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_cancel'
+
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_initial_task_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, actual_parallelism=1, index=1, flags=1
+ 
+  // cancel for and sections
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_cancel: task_data=[[TASK_ID:[0-9]+]], flags=ompt_cancel_loop|ompt_cancel_activated=20, codeptr_ra={{0x[0-f]*}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_cancel: task_data=[[TASK_ID:[0-9]+]], flags=ompt_cancel_sections|ompt_cancel_{{activated=18|detected=34}}, codeptr_ra={{0x[0-f]*}}
+  // CHECK: {{^}}[[OTHER_THREAD_ID:[0-9]+]]: ompt_event_cancel: task_data=[[TASK_ID:[0-9]+]], flags=ompt_cancel_loop|ompt_cancel_detected=36, codeptr_ra={{0x[0-f]*}}
+  // CHECK: {{^}}[[OTHER_THREAD_ID:[0-9]+]]: ompt_event_cancel: task_data=[[TASK_ID:[0-9]+]], flags=ompt_cancel_sections|ompt_cancel_{{activated=18|detected=34}}, codeptr_ra={{0x[0-f]*}}
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/loadtool/tool_available/tool_available.c b/final/runtime/test/ompt/loadtool/tool_available/tool_available.c
new file mode 100644
index 0000000..25187fd
--- /dev/null
+++ b/final/runtime/test/ompt/loadtool/tool_available/tool_available.c
@@ -0,0 +1,74 @@
+// The OpenMP standard defines 3 ways of providing ompt_start_tool:
+// 1. "statically-linking the tool’s definition of ompt_start_tool into an OpenMP application"
+// RUN: %libomp-compile -DCODE -DTOOL && %libomp-run | FileCheck %s
+
+// Note: We should compile the tool without -fopenmp, as other tool developers
+//       would do. Otherwise this test may pass for the wrong reasons on Darwin.
+// RUN: %clang %flags -DTOOL -shared -fPIC %s -o %T/tool.so
+// 2. "introducing a dynamically-linked library that includes the tool’s definition of ompt_start_tool into the application’s address space"
+// 2.1 Link with tool during compilation
+// RUN: %libomp-compile -DCODE %no-as-needed-flag %T/tool.so && %libomp-run | FileCheck %s
+// 2.2 Link with tool during compilation, but AFTER the runtime
+// RUN: %libomp-compile -DCODE -lomp %no-as-needed-flag %T/tool.so && %libomp-run | FileCheck %s
+// 2.3 Inject tool via the dynamic loader
+// RUN: %libomp-compile -DCODE && %preload-tool %libomp-run | FileCheck %s
+
+// 3. "providing the name of a dynamically-linked library appropriate for the architecture and operating system used by the application in the tool-libraries-var ICV"
+// RUN: %libomp-compile -DCODE && env OMP_TOOL_LIBRARIES=%T/tool.so %libomp-run | FileCheck %s
+
+// REQUIRES: ompt
+
+/*
+ *  This file contains code for an OMPT shared library tool to be
+ *  loaded and the code for the OpenMP executable.
+ *  -DTOOL enables the code for the tool during compilation
+ *  -DCODE enables the code for the executable during compilation
+ */
+
+#ifdef CODE
+#include "omp.h"
+
+int main() // Runs one empty two-thread parallel region; the expected lines are printed by the tool half of this file.
+{
+  #pragma omp parallel num_threads(2)
+  {
+  } // intentionally empty
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback
+
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+  // CHECK: {{^}}0: ompt_event_runtime_shutdown
+
+  return 0;
+}
+
+#endif /* CODE */
+
+#ifdef TOOL
+
+#include <stdio.h>
+#include <omp-tools.h>
+
+// Tool initializer. Parameters follow ompt_initialize_t: (lookup, initial_device_num, tool_data); none is used here.
+int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num,
+                    ompt_data_t* tool_data)
+{
+  printf("0: NULL_POINTER=%p\n", (void*)NULL); // first line the test expects from the tool
+  return 1; //success
+}
+
+// Finalizer stored in the start-tool descriptor: prints the shutdown marker; tool_data is unused.
+void ompt_finalize(ompt_data_t* tool_data) {
+  fputs("0: ompt_event_runtime_shutdown\n", stdout);
+}
+
+// OMPT tool entry point: returns the descriptor naming this file's initializer and finalizer.
+ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
+                                          const char *runtime_version) {
+  // Static storage: only the address is returned, so the descriptor must outlive this call.
+  static ompt_start_tool_result_t tool_descriptor = {&ompt_initialize, &ompt_finalize, 0};
+  return &tool_descriptor;
+}
+#endif /* TOOL */
diff --git a/final/runtime/test/ompt/loadtool/tool_available_search/tool_available_search.c b/final/runtime/test/ompt/loadtool/tool_available_search/tool_available_search.c
new file mode 100644
index 0000000..fedfebe
--- /dev/null
+++ b/final/runtime/test/ompt/loadtool/tool_available_search/tool_available_search.c
@@ -0,0 +1,104 @@
+// RUN: %clang %flags -shared -fPIC %s -o %T/first_tool.so
+// RUN: %clang %flags -DTOOL -DSECOND_TOOL -shared -fPIC %s -o %T/second_tool.so
+// RUN: %clang %flags -DTOOL -DTHIRD_TOOL -shared -fPIC %s -o %T/third_tool.so
+// RUN: %libomp-compile -DCODE && env OMP_TOOL_LIBRARIES=%T/non_existing_file.so:%T/first_tool.so:%T/second_tool.so:%T/third_tool.so %libomp-run | FileCheck %s
+
+// REQUIRES: ompt
+
+/*
+ *  This file contains code for three OMPT shared library tools to be
+ *  loaded and the code for the OpenMP executable. 
+ *  No option enables code for the first shared library 
+ *  (without an implementation of ompt_start_tool) during compilation
+ *  -DTOOL -DSECOND_TOOL enables the code for the second tool during compilation
+ *  -DTOOL -DTHIRD_TOOL enables the code for the third tool during compilation
+ *  -DCODE enables the code for the executable during compilation
+ */
+
+#ifdef CODE
+#include "stdio.h"
+#include "omp.h"
+#include "omp-tools.h"
+
+int main() // The master thread calls omp_control_tool once and prints the result on a "0:" line.
+{
+  #pragma omp parallel num_threads(2)
+  {
+    #pragma omp master
+    {
+      int result = omp_control_tool(omp_control_tool_start, 0, NULL);
+      printf("0: control_tool()=%d\n", result); // the expected value below is -1
+    }
+  }
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 
+  
+  // CHECK: {{^}}0: Do not initialize tool
+
+  // CHECK: {{^}}0: Do initialize tool
+  // CHECK: {{^}}0: Tool initialized
+  // CHECK: {{^}}0: ompt_event_thread_begin
+  // CHECK-DAG: {{^}}0: ompt_event_thread_begin
+  // CHECK-DAG: {{^}}0: control_tool()=-1
+  // CHECK: {{^}}0: Tool finalized
+  
+
+  return 0;
+}
+
+#endif /* CODE */
+
+#ifdef TOOL
+
+#include <omp-tools.h>
+#include "stdio.h"
+
+#ifdef SECOND_TOOL
+// The second tool has an implementation of ompt_start_tool that returns NULL
+// Prints the refusal line, then returns NULL, so no initializer/finalizer pair is offered.
+ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
+                                          const char *runtime_version) {
+  ompt_start_tool_result_t *const no_tool = NULL;
+  fputs("0: Do not initialize tool\n", stdout);
+  return no_tool;
+}
+#elif defined(THIRD_TOOL)
+// The third tool has an implementation of ompt_start_tool that returns a 
+// pointer to a valid instance of ompt_start_tool_result_t
+
+// Minimal thread-begin callback: prints a fixed line and ignores both arguments.
+static void on_ompt_callback_thread_begin(ompt_thread_t thread_type,
+                                          ompt_data_t *thread_data) {
+  (void)thread_type;
+  (void)thread_data;
+  fputs("0: ompt_event_thread_begin\n", stdout);
+}
+
+// Third tool's initializer. Parameters follow ompt_initialize_t: (lookup, initial_device_num, tool_data).
+int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num, ompt_data_t *tool_data)
+{
+  // Resolve the registration entry point by name, then hook thread-begin so each new thread prints a line.
+  ompt_set_callback_t ompt_set_callback = (ompt_set_callback_t) lookup("ompt_set_callback");
+  ompt_set_callback(ompt_callback_thread_begin, (ompt_callback_t)on_ompt_callback_thread_begin);
+  printf("0: Tool initialized\n");
+  return 1; // NOTE(review): non-zero presumably keeps the tool active - confirm against the OMPT specification
+}
+
+// Finalizer of the third tool: prints the line the test expects last; tool_data is unused.
+void ompt_finalize(ompt_data_t *tool_data) {
+  fputs("0: Tool finalized\n", stdout);
+}
+
+// Third tool's entry point: logs the decision to initialize and returns a valid start-tool descriptor.
+ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
+                                          const char *runtime_version) {
+  // Static storage keeps the descriptor alive after this function returns.
+  static ompt_start_tool_result_t third_tool_result = {&ompt_initialize, &ompt_finalize, 0};
+  fputs("0: Do initialize tool\n", stdout);
+  return &third_tool_result;
+}
+#endif
+
+#endif /* TOOL */
diff --git a/final/runtime/test/ompt/loadtool/tool_not_available/tool_not_available.c b/final/runtime/test/ompt/loadtool/tool_not_available/tool_not_available.c
new file mode 100644
index 0000000..ea40468
--- /dev/null
+++ b/final/runtime/test/ompt/loadtool/tool_not_available/tool_not_available.c
@@ -0,0 +1,69 @@
+// The OpenMP standard defines 3 ways of providing ompt_start_tool:
+// 1. "statically-linking the tool’s definition of ompt_start_tool into an OpenMP application"
+// RUN: %libomp-compile -DCODE -DTOOL && %libomp-run | FileCheck %s
+
+// Note: We should compile the tool without -fopenmp, as other tool developers
+//       would do. Otherwise this test may pass for the wrong reasons on Darwin.
+// RUN: %clang %flags -DTOOL -shared -fPIC %s -o %T/tool.so
+// 2. "introducing a dynamically-linked library that includes the tool’s definition of ompt_start_tool into the application’s address space"
+// 2.1 Link with tool during compilation
+// RUN: %libomp-compile -DCODE %no-as-needed-flag %T/tool.so && %libomp-run | FileCheck %s
+// 2.2 Link with tool during compilation, but AFTER the runtime
+// RUN: %libomp-compile -DCODE -lomp %no-as-needed-flag %T/tool.so && %libomp-run | FileCheck %s
+// 2.3 Inject tool via the dynamic loader
+// RUN: %libomp-compile -DCODE && %preload-tool %libomp-run | FileCheck %s
+
+// 3. "providing the name of a dynamically-linked library appropriate for the architecture and operating system used by the application in the tool-libraries-var ICV"
+// RUN: %libomp-compile -DCODE && env OMP_TOOL_LIBRARIES=%T/tool.so %libomp-run | FileCheck %s
+
+// REQUIRES: ompt
+
+/*
+ *  This file contains code for an OMPT shared library tool to be 
+ *  loaded and the code for the OpenMP executable. 
+ *  -DTOOL enables the code for the tool during compilation
+ *  -DCODE enables the code for the executable during compilation
+ */
+
+#ifdef CODE
+#include "stdio.h"
+#include "omp.h"
+#include "omp-tools.h"
+
+int main() // The master thread calls omp_control_tool once and prints the result on a "0:" line.
+{
+  #pragma omp parallel num_threads(2)
+  {
+    #pragma omp master
+    {
+      int result = omp_control_tool(omp_control_tool_start, 0, NULL);
+      printf("0: control_tool()=%d\n", result); // the expected value below is -2; the tool in this file returns NULL from ompt_start_tool
+    }
+  }
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 
+  
+  // CHECK: {{^}}0: Do not initialize tool
+  // CHECK: {{^}}0: control_tool()=-2
+  
+
+  return 0;
+}
+
+#endif /* CODE */
+
+#ifdef TOOL
+
+#include <omp-tools.h>
+#include "stdio.h"
+
+// Entry point of a tool that declines: prints its marker line and returns NULL instead of a descriptor.
+ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
+                                          const char *runtime_version) {
+  ompt_start_tool_result_t *const declined = NULL;
+  fputs("0: Do not initialize tool\n", stdout);
+  return declined;
+}
+#endif /* TOOL */
diff --git a/final/runtime/test/ompt/misc/api_calls_from_other_thread.cpp b/final/runtime/test/ompt/misc/api_calls_from_other_thread.cpp
new file mode 100644
index 0000000..e2ef1fc
--- /dev/null
+++ b/final/runtime/test/ompt/misc/api_calls_from_other_thread.cpp
@@ -0,0 +1,92 @@
+// RUN: %libomp-cxx-compile-and-run | FileCheck %s
+// REQUIRES: ompt, linux
+
+#include <thread>
+#include "callback.h"
+
+void f() { // Executed on a std::thread by main(): calls each OMPT entry point once and prints its result.
+  ompt_data_t *tdata = ompt_get_thread_data();
+  uint64_t tvalue = tdata ? tdata->value : 0; // line prefix; 0 when no thread data is available
+
+  printf("%" PRIu64 ": ompt_get_num_places()=%d\n", tvalue,
+         ompt_get_num_places());
+
+  printf("%" PRIu64 ": ompt_get_place_proc_ids()=%d\n", tvalue,
+         ompt_get_place_proc_ids(0, 0, NULL));
+
+  printf("%" PRIu64 ": ompt_get_place_num()=%d\n", tvalue,
+         ompt_get_place_num());
+
+  printf("%" PRIu64 ": ompt_get_partition_place_nums()=%d\n", tvalue,
+         ompt_get_partition_place_nums(0, NULL));
+
+  printf("%" PRIu64 ": ompt_get_proc_id()=%d\n", tvalue, ompt_get_proc_id());
+
+  printf("%" PRIu64 ": ompt_get_num_procs()=%d\n", tvalue,
+         ompt_get_num_procs());
+
+  ompt_callback_t callback;
+  printf("%" PRIu64 ": ompt_get_callback()=%d\n", tvalue,
+         ompt_get_callback(ompt_callback_thread_begin, &callback));
+
+  printf("%" PRIu64 ": ompt_get_state()=%d\n", tvalue, ompt_get_state(NULL));
+
+  int state = ompt_state_undefined;
+  const char *state_name;
+  printf("%" PRIu64 ": ompt_enumerate_states()=%d\n", tvalue,
+         ompt_enumerate_states(state, &state, &state_name));
+
+  int impl = ompt_mutex_impl_none;
+  const char *impl_name;
+  printf("%" PRIu64 ": ompt_enumerate_mutex_impls()=%d\n", tvalue,
+         ompt_enumerate_mutex_impls(impl, &impl, &impl_name));
+
+  printf("%" PRIu64 ": ompt_get_thread_data()=%p\n", tvalue,
+         (void *)ompt_get_thread_data()); // %p requires a void * argument
+
+  printf("%" PRIu64 ": ompt_get_parallel_info()=%d\n", tvalue,
+         ompt_get_parallel_info(0, NULL, NULL));
+
+  printf("%" PRIu64 ": ompt_get_task_info()=%d\n", tvalue,
+         ompt_get_task_info(0, NULL, NULL, NULL, NULL, NULL));
+}
+
+int main() { // Runs f() on a std::thread after one empty single-thread parallel region.
+#pragma omp parallel num_threads(1)
+  {} // empty region; presumably ensures runtime and tool are initialized before f() runs - confirm
+
+  std::thread t1(f); // f executes on a thread created through std::thread, not by an OpenMP construct
+  t1.join();
+
+  // Check if libomp supports the callbacks for this test.
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_get_num_places()={{[0-9]+}}
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_place_proc_ids()={{[0-9]+}}
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_place_num()=-1
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_partition_place_nums()=0
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_proc_id()=-1
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_num_procs()={{[0-9]+}}
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_callback()=1
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_state()=0
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_enumerate_states()=1
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_enumerate_mutex_impls()=1
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_thread_data()=[[NULL]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_parallel_info()=0
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_task_info()=0
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/misc/api_calls_misc.c b/final/runtime/test/ompt/misc/api_calls_misc.c
new file mode 100644
index 0000000..884421e
--- /dev/null
+++ b/final/runtime/test/ompt/misc/api_calls_misc.c
@@ -0,0 +1,72 @@
+// RUN: %libomp-compile && %libomp-run | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+int main() { // Exercises ompt_get_callback, ompt_get_state and both enumeration entry points from inside a parallel region.
+#pragma omp parallel num_threads(1)
+  { // single-thread region: every line printed below carries the same thread id
+    // ompt_get_callback()
+    ompt_callback_t callback;
+    ompt_get_callback(ompt_callback_thread_begin, &callback);
+    printf("%" PRIu64 ": &on_ompt_callback_thread_begin=%p\n",
+           ompt_get_thread_data()->value, &on_ompt_callback_thread_begin);
+    printf("%" PRIu64 ": ompt_get_callback() result=%p\n",
+           ompt_get_thread_data()->value, callback); // expected to equal the address printed just above
+
+    // ompt_get_state()
+    printf("%" PRIu64 ": ompt_get_state()=%d\n", ompt_get_thread_data()->value,
+           ompt_get_state(NULL));
+
+    // ompt_enumerate_states()
+    int state = ompt_state_undefined; // starting value; each call's output is fed back in as the next input
+    const char *state_name;
+    int steps = 0;
+    while (ompt_enumerate_states(state, &state, &state_name) && steps < 1000) { // the cap stops a runaway enumeration
+      steps++;
+      if (!state_name)
+        printf("%" PRIu64 ": state_name is NULL\n",
+               ompt_get_thread_data()->value);
+    }
+    if (steps >= 1000) {
+      // enumeration did not end after 1000 steps
+      printf("%" PRIu64 ": states enumeration did not end\n",
+             ompt_get_thread_data()->value);
+    }
+
+    // ompt_enumerate_mutex_impls()
+    int impl = ompt_mutex_impl_none; // starting value; each call's output is fed back in as the next input
+    const char *impl_name;
+    steps = 0;
+    while (ompt_enumerate_mutex_impls(impl, &impl, &impl_name) &&
+           steps < 1000) {
+      steps++;
+      if (!impl_name)
+        printf("%" PRIu64 ": impl_name is NULL\n",
+               ompt_get_thread_data()->value);
+    }
+    if (steps >= 1000) {
+      // enumeration did not end after 1000 steps
+      printf("%" PRIu64 ": mutex_impls enumeration did not end\n",
+             ompt_get_thread_data()->value);
+    }
+  }
+
+  // Check if libomp supports the callbacks for this test.
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: &on_ompt_callback_thread_begin
+  // CHECK-SAME: =[[FUNCTION_POINTER:0x[0-f]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_get_callback() result=[[FUNCTION_POINTER]]
+
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_get_state()=1
+
+  // CHECK-NOT: {{^}}[[THREAD_ID]]: state_name is NULL
+  // CHECK-NOT: {{^}}[[THREAD_ID]]: states enumeration did not end
+
+  // CHECK-NOT: {{^}}[[THREAD_ID]]: impl_name is NULL
+  // CHECK-NOT: {{^}}[[THREAD_ID]]: mutex_impls enumeration did not end
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/misc/api_calls_places.c b/final/runtime/test/ompt/misc/api_calls_places.c
new file mode 100644
index 0000000..3385c9c
--- /dev/null
+++ b/final/runtime/test/ompt/misc/api_calls_places.c
@@ -0,0 +1,88 @@
+// RUN: %libomp-compile && env OMP_PLACES=cores %libomp-run | FileCheck %s
+// REQUIRES: ompt, linux
+#include "callback.h"
+#include <omp.h>
+#define __USE_GNU
+#include <sched.h>
+#undef __USE_GNU
+
+// Prints "<id>: <name>(0)=(a,b,...)"; an empty list (size <= 0) prints "()".
+void print_list(const char *function_name, int size, const int list[]) {
+  printf("%" PRIu64 ": %s(0)=(", ompt_get_thread_data()->value, function_name);
+  int i;
+  for (i = 0; i < size; i++) {
+    printf(i ? ",%d" : "%d", list[i]); // comma before all but the first
+  }
+  printf(")\n");
+}
+
+int main() { // prints each omp_* place query next to its ompt_* counterpart
+#pragma omp parallel num_threads(1)
+  {
+    printf("%" PRIu64 ": omp_get_num_places()=%d\n",
+           ompt_get_thread_data()->value, omp_get_num_places());
+    printf("%" PRIu64 ": ompt_get_num_places()=%d\n",
+           ompt_get_thread_data()->value, ompt_get_num_places());
+    // Place 0: the ompt_* call with length 0 and NULL yields the array length.
+    int omp_ids_size = omp_get_place_num_procs(0);
+    int omp_ids[omp_ids_size];
+    omp_get_place_proc_ids(0, omp_ids);
+    print_list("omp_get_place_proc_ids", omp_ids_size, omp_ids);
+    int ompt_ids_size = ompt_get_place_proc_ids(0, 0, NULL);
+    int ompt_ids[ompt_ids_size];
+    ompt_get_place_proc_ids(0, ompt_ids_size, ompt_ids);
+    print_list("ompt_get_place_proc_ids", ompt_ids_size, ompt_ids);
+
+    printf("%" PRIu64 ": omp_get_place_num()=%d\n",
+           ompt_get_thread_data()->value, omp_get_place_num());
+    printf("%" PRIu64 ": ompt_get_place_num()=%d\n",
+           ompt_get_thread_data()->value, ompt_get_place_num());
+    // Partition places: size query uses NULL, like the place-0 query above.
+    int omp_nums_size = omp_get_partition_num_places();
+    int omp_nums[omp_nums_size];
+    omp_get_partition_place_nums(omp_nums);
+    print_list("omp_get_partition_place_nums", omp_nums_size, omp_nums);
+    int ompt_nums_size = ompt_get_partition_place_nums(0, NULL);
+    int ompt_nums[ompt_nums_size];
+    ompt_get_partition_place_nums(ompt_nums_size, ompt_nums);
+    print_list("ompt_get_partition_place_nums", ompt_nums_size, ompt_nums);
+
+    printf("%" PRIu64 ": sched_getcpu()=%d\n", ompt_get_thread_data()->value,
+           sched_getcpu());
+    printf("%" PRIu64 ": ompt_get_proc_id()=%d\n",
+           ompt_get_thread_data()->value, ompt_get_proc_id());
+
+    printf("%" PRIu64 ": omp_get_num_procs()=%d\n",
+           ompt_get_thread_data()->value, omp_get_num_procs());
+    printf("%" PRIu64 ": ompt_get_num_procs()=%d\n",
+           ompt_get_thread_data()->value, ompt_get_num_procs());
+  }
+
+  // Check if libomp supports the callbacks for this test.
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: omp_get_num_places
+  // CHECK-SAME: ()=[[NUM_PLACES:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_num_places()=[[NUM_PLACES]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: omp_get_place_proc_ids
+  // CHECK-SAME: (0)=([[PROC_IDS:[0-9\,]+]])
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_place_proc_ids(0)=([[PROC_IDS]])
+
+  // CHECK: {{^}}[[MASTER_ID]]: omp_get_place_num()=[[PLACE_NUM:[-]?[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_place_num()=[[PLACE_NUM]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: omp_get_partition_place_nums
+  // CHECK-SAME: (0)=([[PARTITION_PLACE_NUMS:[0-9\,]+]])
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_partition_place_nums
+  // CHECK-SAME: (0)=([[PARTITION_PLACE_NUMS]])
+
+  // CHECK: {{^}}[[MASTER_ID]]: sched_getcpu()=[[CPU_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_proc_id()=[[CPU_ID]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: omp_get_num_procs()=[[NUM_PROCS:[-]?[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_num_procs()=[[NUM_PROCS]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/misc/api_calls_without_ompt.c b/final/runtime/test/ompt/misc/api_calls_without_ompt.c
new file mode 100644
index 0000000..e66aecd
--- /dev/null
+++ b/final/runtime/test/ompt/misc/api_calls_without_ompt.c
@@ -0,0 +1,148 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+
+#define _BSD_SOURCE
+#define _DEFAULT_SOURCE
+
+#include <stdio.h>
+#include <inttypes.h>
+#include <omp.h>
+#include <omp-tools.h>
+
+static ompt_set_callback_t ompt_set_callback;
+static ompt_get_callback_t ompt_get_callback;
+static ompt_get_state_t ompt_get_state;
+static ompt_get_task_info_t ompt_get_task_info;
+static ompt_get_thread_data_t ompt_get_thread_data;
+static ompt_get_parallel_info_t ompt_get_parallel_info;
+static ompt_get_unique_id_t ompt_get_unique_id;
+static ompt_get_num_procs_t ompt_get_num_procs;
+static ompt_get_num_places_t ompt_get_num_places;
+static ompt_get_place_proc_ids_t ompt_get_place_proc_ids;
+static ompt_get_place_num_t ompt_get_place_num;
+static ompt_get_partition_place_nums_t ompt_get_partition_place_nums;
+static ompt_get_proc_id_t ompt_get_proc_id;
+static ompt_enumerate_states_t ompt_enumerate_states;
+static ompt_enumerate_mutex_impls_t ompt_enumerate_mutex_impls;
+
+int main() {
+  // Call OpenMP API function to force initialization of OMPT.
+  // (omp_get_thread_num() does not work because it just returns 0 if the
+  // runtime isn't initialized yet...)
+  omp_get_num_threads();
+
+  ompt_data_t *tdata = ompt_get_thread_data();
+  uint64_t tvalue = tdata ? tdata->value : 0; // tdata is expected to be NULL
+
+  printf("%" PRIu64 ": ompt_get_num_places()=%d\n", tvalue,
+         ompt_get_num_places());
+
+  printf("%" PRIu64 ": ompt_get_place_proc_ids()=%d\n", tvalue,
+         ompt_get_place_proc_ids(0, 0, NULL));
+
+  printf("%" PRIu64 ": ompt_get_place_num()=%d\n", tvalue,
+         ompt_get_place_num());
+
+  printf("%" PRIu64 ": ompt_get_partition_place_nums()=%d\n", tvalue,
+         ompt_get_partition_place_nums(0, NULL));
+
+  printf("%" PRIu64 ": ompt_get_proc_id()=%d\n", tvalue, ompt_get_proc_id());
+
+  printf("%" PRIu64 ": ompt_get_num_procs()=%d\n", tvalue,
+         ompt_get_num_procs());
+
+  ompt_callback_t callback;
+  printf("%" PRIu64 ": ompt_get_callback()=%d\n", tvalue,
+         ompt_get_callback(ompt_callback_thread_begin, &callback));
+
+  printf("%" PRIu64 ": ompt_get_state()=%d\n", tvalue, ompt_get_state(NULL));
+
+  int state = ompt_state_undefined;
+  const char *state_name;
+  printf("%" PRIu64 ": ompt_enumerate_states()=%d\n", tvalue,
+         ompt_enumerate_states(state, &state, &state_name));
+
+  int impl = ompt_mutex_impl_none;
+  const char *impl_name;
+  printf("%" PRIu64 ": ompt_enumerate_mutex_impls()=%d\n", tvalue,
+         ompt_enumerate_mutex_impls(impl, &impl, &impl_name));
+
+  printf("%" PRIu64 ": ompt_get_thread_data()=%p\n", tvalue,
+         (void *)ompt_get_thread_data());
+
+  printf("%" PRIu64 ": ompt_get_parallel_info()=%d\n", tvalue,
+         ompt_get_parallel_info(0, NULL, NULL));
+
+  printf("%" PRIu64 ": ompt_get_task_info()=%d\n", tvalue,
+         ompt_get_task_info(0, NULL, NULL, NULL, NULL, NULL));
+
+  // Check if libomp supports the callbacks for this test.
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_get_num_places()={{[0-9]+}}
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_place_proc_ids()={{[0-9]+}}
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_place_num()=-1
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_partition_place_nums()=0
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_proc_id()=-1
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_num_procs()={{[0-9]+}}
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_callback()=0
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_state()=0
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_enumerate_states()=1
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_enumerate_mutex_impls()=1
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_thread_data()=[[NULL]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_parallel_info()=0
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_get_task_info()=0
+
+  return 0;
+}
+
+int ompt_initialize(ompt_function_lookup_t lookup, ompt_data_t *tool_data) {
+  ompt_set_callback = (ompt_set_callback_t)lookup("ompt_set_callback");
+  ompt_get_callback = (ompt_get_callback_t)lookup("ompt_get_callback");
+  ompt_get_state = (ompt_get_state_t)lookup("ompt_get_state");
+  ompt_get_task_info = (ompt_get_task_info_t)lookup("ompt_get_task_info");
+  ompt_get_thread_data = (ompt_get_thread_data_t)lookup("ompt_get_thread_data");
+  ompt_get_parallel_info =
+      (ompt_get_parallel_info_t)lookup("ompt_get_parallel_info");
+  ompt_get_unique_id = (ompt_get_unique_id_t)lookup("ompt_get_unique_id");
+
+  ompt_get_num_procs = (ompt_get_num_procs_t)lookup("ompt_get_num_procs");
+  ompt_get_num_places = (ompt_get_num_places_t)lookup("ompt_get_num_places");
+  ompt_get_place_proc_ids =
+      (ompt_get_place_proc_ids_t)lookup("ompt_get_place_proc_ids");
+  ompt_get_place_num = (ompt_get_place_num_t)lookup("ompt_get_place_num");
+  ompt_get_partition_place_nums =
+      (ompt_get_partition_place_nums_t)lookup("ompt_get_partition_place_nums");
+  ompt_get_proc_id = (ompt_get_proc_id_t)lookup("ompt_get_proc_id");
+  ompt_enumerate_states =
+      (ompt_enumerate_states_t)lookup("ompt_enumerate_states");
+  ompt_enumerate_mutex_impls =
+      (ompt_enumerate_mutex_impls_t)lookup("ompt_enumerate_mutex_impls");
+
+  printf("0: NULL_POINTER=%p\n", (void *)NULL);
+  return 0; // no success -> OMPT not enabled
+}
+
+void ompt_finalize(ompt_data_t *tool_data) {
+  fputs("0: ompt_event_runtime_shutdown\n", stdout); // tool_data is not used
+}
+
+ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
+                                          const char *runtime_version) {
+  // Static storage keeps the returned address valid after this call returns.
+  static ompt_start_tool_result_t result = {ompt_initialize, ompt_finalize, 0};
+  return &result;
+}
diff --git a/final/runtime/test/ompt/misc/control_tool.c b/final/runtime/test/ompt/misc/control_tool.c
new file mode 100644
index 0000000..0c3c1b0
--- /dev/null
+++ b/final/runtime/test/ompt/misc/control_tool.c
@@ -0,0 +1,29 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7
+#define TEST_NEED_PRINT_FRAME_FROM_OUTLINED_FN
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  #pragma omp parallel num_threads(1)
+  {
+    print_frame_from_outlined_fn(1);
+    print_frame(0);
+    omp_control_tool(omp_control_tool_flush, 1, NULL); // command=3, modifier=1
+    print_current_address(0); // must print right after the control_tool event
+  }
+  // The patterns below tie the event's codeptr_ra to the address printed above.
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_control_tool'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: __builtin_frame_address({{.}})=[[EXIT_FRAME:0x[0-f]*]]
+  // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address(0)=[[REENTER_FRAME:0x[0-f]*]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_control_tool: command=3, modifier=1, arg=[[NULL]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]*]], current_task_frame.exit=[[EXIT_FRAME]], current_task_frame.reenter={{0x[0-f]*}}
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/misc/control_tool_no_ompt_support.c b/final/runtime/test/ompt/misc/control_tool_no_ompt_support.c
new file mode 100644
index 0000000..bcfe3ca
--- /dev/null
+++ b/final/runtime/test/ompt/misc/control_tool_no_ompt_support.c
@@ -0,0 +1,13 @@
+// RUN: %libomp-compile-and-run
+
+#include <omp.h>
+
+int main()
+{
+  #pragma omp parallel num_threads(1)
+  {
+    omp_control_tool(omp_control_tool_flush, 1, NULL);
+  }
+  // No tool is defined in this file; the run passes if the call above returns.
+  return 0;
+}
diff --git a/final/runtime/test/ompt/misc/finalize_tool.c b/final/runtime/test/ompt/misc/finalize_tool.c
new file mode 100644
index 0000000..0a406c8
--- /dev/null
+++ b/final/runtime/test/ompt/misc/finalize_tool.c
@@ -0,0 +1,28 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+
+int main() {
+#pragma omp parallel num_threads(2)
+  {}
+  // Expected: thread_end and runtime_shutdown appear between the two prints.
+  printf("Before ompt_finalize_tool\n");
+  ompt_finalize_tool();
+  printf("After ompt_finalize_tool\n");
+
+  return 0;
+}
+
+// CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+// CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin:
+// CHECK-SAME: thread_type=ompt_thread_initial=1
+
+// CHECK: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin
+// CHECK: {{^}}[[THREAD_ID]]: ompt_event_parallel_end
+
+// CHECK: {{^}}Before ompt_finalize_tool
+
+// CHECK: {{^}}[[THREAD_ID]]: ompt_event_thread_end: thread_id=[[THREAD_ID]]
+// CHECK: 0: ompt_event_runtime_shutdown
+
+// CHECK: {{^}}After ompt_finalize_tool
diff --git a/final/runtime/test/ompt/misc/interoperability.cpp b/final/runtime/test/ompt/misc/interoperability.cpp
new file mode 100644
index 0000000..cbb0e87
--- /dev/null
+++ b/final/runtime/test/ompt/misc/interoperability.cpp
@@ -0,0 +1,120 @@
+// RUN: %libomp-cxx-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+
+#include <iostream>
+#include <thread>
+#if !defined(__FreeBSD__) && !defined(__NetBSD__)
+#include <alloca.h>
+#else
+#include <cstdlib>
+#endif
+
+#include "callback.h"
+#include "omp.h"
+
+int condition = 0;
+
+void f() {
+  // Body of each std::thread started by main(). First call an OpenMP API
+  // function to force initialization of OMPT (omp_get_thread_num() does not
+  // work because it just returns 0 if the runtime isn't initialized yet...).
+  omp_get_num_threads();
+
+  // Call alloca() to force availability of frame pointer
+  void *p = alloca(0); // p itself is never read
+
+  OMPT_SIGNAL(condition);
+  // Wait for both initial threads to arrive that will eventually become the
+  // master threads in the following parallel region.
+  OMPT_WAIT(condition, 2);
+
+#pragma omp parallel num_threads(2)
+  {
+    // Wait for all threads to arrive so that no worker thread can be reused...
+    OMPT_SIGNAL(condition);
+    OMPT_WAIT(condition, 6); // 2 signals above + 2 teams x 2 threads
+  }
+}
+
+int main() {
+  // Two native threads run f(); the checks expect each to be an initial thread.
+  std::thread runners[] = {std::thread(f), std::thread(f)};
+  for (std::thread &runner : runners)
+    runner.join();
+}
+
+// Check if libomp supports the callbacks for this test.
+// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule'
+// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_begin'
+
+// CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+// first master thread
+// CHECK: {{^}}[[MASTER_ID_1:[0-9]+]]: ompt_event_thread_begin:
+// CHECK-SAME: thread_type=ompt_thread_initial=1, thread_id=[[MASTER_ID_1]]
+
+
+// CHECK: {{^}}[[MASTER_ID_1]]: ompt_event_initial_task_begin: parallel_id={{[0-9]+}}
+// CHECK-SAME: task_id=[[PARENT_TASK_ID_1:[0-9]+]], actual_parallelism=1, index=1, flags=1 
+
+// CHECK: {{^}}[[MASTER_ID_1]]: ompt_event_parallel_begin:
+// CHECK-SAME: parent_task_id=[[PARENT_TASK_ID_1]]
+// CHECK-SAME: parent_task_frame.exit=[[NULL]]
+// CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}}
+// CHECK-SAME: parallel_id=[[PARALLEL_ID_1:[0-9]+]], requested_team_size=2
+// CHECK-SAME: codeptr_ra=0x{{[0-f]+}}, invoker={{.*}}
+
+// CHECK: {{^}}[[MASTER_ID_1]]: ompt_event_parallel_end:
+// CHECK-SAME: parallel_id=[[PARALLEL_ID_1]], task_id=[[PARENT_TASK_ID_1]]
+// CHECK-SAME: invoker={{[0-9]+}}
+
+// CHECK: {{^}}[[MASTER_ID_1]]: ompt_event_initial_task_end:
+// CHECK-SAME: parallel_id={{[0-9]+}}, task_id=[[PARENT_TASK_ID_1]],
+// CHECK-SAME: team_size=0, thread_num=1
+
+// CHECK: {{^}}[[MASTER_ID_1]]: ompt_event_thread_end:
+// CHECK-SAME: thread_id=[[MASTER_ID_1]]
+
+// second master thread
+// CHECK: {{^}}[[MASTER_ID_2:[0-9]+]]: ompt_event_thread_begin:
+// CHECK-SAME: thread_type=ompt_thread_initial=1, thread_id=[[MASTER_ID_2]]
+
+// CHECK: {{^}}[[MASTER_ID_2]]: ompt_event_initial_task_begin: parallel_id={{[0-9]+}}
+// CHECK-SAME: task_id=[[PARENT_TASK_ID_2:[0-9]+]], actual_parallelism=1, index=1, flags=1 
+
+// CHECK: {{^}}[[MASTER_ID_2]]: ompt_event_parallel_begin:
+// CHECK-SAME: parent_task_id=[[PARENT_TASK_ID_2]]
+// CHECK-SAME: parent_task_frame.exit=[[NULL]]
+// CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}}
+// CHECK-SAME: parallel_id=[[PARALLEL_ID_2:[0-9]+]]
+// CHECK-SAME: requested_team_size=2, codeptr_ra=0x{{[0-f]+}}
+// CHECK-SAME: invoker={{.*}}
+
+// CHECK: {{^}}[[MASTER_ID_2]]: ompt_event_parallel_end:
+// CHECK-SAME: parallel_id=[[PARALLEL_ID_2]], task_id=[[PARENT_TASK_ID_2]]
+// CHECK-SAME: invoker={{[0-9]+}}
+
+// CHECK: {{^}}[[MASTER_ID_2]]: ompt_event_initial_task_end:
+// CHECK-SAME: parallel_id={{[0-9]+}}, task_id=[[PARENT_TASK_ID_2]],
+// CHECK-SAME: team_size=0, thread_num=1
+
+// CHECK: {{^}}[[MASTER_ID_2]]: ompt_event_thread_end:
+// CHECK-SAME: thread_id=[[MASTER_ID_2]]
+
+// first worker thread
+// CHECK: {{^}}[[THREAD_ID_1:[0-9]+]]: ompt_event_thread_begin:
+// CHECK-SAME: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID_1]]
+// CHECK-NOT: {{^}}[[THREAD_ID_1]]: ompt_event_initial_task_end:
+
+// CHECK: {{^}}[[THREAD_ID_1]]: ompt_event_thread_end:
+// CHECK-SAME: thread_id=[[THREAD_ID_1]]
+
+// second worker thread
+// CHECK: {{^}}[[THREAD_ID_2:[0-9]+]]: ompt_event_thread_begin:
+// CHECK-SAME: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID_2]]
+
+// CHECK: {{^}}[[THREAD_ID_2]]: ompt_event_thread_end:
+// CHECK-SAME: thread_id=[[THREAD_ID_2]]
diff --git a/final/runtime/test/ompt/misc/threads.c b/final/runtime/test/ompt/misc/threads.c
new file mode 100644
index 0000000..4a0fc6f
--- /dev/null
+++ b/final/runtime/test/ompt/misc/threads.c
@@ -0,0 +1,34 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+int main() {
+  int x = 0; // shared counter, incremented once by each team thread
+#pragma omp parallel num_threads(4)
+  {
+#pragma omp atomic
+    x++;
+  }
+  // Expected after %sort-threads: a begin/end pair per thread, initial first.
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_thread_begin:
+  // CHECK-SAME: thread_type=ompt_thread_initial=1, thread_id=[[MASTER_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_thread_end:
+  // CHECK-SAME: thread_id=[[MASTER_ID]]
+  // CHECK: {{^}}[[WORKER_ID1:[0-9]+]]: ompt_event_thread_begin:
+  // CHECK-SAME: thread_type=ompt_thread_worker=2, thread_id=[[WORKER_ID1]]
+  // CHECK: {{^}}[[WORKER_ID1]]: ompt_event_thread_end:
+  // CHECK-SAME: thread_id=[[WORKER_ID1]]
+  // CHECK: {{^}}[[WORKER_ID2:[0-9]+]]: ompt_event_thread_begin:
+  // CHECK-SAME: thread_type=ompt_thread_worker=2, thread_id=[[WORKER_ID2]]
+  // CHECK: {{^}}[[WORKER_ID2]]: ompt_event_thread_end:
+  // CHECK-SAME: thread_id=[[WORKER_ID2]]
+  // CHECK: {{^}}[[WORKER_ID3:[0-9]+]]: ompt_event_thread_begin:
+  // CHECK-SAME: thread_type=ompt_thread_worker=2, thread_id=[[WORKER_ID3]]
+  // CHECK: {{^}}[[WORKER_ID3]]: ompt_event_thread_end:
+  // CHECK-SAME: thread_id=[[WORKER_ID3]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/misc/threads_nested.c b/final/runtime/test/ompt/misc/threads_nested.c
new file mode 100644
index 0000000..0d38dcf
--- /dev/null
+++ b/final/runtime/test/ompt/misc/threads_nested.c
@@ -0,0 +1,40 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+int main() {
+  // Shared counter that holds all four innermost threads inside the region.
+  int condition = 0;
+  // Enable nesting so the inner parallel region can fork its own team.
+  omp_set_nested(1);
+#pragma omp parallel num_threads(2)
+  {
+#pragma omp parallel num_threads(2)
+    {
+      OMPT_SIGNAL(condition);
+      OMPT_WAIT(condition, 4);
+    }
+  }
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_thread_begin:
+  // CHECK-SAME: thread_type=ompt_thread_initial=1, thread_id=[[MASTER_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_thread_end:
+  // CHECK-SAME: thread_id=[[MASTER_ID]]
+  // CHECK: {{^}}[[WORKER_ID1:[0-9]+]]: ompt_event_thread_begin:
+  // CHECK-SAME: thread_type=ompt_thread_worker=2, thread_id=[[WORKER_ID1]]
+  // CHECK: {{^}}[[WORKER_ID1]]: ompt_event_thread_end:
+  // CHECK-SAME: thread_id=[[WORKER_ID1]]
+  // CHECK: {{^}}[[WORKER_ID2:[0-9]+]]: ompt_event_thread_begin:
+  // CHECK-SAME: thread_type=ompt_thread_worker=2, thread_id=[[WORKER_ID2]]
+  // CHECK: {{^}}[[WORKER_ID2]]: ompt_event_thread_end:
+  // CHECK-SAME: thread_id=[[WORKER_ID2]]
+  // CHECK: {{^}}[[WORKER_ID3:[0-9]+]]: ompt_event_thread_begin:
+  // CHECK-SAME: thread_type=ompt_thread_worker=2, thread_id=[[WORKER_ID3]]
+  // CHECK: {{^}}[[WORKER_ID3]]: ompt_event_thread_end:
+  // CHECK-SAME: thread_id=[[WORKER_ID3]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/misc/unset_callback.c b/final/runtime/test/ompt/misc/unset_callback.c
new file mode 100644
index 0000000..9074ad3
--- /dev/null
+++ b/final/runtime/test/ompt/misc/unset_callback.c
@@ -0,0 +1,29 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  // First region: parallel_begin is still registered, so begin and end print.
+  #pragma omp parallel num_threads(1)
+  {
+  }
+  // Unregister parallel_begin; the second region may only report its end.
+  ompt_set_callback(ompt_callback_parallel_begin, NULL);
+  #pragma omp parallel num_threads(1)
+  {
+  }
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_parallel_begin:
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_parallel_end:
+  // CHECK-NOT: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin:
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_parallel_end:
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/ompt-signal.h b/final/runtime/test/ompt/ompt-signal.h
new file mode 100644
index 0000000..b5c28cf
--- /dev/null
+++ b/final/runtime/test/ompt/ompt-signal.h
@@ -0,0 +1,31 @@
+#if defined(WIN32) || defined(_WIN32)
+#include <windows.h>
+#define delay(t) Sleep(1); // t is accepted but ignored; Sleep() takes ms
+#else
+#include <unistd.h>
+#define delay(t) usleep(t);
+#endif
+
+// These functions provide a signal-wait mechanism that enforces the expected scheduling in the test cases.
+// The counter (s) must be shared between all participating threads and initialized to 0.
+
+#define OMPT_SIGNAL(s) ompt_signal(&s)
+// Atomically adds one to the shared counter *s.
+void ompt_signal(int *s)
+{
+  #pragma omp atomic
+  *s += 1;
+}
+                
+#define OMPT_WAIT(s,v) ompt_wait(&s,v)
+// Blocks until *s >= v; delays first, then re-reads *s atomically each round.
+void ompt_wait(int *s, int v)
+{
+  int seen;
+  for (;;) {
+    delay(10);
+    #pragma omp atomic read
+    seen = *s;
+    if (seen >= v) break;
+  }
+}
diff --git a/final/runtime/test/ompt/parallel/dynamic_enough_threads.c b/final/runtime/test/ompt/parallel/dynamic_enough_threads.c
new file mode 100644
index 0000000..4c340ba
--- /dev/null
+++ b/final/runtime/test/ompt/parallel/dynamic_enough_threads.c
@@ -0,0 +1,43 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+
+int main()
+{
+  omp_set_dynamic(1); // the runtime may pick a team smaller than requested
+
+  #pragma omp parallel num_threads(4)
+  {
+    print_ids(0);
+    print_ids(1);
+  }
+  print_fuzzy_address(1); // matched against codeptr_ra of parallel_begin
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+
+  // team-size of 1-4 is expected
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: parallel_data initially not null
+  // CHECK-NOT: 0: task_data initially not null
+  // CHECK-NOT: 0: thread_data initially not null
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]], team_size={{[1-4]}}
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/parallel/dynamic_not_enough_threads.c b/final/runtime/test/ompt/parallel/dynamic_not_enough_threads.c
new file mode 100644
index 0000000..f3a6e17
--- /dev/null
+++ b/final/runtime/test/ompt/parallel/dynamic_not_enough_threads.c
@@ -0,0 +1,43 @@
+// RUN: %libomp-compile && env OMP_THREAD_LIMIT=2 %libomp-run | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+
+int main()
+{
+  omp_set_dynamic(1); // the runtime may pick a team smaller than requested
+
+  #pragma omp parallel num_threads(4)
+  {
+    print_ids(0);
+    print_ids(1);
+  }
+  print_fuzzy_address(1); // matched against codeptr_ra of parallel_begin
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+
+  // OMP_THREAD_LIMIT=2 (see RUN line), so a team-size of 1-2 is expected
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: parallel_data initially not null
+  // CHECK-NOT: 0: task_data initially not null
+  // CHECK-NOT: 0: thread_data initially not null
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]], team_size={{[1-2]}}
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/parallel/max_active_levels_serialized.c b/final/runtime/test/ompt/parallel/max_active_levels_serialized.c
new file mode 100644
index 0000000..bbe73ef
--- /dev/null
+++ b/final/runtime/test/ompt/parallel/max_active_levels_serialized.c
@@ -0,0 +1,73 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=THREADS %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  omp_set_nested(1);
+  omp_set_max_active_levels(1); // one active level: the inner region serializes
+
+  #pragma omp parallel num_threads(2)
+  {
+    print_ids(0);
+    print_ids(1);
+    #pragma omp parallel num_threads(2)
+    {
+      print_ids(0);
+      print_ids(1);
+      print_ids(2);
+    }
+    print_fuzzy_address(1); // matched against the nested region's codeptr_ra
+  }
+  print_fuzzy_address(2); // matched against the outer region's codeptr_ra
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+
+  // THREADS: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: parallel_data initially not null
+  // CHECK-NOT: 0: task_data initially not null
+  // CHECK-NOT: 0: thread_data initially not null
+
+  // THREADS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=2, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
+  
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=2, codeptr_ra=[[NESTED_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=2, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/parallel/nested.c b/final/runtime/test/ompt/parallel/nested.c
new file mode 100644
index 0000000..d91597b
--- /dev/null
+++ b/final/runtime/test/ompt/parallel/nested.c
@@ -0,0 +1,298 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=THREADS %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7
+#define TEST_NEED_PRINT_FRAME_FROM_OUTLINED_FN
+#include "callback.h"
+#include <omp.h>
+#include <unistd.h>
+
+int main() // Nested-parallelism OMPT test: 4 outer threads each open an inner parallel region of 4; the expected event trace is spelled out in the FileCheck comments below.
+{
+  int condition=0; // shared variable handed to OMPT_SIGNAL/OMPT_WAIT in the inner region
+  omp_set_nested(1); // enable nested parallelism so the inner region can fork its own team
+  print_frame(0); // expected to produce the "__builtin_frame_address(0)=" line of the initial thread
+
+  #pragma omp parallel num_threads(4)
+  {
+    print_frame_from_outlined_fn(1); // this frame address is expected to equal the outer task's exit_frame
+    print_ids(0); // "task level 0" line: the outer implicit task
+    print_ids(1); // "task level 1" line: the parent task that encountered the outer region
+    print_frame(0);
+
+    // Synchronize the outer team so that all of its implicit task events are emitted before any nested region starts:
+    #pragma omp barrier
+    
+    #pragma omp parallel num_threads(4)
+    {
+      print_frame_from_outlined_fn(1); // this frame address is expected to equal the nested task's exit_frame
+      print_ids(0); // "task level 0" line: the nested implicit task
+      print_ids(1); // "task level 1" line: the outer implicit task
+      print_ids(2); // "task level 2" line: the parent task of the outer region
+      print_frame(0);
+      OMPT_SIGNAL(condition);
+      OMPT_WAIT(condition,16); // presumably blocks until all 16 (4 outer x 4 inner) threads have signalled; the macros are defined outside this file - TODO confirm
+      #pragma omp barrier
+      print_fuzzy_address(1); // expected to match the explicit barrier's codeptr_ra, ignoring its last two hex digits
+      print_ids(0); // after the explicit barrier the reenter_frame is expected to be null again
+    }
+    print_fuzzy_address(2); // expected to match the nested region's codeptr_ra, ignoring its last two hex digits
+    print_ids(0);
+  }
+  print_fuzzy_address(3); // expected to match the outer region's codeptr_ra, ignoring its last two hex digits
+
+  // Expectations for the first FileCheck pass (program output piped straight into FileCheck):
+  // The runtime must not report a failure to register any of the callbacks listed below.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // make sure the initial parallel, task and thread data pointers are not reported as non-null
+  // CHECK-NOT: 0: parallel_data initially not null
+  // CHECK-NOT: 0: task_data initially not null
+  // CHECK-NOT: 0: thread_data initially not null
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
+
+  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // Note: worker threads are not guaranteed to have called barrier_end and implicit_task_end before the master's parallel_end, so only their begin events are matched here.
+
+  // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  // Expectations for the second FileCheck pass (output filtered through the sort-threads helper first; presumably grouped per thread - TODO confirm):
+  // THREADS: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+  // THREADS: {{^}}[[MASTER_ID:[0-9]+]]: __builtin_frame_address(0)=[[MAIN_REENTER:0x[0-f]+]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=0x{{[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
+
+  // nested parallel masters: the initial thread first, then the three other outer-team threads, each of which starts its own nested region
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[MASTER_ID]]: __builtin_frame_address({{.}})=[[EXIT:0x[0-f]+]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=0x{{[0-f]+}}
+  // THREADS: {{^}}[[MASTER_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[EXIT]], parent_task_frame.reenter=0x{{[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[MASTER_ID]]: __builtin_frame_address({{.}})=[[NESTED_EXIT:0x[0-f]+]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_EXIT]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=0x{{[0-f]+}}
+  // THREADS: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=0x{{[0-f]+}}
+  // THREADS: {{^}}[[MASTER_ID]]: __builtin_frame_address(0)=[[NESTED_REENTER:0x[0-f]+]]
+  // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
+  // explicit barrier inside the nested region; the fuzzy_address line after it must match the barrier's codeptr_ra
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], codeptr_ra=[[BARRIER_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_EXIT]], reenter_frame=0x{{[0-f]+}}
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[BARRIER_RETURN_ADDRESS]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_EXIT]], reenter_frame=[[NULL]]
+  // implicit barrier at the end of the nested parallel region
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]], codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]], codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]]
+  // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // implicit barrier at the end of the outer parallel region
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // nested parallel worker threads: twelve identical sections follow (4 nested regions x 3 non-master threads each)
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // can't reliably tell which parallel region is the parent...
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // can't reliably tell which parallel region is the parent...
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // can't reliably tell which parallel region is the parent...
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // can't reliably tell which parallel region is the parent...
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // can't reliably tell which parallel region is the parent...
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // can't reliably tell which parallel region is the parent...
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // can't reliably tell which parallel region is the parent...
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // can't reliably tell which parallel region is the parent...
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // can't reliably tell which parallel region is the parent...
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // can't reliably tell which parallel region is the parent...
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // can't reliably tell which parallel region is the parent...
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/parallel/nested_lwt.c b/final/runtime/test/ompt/parallel/nested_lwt.c
new file mode 100644
index 0000000..8348376
--- /dev/null
+++ b/final/runtime/test/ompt/parallel/nested_lwt.c
@@ -0,0 +1,334 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=THREADS %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7
+#include "callback.h"
+#include <omp.h>
+#include <unistd.h>
+
+int main()
+{
+  omp_set_nested(1);
+  int condition = 0;
+
+  #pragma omp parallel num_threads(4)
+  {
+    print_ids(0);
+    print_ids(1);
+    // Synchronize so that all implicit task events are emitted before the nested parallel region starts:
+    #pragma omp barrier
+    #pragma omp parallel num_threads(1)
+    {
+      print_ids(0);
+      print_ids(1);
+      print_ids(2);
+      // Synchronize so that all implicit task events are emitted before the nested parallel region starts:
+      #pragma omp barrier
+      #pragma omp parallel num_threads(4)
+      {
+        print_ids(0);
+        print_ids(1);
+        print_ids(2);
+        print_ids(3);
+      OMPT_SIGNAL(condition);
+      OMPT_WAIT(condition,16);
+      }
+      print_fuzzy_address(1);
+    }
+    print_fuzzy_address(2);
+  }
+  print_fuzzy_address(3);
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: parallel_data initially not null
+  // CHECK-NOT: 0: task_data initially not null
+  // CHECK-NOT: 0: thread_data initially not null
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
+
+  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // Note that we cannot ensure that the worker threads have already called barrier_end and implicit_task_end before parallel_end!
+
+  // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+
+
+  // THREADS: 0: NULL_POINTER=[[NULL:.*$]]
+  // THREADS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=[[TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
+
+  // nested parallel masters
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=[[NESTED_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[NESTED_IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_NESTED_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_NESTED_TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[NESTED_NESTED_RETURN_ADDRESS]]
+  // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]]
+  // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[NESTED_IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_NESTED_TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_NESTED_RETURN_ADDRESS]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[NESTED_IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_NESTED_TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_NESTED_RETURN_ADDRESS]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[NESTED_IMPLICIT_TASK_ID]], parent_task_frame.exit=[[NESTED_NESTED_TASK_FRAME_EXIT]], parent_task_frame.reenter=[[NESTED_NESTED_TASK_FRAME_ENTER:0x[0-f]+]], parallel_id=[[NESTED_NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_NESTED_TASK_FRAME_EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame=[[NESTED_NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_NESTED_TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NESTED_TASK_FRAME_EXIT]], reenter_frame=[[NESTED_TASK_FRAME_ENTER]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_NESTED_RETURN_ADDRESS]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // nested parallel worker threads
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // can't reliably tell which parallel region is the parent...
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // can't reliably tell which parallel region is the parent...
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // can't reliably tell which parallel region is the parent...
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // can't reliably tell which parallel region is the parent...
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // can't reliably tell which parallel region is the parent...
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // can't reliably tell which parallel region is the parent...
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // can't reliably tell which parallel region is the parent...
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // can't reliably tell which parallel region is the parent...
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // can't reliably tell which parallel region is the parent...
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // can't reliably tell which parallel region is the parent...
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // can't reliably tell which parallel region is the parent...
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // can't reliably tell which parallel region is the parent...
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}
+  // THREADS: {{^}}[[THREAD_ID]]: task level 3: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[TASK_FRAME_ENTER]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/parallel/nested_serialized.c b/final/runtime/test/ompt/parallel/nested_serialized.c
new file mode 100644
index 0000000..f87b8f4
--- /dev/null
+++ b/final/runtime/test/ompt/parallel/nested_serialized.c
@@ -0,0 +1,128 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=THREADS %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  omp_set_nested(0); // nesting off; the patterns below expect each thread to run the inner region's implicit task itself
+  // Outer region of four threads; every one of them also encounters the inner region.
+  #pragma omp parallel num_threads(4)
+  {
+    print_ids(0); // presumably prints the "task level 0" line matched below (helper comes from callback.h)
+    print_ids(1); // likewise "task level 1": the task that encountered the outer region
+    #pragma omp parallel num_threads(4)
+    {
+      print_ids(0);
+      print_ids(1);
+      print_ids(2);
+    }
+    print_fuzzy_address(1); // for the master thread, expected to agree with the inner region's codeptr_ra
+  }
+  print_fuzzy_address(2); // expected to agree with the outer region's codeptr_ra
+  // From here to the return statement there are only expected-output patterns (plain comments to the compiler).
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: parallel_data initially not null
+  // CHECK-NOT: 0: task_data initially not null
+  // CHECK-NOT: 0: thread_data initially not null
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
+
+  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // Note that we cannot ensure that the worker threads have already called barrier_end and implicit_task_end before parallel_end!
+
+  // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+
+  // Patterns for the second invocation (output piped through %sort-threads): the master thread's events come first.
+  // THREADS: 0: NULL_POINTER=[[NULL:.*$]]
+  // THREADS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
+
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]]
+  // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  // The three outer-region workers follow; each group repeats the same event sequence for one worker thread.
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/parallel/nested_thread_num.c b/final/runtime/test/ompt/parallel/nested_thread_num.c
new file mode 100644
index 0000000..f14f87a
--- /dev/null
+++ b/final/runtime/test/ompt/parallel/nested_thread_num.c
@@ -0,0 +1,357 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=THREADS %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7
+#define TEST_NEED_PRINT_FRAME_FROM_OUTLINED_FN
+#include "callback.h"
+#include <omp.h>
+#include <unistd.h>
+
+int main() {
+  int condition = 0; // counter shared by all threads; driven by OMPT_SIGNAL / OMPT_WAIT below
+  omp_set_nested(1); // nesting on: the expected output has separate sections for inner-region worker threads
+  print_frame(0); // expected output captures this __builtin_frame_address(0) value as MAIN_REENTER
+  // Outer region: two threads, each of which later starts its own two-thread inner region.
+#pragma omp parallel num_threads(2)
+  {
+    print_frame_from_outlined_fn(1); // presumably enabled by TEST_NEED_PRINT_FRAME_FROM_OUTLINED_FN; value is captured as EXIT
+    print_ids(0); // presumably prints the "task level 0" line (helper comes from callback.h)
+    print_ids(1);
+    print_frame(0); // captured as REENTER in the expected output
+
+// get all implicit task events before starting nested:
+#pragma omp barrier
+
+#pragma omp parallel num_threads(2)
+    {
+      print_frame_from_outlined_fn(1); // captured as NESTED_EXIT in the expected output
+      print_ids(0);
+      print_ids(1);
+      print_ids(2);
+      print_frame(0); // captured as NESTED_REENTER in the expected output
+      OMPT_SIGNAL(condition);
+      OMPT_WAIT(condition, 4); // presumably waits for all 2x2 inner-region threads to signal -- TODO confirm in callback.h
+#pragma omp barrier
+      print_fuzzy_address(1); // expected to agree with the explicit barrier's codeptr_ra
+      print_ids(0); // expected output: reenter_frame is the null pointer again after the barrier
+    }
+    print_fuzzy_address(2); // expected to agree with the inner region's codeptr_ra
+    print_ids(0);
+  }
+  print_fuzzy_address(3); // expected to agree with the outer region's codeptr_ra
+
+  return 0;
+}
+// Check if libomp supports the callbacks for this test.
+// CHECK-NOT: {{^}}0: Could not register callback
+
+// CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+// make sure initial data pointers are null
+// CHECK-NOT: 0: parallel_data initially not null
+// CHECK-NOT: 0: task_data initially not null
+// CHECK-NOT: 0: thread_data initially not null
+
+// CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin:
+// CHECK-SAME: parent_task_id=[[PARENT_TASK_ID:[0-9]+]],
+// CHECK-SAME: parent_task_frame.exit=[[NULL]],
+// CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}},
+// CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]],
+// CHECK-SAME: requested_team_size=2,
+// CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}},
+// CHECK-SAME: invoker=[[PARALLEL_INVOKER:[0-9]+]]
+
+// CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin:
+// CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end:
+
+// Note that we cannot ensure that the worker threads have already called
+// barrier_end and implicit_task_end before parallel_end!
+
+// CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin:
+// CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin:
+
+
+// CHECK: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], 
+// CHECK-SAME: task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+// CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+// THREADS: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+// THREADS: __builtin_frame_address(0)=[[MAIN_REENTER:0x[0-f]+]]
+// THREADS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin:
+// THREADS-SAME: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], 
+// THREADS-SAME: parent_task_frame.exit=[[NULL]],
+// THREADS-SAME: parent_task_frame.reenter=0x{{[0-f]+}},
+// THREADS-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=2,
+// THREADS-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}},
+// THREADS-SAME: invoker=[[PARALLEL_INVOKER:[0-9]+]]
+
+// nested parallel masters
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin:
+// THREADS-SAME: parallel_id=[[PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]],
+// THREADS-SAME: team_size=2, thread_num=0
+
+// THREADS: __builtin_frame_address({{.}})=[[EXIT:0x[0-f]+]]
+
+// THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]],
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], 
+// THREADS-SAME: reenter_frame=[[NULL]], 
+// THREADS-SAME: thread_num=0
+
+// THREADS: {{^}}[[MASTER_ID]]: task level 1:
+// THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], 
+// THREADS-SAME: task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], 
+// THREADS-SAME: reenter_frame=0x{{[0-f]+}}
+
+// THREADS: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]]
+
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin:
+// THREADS-SAME: parent_task_id=[[IMPLICIT_TASK_ID]], 
+// THREADS-SAME: parent_task_frame.exit=[[EXIT]],
+// THREADS-SAME: parent_task_frame.reenter=0x{{[0-f]+}},
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], 
+// THREADS-SAME: requested_team_size=2,
+// THREADS-SAME: codeptr_ra=[[NESTED_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}},
+// THREADS-SAME: invoker=[[PARALLEL_INVOKER]]
+
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]],
+// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]], team_size=2, 
+// THREADS-SAME: thread_num=0
+
+// THREADS: __builtin_frame_address({{.}})=[[NESTED_EXIT:0x[0-f]+]]
+
+// THREADS: {{^}}[[MASTER_ID]]: task level 0:
+// THREADS-SAME:  parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]],
+// THREADS-SAME: exit_frame=[[NESTED_EXIT]], reenter_frame=[[NULL]], 
+// THREADS-SAME: thread_num=0
+
+// THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[PARALLEL_ID]],
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]],
+// THREADS-SAME: reenter_frame=0x{{[0-f]+}}
+
+// THREADS: {{^}}[[MASTER_ID]]: task level 2:
+// THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], 
+// THREADS-SAME: reenter_frame=0x{{[0-f]+}}
+
+// THREADS: __builtin_frame_address(0)=[[NESTED_REENTER:0x[0-f]+]]
+
+// THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
+// explicit barrier
+
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]],
+// THREADS-SAME: codeptr_ra=[[BARRIER_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+
+// THREADS: {{^}}[[MASTER_ID]]: task level 0:
+// THREADS-SAME:  parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]],
+// THREADS-SAME: exit_frame=[[NESTED_EXIT]], reenter_frame=0x{{[0-f]+}}
+
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]]
+
+// THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[BARRIER_RETURN_ADDRESS]]
+
+// THREADS: {{^}}[[MASTER_ID]]: task level 0:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]],
+// THREADS-SAME: exit_frame=[[NESTED_EXIT]], reenter_frame=[[NULL]]
+
+// implicit barrier
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]],
+// THREADS-SAME: codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}
+
+// THREADS: {{^}}[[MASTER_ID]]: task level 0:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]],
+// THREADS-SAME: exit_frame=[[NULL]], reenter_frame=[[NULL]]
+
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end:
+// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]],
+// THREADS-SAME: codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}
+
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end:
+// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]],
+// THREADS-SAME: invoker=[[PARALLEL_INVOKER]],
+// THREADS-SAME: codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}}
+
+// THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]]
+
+// THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
+
+// THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]],
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], 
+// THREADS-SAME: reenter_frame=[[NULL]]
+
+// implicit barrier
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin:
+// THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]],
+// THREADS-SAME: codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+
+// THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]],
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], 
+// THREADS-SAME: reenter_frame=[[NULL]]
+
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end:
+// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]],
+// THREADS-SAME: codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end:
+// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+// THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end:
+// THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]],
+// THREADS-SAME: invoker=[[PARALLEL_INVOKER]], 
+// THREADS-SAME: codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+
+// THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+// Worker of first nesting level
+
+// THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin:
+// THREADS-SAME: parallel_id=[[PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]], team_size=2, 
+// THREADS-SAME: thread_num=[[OUTER_THREADNUM:[0-9]+]]
+
+// THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]],
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]], 
+// THREADS-SAME: thread_num=[[OUTER_THREADNUM]]
+
+// THREADS: {{^}}[[THREAD_ID]]: task level 1:
+// THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[PARENT_TASK_ID]]
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_begin:
+// THREADS-SAME: parent_task_id=[[IMPLICIT_TASK_ID]], 
+// THREADS-SAME: parent_task_frame.exit={{0x[0-f]+}},
+// THREADS-SAME: parent_task_frame.reenter={{0x[0-f]+}},
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=2,
+// THREADS-SAME: codeptr_ra=[[NESTED_RETURN_ADDRESS]]{{[0-f][0-f]}},
+// THREADS-SAME: invoker=[[PARALLEL_INVOKER]]
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]],
+// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]], team_size=2,
+// THREADS-SAME: thread_num=[[INNER_THREADNUM:[0-9]+]]
+
+// THREADS: {{^}}[[THREAD_ID]]: task level 0:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]],
+// THREADS-SAME: thread_num=[[INNER_THREADNUM]]
+
+// THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]],
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]], 
+// THREADS-SAME: thread_num=[[OUTER_THREADNUM]]
+
+// THREADS: {{^}}[[THREAD_ID]]: task level 2:
+// THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[PARENT_TASK_ID]]
+
+// THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[NESTED_IMPLICIT_TASK_ID]]
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end:
+// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end:
+// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_parallel_end:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+
+// THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin:
+// THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end:
+// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end:
+// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+// nested parallel worker threads
+
+// THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]],
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+// THREADS-SAME: thread_num=[[THREADNUM:[0-9]+]]
+
+// THREADS: {{^}}[[THREAD_ID]]: task level 0:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]]
+// THREADS-SAME: thread_num=[[THREADNUM]]
+
+// can't reliably tell which parallel region is the parent...
+
+// THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}},
+// THREADS-SAME: task_id={{[0-9]+}}
+// THREADS-SAME: thread_num={{[01]}}
+
+// THREADS: {{^}}[[THREAD_ID]]: task level 2:
+// THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[PARENT_TASK_ID]]
+// THREADS-SAME: thread_num=0
+
+// THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]]
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end:
+// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end:
+// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+// other nested parallel worker threads
+
+// THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]],
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+// THREADS-SAME: thread_num=[[THREADNUM:[0-9]+]]
+
+// THREADS: {{^}}[[THREAD_ID]]: task level 0:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]]
+// THREADS-SAME: thread_num=[[THREADNUM]]
+
+// can't reliably tell which parallel region is the parent...
+
+// THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id={{[0-9]+}},
+// THREADS-SAME: task_id={{[0-9]+}}
+// THREADS-SAME: thread_num={{[01]}}
+
+// THREADS: {{^}}[[THREAD_ID]]: task level 2:
+// THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[PARENT_TASK_ID]]
+// THREADS-SAME: thread_num=0
+
+// THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin:
+// THREADS-SAME: parallel_id=[[NESTED_PARALLEL_ID]], 
+// THREADS-SAME: task_id=[[IMPLICIT_TASK_ID]]
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end:
+// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+// THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end:
+// THREADS-SAME: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
diff --git a/final/runtime/test/ompt/parallel/nested_threadnum.c b/final/runtime/test/ompt/parallel/nested_threadnum.c
new file mode 100644
index 0000000..a248530
--- /dev/null
+++ b/final/runtime/test/ompt/parallel/nested_threadnum.c
@@ -0,0 +1,62 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+#include <omp.h>
+#include "callback.h"
+
+int main() {
+  omp_set_nested(1); // nesting on: the expected output below lists four distinct thread ids
+#pragma omp parallel num_threads(2)
+  {
+#pragma omp barrier
+#pragma omp parallel num_threads(2)
+    { print_frame(0); } // this output is not matched by any pattern below
+  }
+  // Expected output, one section per thread; each implicit_task_end must repeat the thread_num captured at its begin.
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]]
+  // CHECK-SAME: thread_num=[[OUTER_THREAD_NUM1:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin:
+  // CHECK-SAME: parallel_id=[[INNER_PARALLEL_ID1:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin:
+  // CHECK-SAME: parallel_id=[[INNER_PARALLEL_ID1]]
+  // CHECK-SAME: thread_num=[[INNER_THREAD_NUM1:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
+  // CHECK-SAME: thread_num=[[INNER_THREAD_NUM1]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end:
+  // CHECK-SAME: parallel_id=[[INNER_PARALLEL_ID1]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
+  // CHECK-SAME: thread_num=[[OUTER_THREAD_NUM1]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]]
+  // Outer-region worker: same shape as the master, with its own inner region nested inside its outer implicit task.
+  // CHECK: {{^}}[[WORKER_ID1:[0-9]+]]: ompt_event_implicit_task_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]]
+  // CHECK-SAME: thread_num=[[OUTER_THREAD_NUM2:[0-9]+]]
+  // CHECK: {{^}}[[WORKER_ID1]]: ompt_event_parallel_begin:
+  // CHECK-SAME: parallel_id=[[INNER_PARALLEL_ID2:[0-9]+]]
+  // CHECK: {{^}}[[WORKER_ID1]]: ompt_event_implicit_task_begin:
+  // CHECK-SAME: parallel_id=[[INNER_PARALLEL_ID2]]
+  // CHECK-SAME: thread_num=[[INNER_THREAD_NUM2:[0-9]+]]
+  // CHECK: {{^}}[[WORKER_ID1]]: ompt_event_implicit_task_end
+  // CHECK-SAME: thread_num=[[INNER_THREAD_NUM2]]
+  // CHECK: {{^}}[[WORKER_ID1]]: ompt_event_parallel_end:
+  // CHECK-SAME: parallel_id=[[INNER_PARALLEL_ID2]]
+  // CHECK: {{^}}[[WORKER_ID1]]: ompt_event_implicit_task_end
+  // CHECK-SAME: thread_num=[[OUTER_THREAD_NUM2]]
+  // The two remaining threads (presumably the inner-region workers) each show a single implicit task begin/end pair.
+  // CHECK: {{^}}[[WORKER_ID2:[0-9]+]]: ompt_event_implicit_task_begin:
+  // CHECK-SAME: thread_num=[[INNER_THREAD_NUM3:[0-9]+]]
+  // CHECK: {{^}}[[WORKER_ID2]]: ompt_event_implicit_task_end
+  // CHECK-SAME: thread_num=[[INNER_THREAD_NUM3]]
+
+  // CHECK: {{^}}[[WORKER_ID3:[0-9]+]]: ompt_event_implicit_task_begin:
+  // CHECK-SAME: thread_num=[[INNER_THREAD_NUM4:[0-9]+]]
+  // CHECK: {{^}}[[WORKER_ID3]]: ompt_event_implicit_task_end
+  // CHECK-SAME: thread_num=[[INNER_THREAD_NUM4]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/parallel/no_thread_num_clause.c b/final/runtime/test/ompt/parallel/no_thread_num_clause.c
new file mode 100644
index 0000000..5583036
--- /dev/null
+++ b/final/runtime/test/ompt/parallel/no_thread_num_clause.c
@@ -0,0 +1,96 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=THREADS %s
+// REQUIRES: ompt
+#include "callback.h"
+
+int main()
+{
+  omp_set_num_threads(4); // the pragma below has no num_threads clause
+  #pragma omp parallel
+  {
+    print_ids(0); // presumably prints the "task level 0" line matched below
+    print_ids(1); // presumably prints the "task level 1" line matched below
+  }
+  print_fuzzy_address(1); // presumably prints the fuzzy_address line matched below
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // Make sure the initial data pointers are null.
+  // CHECK-NOT: 0: parallel_data initially not null
+  // CHECK-NOT: 0: task_data initially not null
+  // CHECK-NOT: 0: thread_data initially not null
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
+
+  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // Note that we cannot ensure that the worker threads have already called barrier_end and implicit_task_end before parallel_end!
+
+  // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+
+  // The directives below use the THREADS prefix and see the output after sort-threads.
+  // THREADS: 0: NULL_POINTER=[[NULL:.*$]]
+  // THREADS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_thread_begin: thread_type=ompt_thread_initial=1, thread_id=[[MASTER_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_initial_task_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, actual_parallelism=1, index=1, flags=1
+
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker={{[0-9]+}}
+
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/parallel/normal.c b/final/runtime/test/ompt/parallel/normal.c
new file mode 100644
index 0000000..2cc9ce1
--- /dev/null
+++ b/final/runtime/test/ompt/parallel/normal.c
@@ -0,0 +1,132 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// RUN: %libomp-compile-and-run | %sort-threads \
+// RUN:                         | FileCheck --check-prefix=THREADS %s
+// REQUIRES: ompt
+#include "callback.h"
+
+int main() {
+#pragma omp parallel num_threads(4)
+  {
+    print_ids(0); // presumably prints the "task level 0" line checked below
+    print_ids(1); // presumably prints the "task level 1" line checked below
+  }
+  print_fuzzy_address(1); // NOTE(review): its output is not matched below
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // Make sure the initial data pointers are null.
+  // CHECK-NOT: 0: parallel_data initially not null
+  // CHECK-NOT: 0: task_data initially not null
+  // CHECK-NOT: 0: thread_data initially not null
+
+  // Only check callback names, arguments are verified in THREADS below.
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin
+
+  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin
+  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
+
+  // Note that we cannot ensure that the worker threads have already called
+  // barrier_end and implicit_task_end before parallel_end!
+
+  // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin
+
+  // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin
+
+  // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end
+
+  // THREADS: 0: NULL_POINTER=[[NULL:.*$]]
+  // THREADS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_thread_begin
+  // THREADS-SAME: thread_type=ompt_thread_initial=1, thread_id=[[MASTER_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin
+  // THREADS-SAME: parent_task_id=[[PARENT_TASK_ID:[0-9]+]]
+  // THREADS-SAME: parent_task_frame.exit=[[NULL]]
+  // THREADS-SAME: parent_task_frame.reenter={{0x[0-f]+}}
+  // THREADS-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4
+  // THREADS-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin
+  // THREADS-SAME: parallel_id=[[PARALLEL_ID]]
+  // THREADS-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0
+  // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 1
+  // THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]]
+  // THREADS-SAME: task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin
+  // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS-SAME: codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end
+  // parallel_id is 0 because the region ended in the barrier!
+  // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS-SAME: codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
+  // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin
+  // THREADS-SAME: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin
+  // THREADS-SAME: parallel_id=[[PARALLEL_ID]]
+  // THREADS-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0
+  // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1
+  // THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]]
+  // THREADS-SAME: task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin
+  // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end
+  // parallel_id is 0 because the region ended in the barrier!
+  // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin
+  // THREADS-SAME: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin
+  // THREADS-SAME: parallel_id=[[PARALLEL_ID]]
+  // THREADS-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0
+  // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1
+  // THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]]
+  // THREADS-SAME: task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin
+  // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end
+  // parallel_id is 0 because the region ended in the barrier!
+  // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin
+  // THREADS-SAME: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin
+  // THREADS-SAME: parallel_id=[[PARALLEL_ID]]
+  // THREADS-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0
+  // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1
+  // THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]]
+  // THREADS-SAME: task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin
+  // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end
+  // parallel_id is 0 because the region ended in the barrier!
+  // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/parallel/not_enough_threads.c b/final/runtime/test/ompt/parallel/not_enough_threads.c
new file mode 100644
index 0000000..8a0469a
--- /dev/null
+++ b/final/runtime/test/ompt/parallel/not_enough_threads.c
@@ -0,0 +1,90 @@
+// RUN: %libomp-compile && env OMP_THREAD_LIMIT=2 %libomp-run | FileCheck %s
+// RUN: %libomp-compile && env OMP_THREAD_LIMIT=2 %libomp-run | %sort-threads \
+// RUN:     | FileCheck --check-prefix=THREADS %s
+
+// REQUIRES: ompt
+
+#include "callback.h"
+
+int main() {
+#pragma omp parallel num_threads(4)
+  { // run commands set OMP_THREAD_LIMIT=2; one worker is verified below
+    print_ids(0); // presumably prints the "task level 0" line checked below
+    print_ids(1); // presumably prints the "task level 1" line checked below
+  }
+  print_fuzzy_address(1); // presumably prints the fuzzy_address line below
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback
+
+  // Make sure initial data pointers are null.
+  // CHECK-NOT: 0: parallel_data initially not null
+  // CHECK-NOT: 0: task_data initially not null
+  // CHECK-NOT: 0: thread_data initially not null
+
+  // Only check callback names, arguments are verified in THREADS below.
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin
+
+  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin
+  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
+
+  // Note that we cannot ensure that the worker threads have already called
+  // barrier_end and implicit_task_end before parallel_end!
+
+  // CHECK-DAG: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin
+  // CHECK-DAG: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end
+
+  // THREADS: 0: NULL_POINTER=[[NULL:.*$]]
+  // THREADS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_thread_begin
+  // THREADS-SAME: thread_type=ompt_thread_initial=1, thread_id=[[MASTER_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin
+  // THREADS-SAME: parent_task_id=[[PARENT_TASK_ID:[0-9]+]]
+  // THREADS-SAME: parent_task_frame.exit=[[NULL]]
+  // THREADS-SAME: parent_task_frame.reenter={{0x[0-f]+}}
+  // THREADS-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4
+  // THREADS-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin
+  // THREADS-SAME: parallel_id=[[PARALLEL_ID]]
+  // THREADS-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 0
+  // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: task level 1
+  // THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]]
+  // THREADS-SAME: task_id=[[PARENT_TASK_ID]]
+
+  // THREADS-NOT: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin
+  // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // parallel_id is 0 because the region ended in the barrier!
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_barrier_end
+  // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
+  // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: ompt_event_parallel_end
+  // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]]
+  // THREADS: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  // THREADS: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_thread_begin
+  // THREADS-SAME: thread_type=ompt_thread_worker=2, thread_id=[[THREAD_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_begin
+  // THREADS-SAME: parallel_id=[[PARALLEL_ID]]
+  // THREADS-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 0
+  // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: task level 1
+  // THREADS-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]]
+  // THREADS-SAME: task_id=[[PARENT_TASK_ID]]
+  // THREADS-NOT: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin
+  // THREADS-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // parallel_id is 0 because the region ended in the barrier!
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_barrier_end
+  // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]]
+  // THREADS: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // THREADS-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/parallel/parallel_if0.c b/final/runtime/test/ompt/parallel/parallel_if0.c
new file mode 100644
index 0000000..63d6701
--- /dev/null
+++ b/final/runtime/test/ompt/parallel/parallel_if0.c
@@ -0,0 +1,76 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7
+#include "callback.h"
+
+int main()
+{
+//  print_frame(0);
+  #pragma omp parallel if(0)
+  { // outer region with if(0); requested_team_size=1 is expected below
+//    print_frame(1);
+    print_ids(0);
+    print_ids(1);
+//    print_frame(0);
+    #pragma omp parallel if(0)
+    { // nested region with if(0); requested_team_size=1 is expected below
+//      print_frame(1);
+      print_ids(0);
+      print_ids(1);
+      print_ids(2);
+//      print_frame(0);
+      #pragma omp task
+      {
+//        print_frame(1);
+        print_ids(0);
+        print_ids(1);
+        print_ids(2);
+        print_ids(3); // NOTE(review): no "task level 3" line is matched below
+      }
+    }
+    print_fuzzy_address(1); // presumably prints the line matched against the nested codeptr_ra
+  }
+  print_fuzzy_address(2); // presumably prints the line matched against the outer codeptr_ra
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // Make sure the initial data pointers are null.
+  // CHECK-NOT: 0: parallel_data initially not null
+  // CHECK-NOT: 0: task_data initially not null
+  // CHECK-NOT: 0: thread_data initially not null
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=[[NESTED_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame={{0x[0-f]+}}
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[NESTED_IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id=[[EXPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[NESTED_IMPLICIT_TASK_ID]], second_task_id=[[EXPLICIT_TASK_ID]], prior_task_status=ompt_task_switch=7
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[EXPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[EXPLICIT_TASK_ID]], second_task_id=[[NESTED_IMPLICIT_TASK_ID]], prior_task_status=ompt_task_complete=1
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_end: task_id=[[EXPLICIT_TASK_ID]]
+
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=0, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[NESTED_RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]]
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/parallel/serialized.c b/final/runtime/test/ompt/parallel/serialized.c
new file mode 100644
index 0000000..2be17dc
--- /dev/null
+++ b/final/runtime/test/ompt/parallel/serialized.c
@@ -0,0 +1,77 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7
+#include "callback.h"
+
+int main()
+{
+//  print_frame(0);
+  #pragma omp parallel num_threads(1)
+  { // outer region with one thread; requested_team_size=1 is expected below
+//    print_frame(1);
+    print_ids(0);
+    print_ids(1);
+//    print_frame(0);
+    #pragma omp parallel num_threads(1)
+    { // nested region with one thread; requested_team_size=1 is expected below
+//      print_frame(1);
+      print_ids(0);
+      print_ids(1);
+      print_ids(2);
+//      print_frame(0);
+      #pragma omp task
+      {
+//        print_frame(1);
+        print_ids(0);
+        print_ids(1);
+        print_ids(2);
+        print_ids(3); // NOTE(review): no "task level 3" line is matched below
+      }
+    }
+    print_fuzzy_address(1); // presumably prints the line matched against the inner codeptr_ra
+  }
+  print_fuzzy_address(2); // presumably prints the line matched against the outer codeptr_ra
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // Make sure the initial data pointers are null.
+  // CHECK-NOT: 0: parallel_data initially not null
+  // CHECK-NOT: 0: task_data initially not null
+  // CHECK-NOT: 0: thread_data initially not null
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=[[OUTER_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[NESTED_PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=[[INNER_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker=[[PARALLEL_INVOKER]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame={{0x[0-f]+}}
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[NESTED_IMPLICIT_TASK_ID]], parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id=[[EXPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[NESTED_IMPLICIT_TASK_ID]], second_task_id=[[EXPLICIT_TASK_ID]], prior_task_status=ompt_task_switch=7
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[EXPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[NESTED_IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: task level 2: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}, reenter_frame={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[EXPLICIT_TASK_ID]], second_task_id=[[NESTED_IMPLICIT_TASK_ID]], prior_task_status=ompt_task_complete=1
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_end: task_id=[[EXPLICIT_TASK_ID]]
+
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=0, task_id=[[NESTED_IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[NESTED_PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], invoker=[[PARALLEL_INVOKER]], codeptr_ra=[[INNER_RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[INNER_RETURN_ADDRESS]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], invoker=[[PARALLEL_INVOKER]], codeptr_ra=[[OUTER_RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[OUTER_RETURN_ADDRESS]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/synchronization/barrier/explicit.c b/final/runtime/test/ompt/synchronization/barrier/explicit.c
new file mode 100644
index 0000000..d60acd6
--- /dev/null
+++ b/final/runtime/test/ompt/synchronization/barrier/explicit.c
@@ -0,0 +1,58 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  int x = 0; // never read back; the atomic updates only surround the barrier
+
+  #pragma omp parallel num_threads(2)
+  {
+    #pragma omp atomic
+    x++;
+
+    #pragma omp barrier
+    print_current_address(); // presumably prints the current_address line matched below
+
+    #pragma omp atomic
+    x++;
+  }
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // master thread explicit barrier
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+
+  // master thread implicit barrier at parallel end
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}}
+
+
+  // worker thread explicit barrier
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[THREAD_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+
+  // worker thread implicit barrier at parallel end (codeptr_ra is expected to equal the NULL_POINTER value)
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/synchronization/barrier/for_loop.c b/final/runtime/test/ompt/synchronization/barrier/for_loop.c
new file mode 100644
index 0000000..5259447
--- /dev/null
+++ b/final/runtime/test/ompt/synchronization/barrier/for_loop.c
@@ -0,0 +1,56 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  int y[] = {0,1,2,3};
+
+  #pragma omp parallel num_threads(2)
+  {
+    //implicit barrier at end of for loop (the loop directive carries no nowait clause)
+    int i;
+    #pragma omp for
+    for (i = 0; i < 4; i++)
+    {
+      y[i]++;
+    }
+    print_current_address();
+  }
+  // Expected trace, per thread: first the barrier at the end of the loop, then the barrier at the end of the region.
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // master thread implicit barrier at loop end 
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+
+  // master thread implicit barrier at parallel end
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+
+
+  // worker thread implicit barrier at loop end
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+
+  // worker thread implicit barrier after parallel
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/synchronization/barrier/for_simd.c b/final/runtime/test/ompt/synchronization/barrier/for_simd.c
new file mode 100644
index 0000000..351b2c2
--- /dev/null
+++ b/final/runtime/test/ompt/synchronization/barrier/for_simd.c
@@ -0,0 +1,33 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+// XFAIL: gcc-4
+
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  int y[] = {0,1,2,3};
+  // The loop directive below is orphaned: this program has no enclosing parallel construct.
+  int i;
+  #pragma omp for simd
+  for (i = 0; i < 4; i++)
+  {
+    y[i]++;
+  }
+
+  // Only one thread's barrier events (at the end of the simd loop) are expected below.
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // master thread implicit barrier at simd loop end 
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/synchronization/barrier/implicit_task_data.c b/final/runtime/test/ompt/synchronization/barrier/implicit_task_data.c
new file mode 100644
index 0000000..71c2b15
--- /dev/null
+++ b/final/runtime/test/ompt/synchronization/barrier/implicit_task_data.c
@@ -0,0 +1,154 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+
+// This test checks that values stored in task_data in a barrier_begin event
+// are still present in the corresponding barrier_end event.
+// Therefore, callback implementations different from the ones in callback.h are necessary.
+// This is a test for an issue reported in 
+// https://github.com/OpenMPToolsInterface/LLVM-openmp/issues/39
+
+#define _BSD_SOURCE
+#include <stdio.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <omp.h>
+#include <omp-tools.h>
+
+static const char* ompt_thread_t_values[] = { // printable names, indexed by the ompt_thread_t value passed to the thread_begin callback
+  NULL, // no name at index 0; presumably the ompt_thread_t enumerators start at 1 — confirm in omp-tools.h
+  "ompt_thread_initial",
+  "ompt_thread_worker",
+  "ompt_thread_other"
+};
+// Runtime entry points; both are filled in by ompt_initialize() through the lookup function.
+static ompt_get_unique_id_t ompt_get_unique_id;
+static ompt_get_thread_data_t ompt_get_thread_data;
+
+int main()
+{
+  #pragma omp parallel num_threads(4)
+  {
+    #pragma omp master
+    {
+      sleep(1); // delays only the master thread; presumably so the workers reach the region-end barrier first — confirm
+    }
+  }
+
+  // The expectations below require task_id to stay the same from barrier_begin through barrier_end on each thread.
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // master thread implicit barrier at parallel end
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id=0, task_id=[[TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]*}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id=0, task_id=[[TASK_ID]], codeptr_ra={{0x[0-f]*}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id=0, task_id=[[TASK_ID]], codeptr_ra={{0x[0-f]*}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id=0, task_id=[[TASK_ID]], codeptr_ra={{0x[0-f]*}}
+
+
+  // worker thread implicit barrier at parallel end
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id=0, task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id=0, task_id=[[TASK_ID]], codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id=0, task_id=[[TASK_ID]], codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=0, task_id=[[TASK_ID]], codeptr_ra=[[NULL]]
+
+  return 0;
+}
+
+// thread_begin callback: stores a fresh unique id in thread_data and logs the thread's type and id.
+static void on_ompt_callback_thread_begin(ompt_thread_t thread_type, ompt_data_t *thread_data)
+{
+  // Look the name up only for values the table covers: slot 0 is NULL and the table has no entry past ompt_thread_other.
+  const char *type_name = (thread_type > 0 && (size_t)thread_type < sizeof(ompt_thread_t_values) / sizeof(ompt_thread_t_values[0])) ? ompt_thread_t_values[thread_type] : "ompt_thread_unknown";
+  if(thread_data->ptr)
+    printf("%s\n", "0: thread_data initially not null");
+  thread_data->value = ompt_get_unique_id();
+  printf("%" PRIu64 ": ompt_event_thread_begin: thread_type=%s=%d, thread_id=%" PRIu64 "\n", ompt_get_thread_data()->value, type_name, thread_type, thread_data->value);
+}
+
+// sync_region callback: tags the task with a new id on every begin and logs implicit-barrier begin/end events.
+static void on_ompt_callback_sync_region(ompt_sync_region_t kind,
+                                         ompt_scope_endpoint_t endpoint,
+                                         ompt_data_t *parallel_data,
+                                         ompt_data_t *task_data,
+                                         const void *codeptr_ra)
+{
+  const int is_implicit_barrier = (kind == ompt_sync_region_barrier_implicit);
+  if (endpoint == ompt_scope_begin) {
+    // The id is stored for every kind of region; the test expects the end event to still report it.
+    task_data->value = ompt_get_unique_id();
+    if (is_implicit_barrier)
+      printf("%" PRIu64 ": ompt_event_barrier_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
+  } else if (endpoint == ompt_scope_end) {
+    // parallel_data is treated as optional at the end of the region; a NULL pointer is logged as id 0.
+    if (is_implicit_barrier) {
+      uint64_t parallel_id = parallel_data ? parallel_data->value : 0;
+      printf("%" PRIu64 ": ompt_event_barrier_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_id, task_data->value, codeptr_ra);
+    }
+  }
+}
+
+// sync_region_wait callback: logs when a thread starts and stops waiting in an implicit barrier.
+static void on_ompt_callback_sync_region_wait(ompt_sync_region_t kind,
+                                              ompt_scope_endpoint_t endpoint,
+                                              ompt_data_t *parallel_data,
+                                              ompt_data_t *task_data,
+                                              const void *codeptr_ra)
+{
+  // Nothing is printed for any other kind of sync region.
+  if (kind != ompt_sync_region_barrier_implicit)
+    return;
+
+  if (endpoint == ompt_scope_begin) {
+    printf("%" PRIu64 ": ompt_event_wait_barrier_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n",
+           ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
+  } else if (endpoint == ompt_scope_end) {
+    // parallel_data is treated as optional at the end of the wait; a NULL pointer is logged as id 0.
+    uint64_t parallel_id = parallel_data ? parallel_data->value : 0;
+    printf("%" PRIu64
+           ": ompt_event_wait_barrier_end: parallel_id=%" PRIu64
+           ", task_id=%" PRIu64 ", codeptr_ra=%p\n",
+           ompt_get_thread_data()->value, parallel_id,
+           task_data->value, codeptr_ra);
+  }
+}
+// register_callback_t(name, type): binds on_<name> to a local of the given callback type, registers it through the in-scope ompt_set_callback, and prints a "0: Could not register callback" line when the result is ompt_set_never.
+#define register_callback_t(name, type)                       \
+do{                                                           \
+  type f_##name = &on_##name;                                 \
+  if (ompt_set_callback(name, (ompt_callback_t)f_##name) ==   \
+      ompt_set_never)                                         \
+    printf("0: Could not register callback '" #name "'\n");   \
+}while(0)
+// register_callback(name): the same, for callbacks whose type is spelled <name>_t.
+#define register_callback(name) register_callback_t(name, name##_t)
+
+// Tool initializer: resolves the runtime entry points, registers the three callbacks and prints the null-pointer marker.
+int ompt_initialize(ompt_function_lookup_t lookup, ompt_data_t *tool_data)
+{
+  // The register_callback macros expand to calls through a variable with exactly this name.
+  ompt_set_callback_t ompt_set_callback = (ompt_set_callback_t) lookup("ompt_set_callback");
+  ompt_get_unique_id = (ompt_get_unique_id_t) lookup("ompt_get_unique_id");
+  ompt_get_thread_data = (ompt_get_thread_data_t) lookup("ompt_get_thread_data");
+  register_callback(ompt_callback_sync_region);
+  register_callback_t(ompt_callback_sync_region_wait, ompt_callback_sync_region_t);
+  register_callback(ompt_callback_thread_begin);
+  // Printed so the expectations in main() can capture how %p renders a null pointer on this platform.
+  printf("0: NULL_POINTER=%p\n", (void*)NULL);
+  return 1; //success
+}
+
+// Tool finalizer: only logs the runtime-shutdown marker; tool_data is not used.
+void ompt_finalize(ompt_data_t *tool_data) {
+  printf("0: ompt_event_runtime_shutdown\n");
+}
+
+// ompt_start_tool: hands the runtime this tool's initialize/finalize pair (see the OMPT chapter of the OpenMP specification); both arguments are ignored.
+ompt_start_tool_result_t* ompt_start_tool(unsigned int omp_version, const char *runtime_version)
+{
+  // Static storage, because the returned pointer has to remain valid after this function returns.
+  static ompt_start_tool_result_t tool_result = {&ompt_initialize, &ompt_finalize, 0};
+  return &tool_result;
+}
diff --git a/final/runtime/test/ompt/synchronization/barrier/parallel_region.c b/final/runtime/test/ompt/synchronization/barrier/parallel_region.c
new file mode 100644
index 0000000..ea0a23f
--- /dev/null
+++ b/final/runtime/test/ompt/synchronization/barrier/parallel_region.c
@@ -0,0 +1,40 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  int x = 0;
+
+  //implicit barrier at end of a parallel region
+  #pragma omp parallel num_threads(2)
+  {
+    #pragma omp atomic
+    x++;
+  }
+  print_fuzzy_address();
+  // The fuzzy address printed above is what the master's barrier codeptr_ra is compared with below.
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // master thread implicit barrier at parallel end
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+
+  // worker thread implicit barrier at parallel end
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/synchronization/barrier/sections.c b/final/runtime/test/ompt/synchronization/barrier/sections.c
new file mode 100644
index 0000000..4e1dfdd
--- /dev/null
+++ b/final/runtime/test/ompt/synchronization/barrier/sections.c
@@ -0,0 +1,63 @@
+// RUN: %libomp-compile-and-run | %sort-threads  | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  int x = 0;
+
+  #pragma omp parallel num_threads(2)
+  {
+    //implicit barrier after sections with nowait but with lastprivates (NOTE(review): no nowait/lastprivate clause appears below — confirm this line is still wanted)
+    //implicit barrier at end of sections
+    #pragma omp sections
+    {
+      #pragma omp section 
+      {
+        #pragma omp atomic
+        x++;
+      }
+      
+      #pragma omp section 
+      {
+        #pragma omp atomic
+        x++;
+      }
+    }
+  }
+
+  // Expected trace, per thread: first the barrier at the end of the sections construct, then the barrier at the end of the region.
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // master thread implicit barrier at sections end
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+
+  // master thread implicit barrier at parallel end
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+
+
+  // worker thread implicit barrier at sections end
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+
+  // worker thread implicit barrier at parallel end
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/synchronization/barrier/single.c b/final/runtime/test/ompt/synchronization/barrier/single.c
new file mode 100644
index 0000000..8ba8b52
--- /dev/null
+++ b/final/runtime/test/ompt/synchronization/barrier/single.c
@@ -0,0 +1,61 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  int x = 0;
+
+  #pragma omp parallel num_threads(2)
+  {
+    //implicit barrier at end of single
+    #pragma omp single
+    {
+      x++;
+    }
+    print_fuzzy_address();
+    //critical section to avoid merge of two barriers into one
+    #pragma omp critical
+    {
+      x++;
+    }
+  }
+  // The fuzzy address each thread prints after the single is what that thread's single-end barrier codeptr_ra is compared with below.
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // master thread implicit barrier at single end
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  // master thread implicit barrier at parallel end
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+
+
+  // worker thread implicit barrier at single end
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  // worker thread implicit barrier at parallel end
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_wait_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[NULL]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/synchronization/critical.c b/final/runtime/test/ompt/synchronization/critical.c
new file mode 100644
index 0000000..ed982b7
--- /dev/null
+++ b/final/runtime/test/ompt/synchronization/critical.c
@@ -0,0 +1,32 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  #pragma omp critical
+  {
+    print_current_address(1);
+    print_ids(0);
+  }
+  print_current_address(2);
+  // Address (1) is compared with the acquired event's codeptr_ra, address (2) with the release event's.
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_nest_lock'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_wait_critical: wait_id=[[WAIT_ID:[0-9]+]], hint={{[0-9]+}}, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_critical: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_critical: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/synchronization/flush.c b/final/runtime/test/ompt/synchronization/flush.c
new file mode 100644
index 0000000..287d035
--- /dev/null
+++ b/final/runtime/test/ompt/synchronization/flush.c
@@ -0,0 +1,30 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// GCC generates code that does not call the runtime for the flush construct
+// XFAIL: gcc
+
+#include "callback.h"
+#include <omp.h>
+
+int main() { // two threads each execute a flush and then print the current address
+#pragma omp parallel num_threads(2)
+  {
+    int tid = omp_get_thread_num();
+    // NOTE(review): tid is never read in this function — confirm whether it can be dropped.
+#pragma omp flush
+    print_current_address(1);
+  }
+
+  return 0;
+}
+// Check if libomp supports the callbacks for this test.
+// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_flush'
+
+// CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+// CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_flush:
+// CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+// CHECK: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+//
+// CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_flush:
+// CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+// CHECK: {{^}}[[THREAD_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
diff --git a/final/runtime/test/ompt/synchronization/lock.c b/final/runtime/test/ompt/synchronization/lock.c
new file mode 100644
index 0000000..2a934ee
--- /dev/null
+++ b/final/runtime/test/ompt/synchronization/lock.c
@@ -0,0 +1,44 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  //need to use an OpenMP construct so that OMPT will be initialized
+  #pragma omp parallel num_threads(1)
+    print_ids(0);
+
+  omp_lock_t lock;
+  printf("%" PRIu64 ": &lock: %" PRIu64 "\n", ompt_get_thread_data()->value, (ompt_wait_id_t)(uintptr_t) &lock);
+  omp_init_lock(&lock);
+  print_fuzzy_address(1);
+  omp_set_lock(&lock);
+  print_fuzzy_address(2);
+  omp_unset_lock(&lock);
+  print_fuzzy_address(3);
+  omp_destroy_lock(&lock);
+  print_fuzzy_address(4);
+  // Each print_fuzzy_address(n) above follows one lock call; the matching event's codeptr_ra is compared with it below.
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_nest_lock'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: &lock: [[WAIT_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_init_lock: wait_id=[[WAIT_ID]], hint={{[0-9]+}}, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_lock: wait_id=[[WAIT_ID]], hint={{[0-9]+}}, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_destroy_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+ 
+  return 0;
+}
diff --git a/final/runtime/test/ompt/synchronization/master.c b/final/runtime/test/ompt/synchronization/master.c
new file mode 100644
index 0000000..8cc2d46
--- /dev/null
+++ b/final/runtime/test/ompt/synchronization/master.c
@@ -0,0 +1,38 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+// GCC generates code that does not call the runtime for the master construct
+// XFAIL: gcc
+
+#include "callback.h"
+#include <omp.h>
+
+int main() {
+  int x = 0;
+#pragma omp parallel num_threads(2)
+  {
+#pragma omp master
+    {
+      print_fuzzy_address(1);
+      x++;
+    }
+    print_current_address(2); // address right after the master block; compared with the master_end codeptr_ra
+  }
+  // x is modified only inside the master block; the "x=" line is printed but no expectation in this file matches it.
+  printf("%" PRIu64 ": x=%d\n", ompt_get_thread_data()->value, x);
+
+  return 0;
+}
+
+// Check if libomp supports the callbacks for this test.
+// CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_master'
+
+// CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+// CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_master_begin:
+// CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]], task_id=[[TASK_ID:[0-9]+]],
+// CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+// CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+// CHECK: {{^}}[[MASTER_ID]]: ompt_event_master_end:
+// CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]],
+// CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS_END:0x[0-f]+]]
+// CHECK: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS_END]]
diff --git a/final/runtime/test/ompt/synchronization/nest_lock.c b/final/runtime/test/ompt/synchronization/nest_lock.c
new file mode 100644
index 0000000..159048e
--- /dev/null
+++ b/final/runtime/test/ompt/synchronization/nest_lock.c
@@ -0,0 +1,52 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  //need to use an OpenMP construct so that OMPT will be initialized
+  #pragma omp parallel num_threads(1)
+    print_ids(0);
+  // The lock address is logged as a wait id; it is printed with PRIu64, the same conversion lock.c uses for this cast.
+  omp_nest_lock_t nest_lock;
+  printf("%" PRIu64 ": &nest_lock: %" PRIu64 "\n", ompt_get_thread_data()->value, (ompt_wait_id_t)(uintptr_t) &nest_lock);
+  omp_init_nest_lock(&nest_lock);
+  print_fuzzy_address(1);
+  omp_set_nest_lock(&nest_lock);
+  print_fuzzy_address(2);
+  omp_set_nest_lock(&nest_lock);
+  print_fuzzy_address(3);
+  omp_unset_nest_lock(&nest_lock);
+  print_fuzzy_address(4);
+  omp_unset_nest_lock(&nest_lock);
+  print_fuzzy_address(5);
+  omp_destroy_nest_lock(&nest_lock);
+  print_fuzzy_address(6);
+  // The lock is taken twice and released twice, so first/next and prev/last events are expected below.
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_nest_lock'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_init_nest_lock: wait_id=[[WAIT_ID:[0-9]+]], hint={{[0-9]+}}, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_nest_lock: wait_id=[[WAIT_ID]], hint={{[0-9]+}}, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_nest_lock_first: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_nest_lock: wait_id=[[WAIT_ID]], hint={{[0-9]+}}, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_nest_lock_next: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_nest_lock_prev: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_nest_lock_last: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_destroy_nest_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/synchronization/ordered.c b/final/runtime/test/ompt/synchronization/ordered.c
new file mode 100644
index 0000000..14284a4
--- /dev/null
+++ b/final/runtime/test/ompt/synchronization/ordered.c
@@ -0,0 +1,32 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  #pragma omp ordered
+  {
+    print_current_address(1);
+    print_ids(0);
+  }
+  print_current_address(2);
+  
+  // Address (1) is compared with the acquired event's codeptr_ra, address (2) with the release event's.
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_nest_lock'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_wait_ordered: wait_id=[[WAIT_ID:[0-9]+]], hint={{[0-9]+}}, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_ordered: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_ordered: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+  
+  return 0;
+}
diff --git a/final/runtime/test/ompt/synchronization/taskgroup.c b/final/runtime/test/ompt/synchronization/taskgroup.c
new file mode 100644
index 0000000..7309c0a
--- /dev/null
+++ b/final/runtime/test/ompt/synchronization/taskgroup.c
@@ -0,0 +1,49 @@
+// RUN:  %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7
+
+#include "callback.h"
+#include <unistd.h>  
+#include <stdio.h>
+
+int main()
+{
+  // x is incremented by the single task created inside the taskgroup.
+  int x=0;
+  #pragma omp parallel num_threads(2)
+  {
+    #pragma omp master
+    {
+      #pragma omp taskgroup
+      {
+        print_current_address(1);
+        #pragma omp task
+        {
+          #pragma omp atomic
+          x++;
+        }
+      }
+      print_current_address(2);
+    }
+  }
+
+  // Address (1) is compared with the taskgroup_begin codeptr_ra, address (2) with the taskgroup_end codeptr_ra.
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_master'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_cancel'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_thread_begin'
+
+
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_taskgroup_begin: parallel_id=[[PARALLEL_ID:[0-9]+]], task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_taskgroup_begin: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_taskgroup_end: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_taskgroup_end: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/synchronization/taskwait.c b/final/runtime/test/ompt/synchronization/taskwait.c
new file mode 100644
index 0000000..c431024
--- /dev/null
+++ b/final/runtime/test/ompt/synchronization/taskwait.c
@@ -0,0 +1,36 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  int x = 0; // written only by the single explicit task below
+  #pragma omp parallel num_threads(2)
+  {
+    #pragma omp master
+    {
+      #pragma omp task
+      {
+        x++;
+      }
+      #pragma omp taskwait
+      print_current_address(1); // must stay right after the taskwait: its address is matched against codeptr_ra
+    }
+  }
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_sync_region_wait'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+  // All four taskwait events come from one thread with one codeptr_ra; the address line must follow directly.
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_taskwait_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_taskwait_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_taskwait_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: ompt_event_taskwait_end: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: current_address={{.*}}[[RETURN_ADDRESS]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/synchronization/test_lock.c b/final/runtime/test/ompt/synchronization/test_lock.c
new file mode 100644
index 0000000..d24e4d6
--- /dev/null
+++ b/final/runtime/test/ompt/synchronization/test_lock.c
@@ -0,0 +1,54 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  omp_lock_t lock;
+  omp_init_lock(&lock);
+  print_fuzzy_address(1);
+  // Each lock call is followed by print_fuzzy_address so its codeptr_ra can be matched below.
+  omp_test_lock(&lock); // lock is free: wait + acquired events are expected
+  print_fuzzy_address(2);
+  omp_unset_lock(&lock);
+  print_fuzzy_address(3);
+
+  omp_set_lock(&lock);
+  print_fuzzy_address(4);
+  omp_test_lock(&lock); // lock already held: only the wait event is expected
+  print_fuzzy_address(5);
+  omp_unset_lock(&lock);
+  print_fuzzy_address(6);
+
+  omp_destroy_lock(&lock);
+  print_fuzzy_address(7);
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_nest_lock'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+  // RETURN_ADDRESS captures codeptr_ra minus its last two hex digits and is re-captured for every call.
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_init_lock: wait_id=[[WAIT_ID:[0-9]+]], hint=0, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_lock: wait_id=[[WAIT_ID]], hint=0, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_lock: wait_id=[[WAIT_ID]], hint=0, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_lock: wait_id=[[WAIT_ID]], hint=0, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_destroy_lock: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/synchronization/test_nest_lock.c b/final/runtime/test/ompt/synchronization/test_nest_lock.c
new file mode 100644
index 0000000..ad02d32
--- /dev/null
+++ b/final/runtime/test/ompt/synchronization/test_nest_lock.c
@@ -0,0 +1,42 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  omp_nest_lock_t nest_lock;
+  omp_init_nest_lock(&nest_lock);
+  // Free lock: test acquires it at the first level, unset releases it completely.
+  omp_test_nest_lock(&nest_lock);
+  omp_unset_nest_lock(&nest_lock);
+  // set takes the first level, test by the owner nests one deeper, then two unsets unwind.
+  omp_set_nest_lock(&nest_lock);
+  omp_test_nest_lock(&nest_lock);
+  omp_unset_nest_lock(&nest_lock);
+  omp_unset_nest_lock(&nest_lock);
+
+  omp_destroy_nest_lock(&nest_lock);
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_nest_lock'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+  // Every event must carry the wait_id captured from the init event; return addresses are not compared here.
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_init_nest_lock: wait_id=[[WAIT_ID:[0-9]+]], hint=0, impl={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}  
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_nest_lock: wait_id=[[WAIT_ID]], hint=0, impl={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_nest_lock_first: wait_id=[[WAIT_ID]], codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_nest_lock_last: wait_id=[[WAIT_ID]], codeptr_ra={{0x[0-f]+}}  
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_nest_lock: wait_id=[[WAIT_ID]], hint=0, impl={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}  
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_nest_lock_first: wait_id=[[WAIT_ID]], codeptr_ra={{0x[0-f]+}}  
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_nest_lock: wait_id=[[WAIT_ID]], hint=0, impl={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_nest_lock_next: wait_id=[[WAIT_ID]], codeptr_ra={{0x[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_nest_lock_prev: wait_id=[[WAIT_ID]], codeptr_ra={{0x[0-f]+}}  
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_nest_lock_last: wait_id=[[WAIT_ID]], codeptr_ra={{0x[0-f]+}}  
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/synchronization/test_nest_lock_parallel.c b/final/runtime/test/ompt/synchronization/test_nest_lock_parallel.c
new file mode 100644
index 0000000..e9240f7
--- /dev/null
+++ b/final/runtime/test/ompt/synchronization/test_nest_lock_parallel.c
@@ -0,0 +1,60 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  omp_nest_lock_t nest_lock;
+  omp_init_nest_lock(&nest_lock);
+  // The master takes the lock first; after the barrier both threads try omp_test_nest_lock.
+  #pragma omp parallel num_threads(2)
+  {
+    #pragma omp master
+    {
+      omp_set_nest_lock(&nest_lock);
+      print_fuzzy_address(1);
+    }
+    #pragma omp barrier
+    omp_test_nest_lock(&nest_lock); //should fail for non-master
+    print_fuzzy_address(2);
+    #pragma omp barrier
+    #pragma omp master
+    {
+      omp_unset_nest_lock(&nest_lock); // undoes the nested level taken by the master's test
+      print_fuzzy_address(3);
+      omp_unset_nest_lock(&nest_lock); // releases the level taken by omp_set_nest_lock
+      print_fuzzy_address(4);
+    }
+  }
+
+  omp_destroy_nest_lock(&nest_lock);
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_nest_lock'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+  // Master thread: first-level acquire, nested acquire, then release of the nested and the last level.
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_wait_nest_lock: wait_id=[[WAIT_ID:[0-9]+]], hint=0, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_nest_lock_first: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_nest_lock: wait_id=[[WAIT_ID]], hint=0, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_acquired_nest_lock_next: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS]]{{[0-f][0-f]}}
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_nest_lock_prev: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_release_nest_lock_last: wait_id=[[WAIT_ID]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK-NEXT: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_destroy_nest_lock: wait_id=[[WAIT_ID]]
+  // Second thread: its test only reports the wait event; no acquired event may precede its address line.
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_wait_nest_lock: wait_id=[[WAIT_ID]], hint=0, impl={{[0-9]+}}, codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK-NOT: {{^}}[[THREAD_ID]]: ompt_event_acquired_nest_lock_next: wait_id=[[WAIT_ID]]
+  // CHECK-NEXT: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/tasks/dependences.c b/final/runtime/test/ompt/tasks/dependences.c
new file mode 100644
index 0000000..57b61f9
--- /dev/null
+++ b/final/runtime/test/ompt/tasks/dependences.c
@@ -0,0 +1,61 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7
+
+#include "callback.h"
+#include <omp.h>   
+#include <math.h>
+#include <unistd.h>
+
+int main()
+{
+  int x = 0; // the variable both tasks declare a dependence on
+  #pragma omp parallel num_threads(2)
+  {
+    #pragma omp master
+    {  
+      print_ids(0);
+      #pragma omp task depend(out:x)
+      {
+        x++;
+        delay(100); // NOTE(review): presumably keeps the first task unfinished while the second is created - confirm in callback.h
+      }
+      print_fuzzy_address(1); // matched against the codeptr_ra of the first task_create event
+      print_ids(0);
+    
+      #pragma omp task depend(in:x)
+      {
+        x = -1;
+      }
+      print_ids(0);
+    }
+  }
+
+  x++;
+
+  // Expected output: two dependent task creations on the master thread plus one dependence pair.
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_dependences'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_dependence'
+  
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: new_task_data initially not null
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT:0x[0-f]+]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[EXIT]], parent_task_frame.reenter={{0x[0-f]+}}, new_task_id=[[FIRST_TASK:[0-f]+]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, task_type=ompt_task_explicit=4, has_dependences=yes
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_dependences: task_id=[[FIRST_TASK]], deps={{0x[0-f]+}}, ndeps=1
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // Second task: it must be reported as depending on the first one.
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[EXIT]], parent_task_frame.reenter={{0x[0-f]+}}, new_task_id=[[SECOND_TASK:[0-f]+]], codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit=4, has_dependences=yes
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_dependences: task_id=[[SECOND_TASK]], deps={{0x[0-f]+}}, ndeps=1
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_dependence_pair: first_task_id=[[FIRST_TASK]], second_task_id=[[SECOND_TASK]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/tasks/explicit_task.c b/final/runtime/test/ompt/tasks/explicit_task.c
new file mode 100644
index 0000000..a986c48
--- /dev/null
+++ b/final/runtime/test/ompt/tasks/explicit_task.c
@@ -0,0 +1,102 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7
+#define TEST_NEED_PRINT_FRAME_FROM_OUTLINED_FN
+#include "callback.h"
+#include <omp.h> 
+
+int main()
+{
+  int condition=0; // handshake variable shared with the explicit task
+  omp_set_nested(0);
+  print_frame(0);
+  #pragma omp parallel num_threads(2)
+  {
+    print_frame_from_outlined_fn(1);
+    print_ids(0);
+    print_ids(1);
+    print_frame(0);
+    #pragma omp master
+    {
+      print_ids(0);
+      #pragma omp task shared(condition)
+      {
+        OMPT_SIGNAL(condition);
+        print_frame(1);
+        print_ids(0);
+        print_ids(1);
+        print_ids(2);
+      }
+      print_fuzzy_address(1); // matched against the codeptr_ra of the task_create event
+      OMPT_WAIT(condition,1); // NOTE(review): presumably blocks until the task has signalled - confirm in callback.h
+      print_ids(0);
+    }
+    #pragma omp barrier
+    print_ids(0);
+  }
+
+  // Expected output: the master creates the task, the other thread runs it inside the explicit barrier.
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+
+
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: new_task_data initially not null
+  
+  // CHECK--doesnotwork: {{^}}[[MASTER_ID:[0-9]+]]: __builtin_frame_address(0)=[[MAIN_REENTER:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=0x{{[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=2, codeptr_ra=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
+  // nested parallel masters
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address({{.}})=[[EXIT:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=0x{{[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // <- ompt_event_task_create would be expected here
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[EXIT]], parent_task_frame.reenter=0x{{[0-f]+}}, new_task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // explicit barrier after master
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=0x{{[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // implicit barrier parallel
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // Second thread: IMPLICIT_TASK_ID, EXIT and REENTER are re-captured for this thread's own implicit task.
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address({{.}})=[[EXIT:0x[0-f]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=0x{{[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=0x{{[0-f]+}}
+  // this is expected to come earlier and at MASTER:
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[IMPLICIT_TASK_ID]], second_task_id=[[TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(1)=[[TASK_EXIT:0x[0-f]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], exit_frame=[[TASK_EXIT]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=0x{{[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=0x{{[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[TASK_ID]], second_task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_end: task_id=[[TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/tasks/serialized.c b/final/runtime/test/ompt/tasks/serialized.c
new file mode 100644
index 0000000..a2c102a
--- /dev/null
+++ b/final/runtime/test/ompt/tasks/serialized.c
@@ -0,0 +1,152 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7
+#define TEST_NEED_PRINT_FRAME_FROM_OUTLINED_FN
+#include "callback.h"
+#include <omp.h>
+#include <math.h>
+
+int main() {
+  omp_set_nested(0);
+  print_frame(0);
+#pragma omp parallel num_threads(2)
+  {
+    print_frame_from_outlined_fn(1);
+    print_ids(0);
+    print_ids(1);
+    print_frame(0);
+#pragma omp master
+    {
+      print_ids(0);
+      void *creator_frame = get_frame_address(0);
+      int t = (int)sin(0.1); // truncates to 0, so the if clause below is false at run time
+#pragma omp task if (t)
+      {
+        void *task_frame = get_frame_address(0);
+        if (creator_frame == task_frame) {
+          // Assume this code was inlined which the compiler is allowed to do.
+          print_frame(0);
+        } else {
+          // The exit frame must be our parent!
+          print_frame_from_outlined_fn(1);
+        }
+        print_ids(0);
+        print_ids(1);
+        print_ids(2);
+      }
+      print_fuzzy_address(1); // matched against the codeptr_ra of the task_create event
+      print_ids(0);
+    }
+    print_ids(0);
+  }
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback
+
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: new_task_data initially not null
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_initial_task_begin: parallel_id={{[0-9]+}}
+  // CHECK-SAME: task_id={{[0-9]+}}, actual_parallelism=1, index=1, flags=1 
+
+  // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address(0)
+  // CHECK-SAME: =[[MAIN_REENTER:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin
+  // CHECK-SAME: parent_task_id=[[PARENT_TASK_ID:[0-9]+]]
+  // CHECK-SAME: parent_task_frame.exit=[[NULL]]
+  // CHECK-SAME: parent_task_frame.reenter=0x{{[0-f]+}}
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=2
+  // CHECK-SAME: codeptr_ra=0x{{[0-f]+}}, invoker={{[0-9]+}}
+
+  // nested parallel masters
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]]
+  // CHECK-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address
+  // CHECK-SAME: =[[EXIT:0x[0-f]+]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK-SAME: exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1
+  // CHECK-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]]
+  // CHECK-SAME: task_id=[[PARENT_TASK_ID]],
+  // CHECK-SAME: exit_frame=[[NULL]], reenter_frame=0x{{[0-f]+}}
+
+  // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create
+  // CHECK-SAME: parent_task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK-SAME: parent_task_frame.exit=[[EXIT]]
+  // CHECK-SAME: parent_task_frame.reenter=0x{{[0-f]+}}
+  // CHECK-SAME: new_task_id=[[TASK_ID:[0-9]+]]
+  // CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // The task is expected to be scheduled, run and ended on the master thread itself.
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule:
+  // CHECK-SAME: first_task_id=[[IMPLICIT_TASK_ID]], second_task_id=[[TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address
+  // CHECK-SAME: =[[TASK_EXIT:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]]
+  // CHECK-SAME: exit_frame=[[TASK_EXIT]], reenter_frame=[[NULL]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK-SAME: exit_frame=[[EXIT]], reenter_frame=0x{{[0-f]+}}
+
+  // CHECK: {{^}}[[MASTER_ID]]: task level 2
+  // CHECK-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]]
+  // CHECK-SAME: task_id=[[PARENT_TASK_ID]]
+  // CHECK-SAME: exit_frame=[[NULL]], reenter_frame=0x{{[0-f]+}}
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule
+  // CHECK-SAME: first_task_id=[[TASK_ID]], second_task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_end: task_id=[[TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[RETURN_ADDRESS]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK-SAME: exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+
+  // implicit barrier parallel
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK-SAME: exit_frame=[[NULL]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end
+  // parallel_id is 0 because the region ended in the barrier!
+  // CHECK-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end
+  // CHECK-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]]
+  // Second thread: no task events are expected here, only its implicit task and the join barrier.
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]]
+  // CHECK-SAME: task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address
+  // CHECK-SAME: =[[EXIT:0x[0-f]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK-SAME: exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 1
+  // CHECK-SAME: parallel_id=[[IMPLICIT_PARALLEL_ID]]
+  // CHECK-SAME: task_id=[[PARENT_TASK_ID]]
+  // CHECK-SAME: exit_frame=[[NULL]], reenter_frame=0x{{[0-f]+}}
+
+  // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(0)={{0x[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK-SAME: exit_frame=[[NULL]], reenter_frame=[[NULL]]
+  // parallel_id is 0 because the region ended in the barrier!
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end
+  // CHECK-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end
+  // CHECK-SAME: parallel_id=0, task_id=[[IMPLICIT_TASK_ID]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/tasks/task_in_joinbarrier.c b/final/runtime/test/ompt/tasks/task_in_joinbarrier.c
new file mode 100644
index 0000000..8228add
--- /dev/null
+++ b/final/runtime/test/ompt/tasks/task_in_joinbarrier.c
@@ -0,0 +1,91 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+#define TEST_NEED_PRINT_FRAME_FROM_OUTLINED_FN
+#include "callback.h"
+#include <omp.h> 
+
+int main()
+{
+  int condition=0; // handshake variable shared with the explicit task
+  omp_set_nested(0);
+  print_frame(0);
+  #pragma omp parallel num_threads(2)
+  {
+    print_frame_from_outlined_fn(1);
+    print_ids(0);
+    print_ids(1);
+    print_frame(0);
+    #pragma omp master
+    {
+      print_ids(0);
+      #pragma omp task shared(condition)
+      {
+        OMPT_SIGNAL(condition);
+        print_frame(1);
+        print_ids(0);
+        print_ids(1);
+        print_ids(2);
+      }
+      OMPT_WAIT(condition,1); // NOTE(review): presumably blocks until the task has signalled - confirm in callback.h
+      print_ids(0);
+    }
+    print_ids(0); // no explicit barrier follows, so the next barrier is the one ending the region
+  }
+
+  // Expected output: the master creates the task, the other thread runs it inside the implicit barrier.
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+
+
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: new_task_data initially not null
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: __builtin_frame_address(0)=[[MAIN_REENTER:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=0x{{[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=2, codeptr_ra=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
+  // nested parallel masters
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address({{.}})=[[EXIT:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=0x{{[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // <- ompt_event_task_create would be expected here
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[EXIT]], parent_task_frame.reenter=0x{{[0-f]+}}, new_task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[TASK_FUNCTION:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // implicit barrier parallel
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // Second thread: IMPLICIT_TASK_ID, EXIT and REENTER are re-captured for this thread's own implicit task.
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address({{.}})=[[EXIT:0x[0-f]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=0x{{[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]]
+  // implicit barrier parallel
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[IMPLICIT_TASK_ID]], second_task_id=[[TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(1)=[[TASK_EXIT:0x[0-f]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], exit_frame=[[TASK_EXIT]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=0x{{[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[TASK_ID]], second_task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_end: task_id=[[TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/tasks/task_memory.c b/final/runtime/test/ompt/tasks/task_memory.c
new file mode 100644
index 0000000..a48cef2
--- /dev/null
+++ b/final/runtime/test/ompt/tasks/task_memory.c
@@ -0,0 +1,108 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7
+#define USE_PRIVATE_TOOL 1
+#include "callback.h"
+#include <omp.h>
+
+int main() {
+  int x = 0; // give x a defined value before the tasks read it
+#pragma omp parallel num_threads(2)
+  {
+#pragma omp master
+    {
+#pragma omp task
+      { x++; } // no data-sharing clause on this task
+#pragma omp task firstprivate(x)
+      { x++; } // increments the task's firstprivate copy
+    }
+  }
+
+  return 0;
+}
+
+static void on_ompt_callback_implicit_task(ompt_scope_endpoint_t endpoint,
+                                           ompt_data_t *parallel_data,
+                                           ompt_data_t *task_data,
+                                           unsigned int team_size,
+                                           unsigned int thread_num, int flag) {
+  void *addr = NULL; // keeps its initial value unless the runtime fills it in
+  size_t size = 0;
+  int result = ompt_get_task_memory(&addr, &size, 0); // third argument: 0
+  switch (endpoint) {
+  case ompt_scope_begin:
+    task_data->value = ompt_get_unique_id(); // printed again by the end event
+    printf("ompt_event_implicit_task_begin: task_id=%" PRIu64
+           ", memory_addr=%p, memory_size=%zu, result=%d \n", // %zu: size_t
+           task_data->value, addr, size, result);
+    break;
+  case ompt_scope_end:
+    printf("ompt_event_implicit_task_end: task_id=%" PRIu64
+           ", memory_addr=%p, memory_size=%zu, result=%d \n", // %zu: size_t
+           task_data->value, addr, size, result);
+    break;
+  }
+}
+
+// Gives every newly created non-initial task a unique id and prints it.
+// Task memory is not queried here; the implicit-task/schedule hooks do that.
+static void
+on_ompt_callback_task_create(ompt_data_t *encountering_task_data,
+                             const ompt_frame_t *encountering_task_frame,
+                             ompt_data_t *new_task_data, int flags,
+                             int has_dependences, const void *codeptr_ra) {
+  if (flags & ompt_task_initial)
+    return; // not interested in the initial task
+  new_task_data->value = ompt_get_unique_id();
+  printf("ompt_event_task_create: task_id=%" PRIu64 "\n", new_task_data->value);
+}
+
+static void on_ompt_callback_task_schedule(ompt_data_t *first_task_data,
+                                           ompt_task_status_t prior_task_status,
+                                           ompt_data_t *second_task_data) {
+  void *addr = NULL; // keeps its initial value unless the runtime fills it in
+  size_t size = 0;
+  int result = ompt_get_task_memory(&addr, &size, 0); // third argument: 0
+  printf("ompt_event_task_schedule: task_id=%" PRIu64
+         ", memory_addr=%p, memory_size=%zu, result=%d\n", // %zu: size_t
+         first_task_data->value, addr, size, result); // id of the first task
+}
+
+int ompt_initialize(ompt_function_lookup_t lookup, int initial_device_num,
+                    ompt_data_t *tool_data) {
+  ompt_set_callback = (ompt_set_callback_t)lookup("ompt_set_callback");
+  ompt_get_unique_id = (ompt_get_unique_id_t)lookup("ompt_get_unique_id");
+  ompt_get_task_memory = (ompt_get_task_memory_t)lookup("ompt_get_task_memory");
+  // Entry points are resolved above; now hook the three events this test logs.
+  register_callback(ompt_callback_implicit_task);
+  register_callback(ompt_callback_task_create);
+  register_callback(ompt_callback_task_schedule);
+  printf("0: NULL_POINTER=%p\n", (void *)NULL); // captured as [[NULL]] by CHECK
+  return 1; // success
+}
+
+void ompt_finalize(ompt_data_t *tool_data) {} // intentionally empty
+
+ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
+                                          const char *runtime_version) {
+  static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,
+                                                            &ompt_finalize, 0};
+  return &ompt_start_tool_result; // static storage: valid after this returns
+}
+
+// CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+
+// CHECK: ompt_event_implicit_task_begin: task_id=[[TASK_ID:[0-9]+]]
+// CHECK-SAME: memory_addr=[[NULL]], memory_size=0, result=0
+
+// CHECK: ompt_event_task_create: task_id=[[TASK_ID_0:[0-9]+]]
+// CHECK: ompt_event_task_create: task_id=[[TASK_ID_1:[0-9]+]]
+
+// Expects non-zero address, size, and result. Keep each directive on a single
+// line: FileCheck ignores text wrapped onto a following plain comment line.
+// clang-format off
+// CHECK-DAG: ompt_event_task_schedule: task_id=[[TASK_ID_0]], memory_addr=0x{{[0-f]+}}, memory_size={{[1-9][0-9]*}}, result=1
+// CHECK-DAG: ompt_event_task_schedule: task_id=[[TASK_ID_1]], memory_addr=0x{{[0-f]+}}, memory_size={{[1-9][0-9]*}}, result=1
+
+// CHECK: ompt_event_implicit_task_end: task_id=[[TASK_ID]]
+// CHECK-SAME: memory_addr=[[NULL]], memory_size=0, result=0
diff --git a/final/runtime/test/ompt/tasks/task_types.c b/final/runtime/test/ompt/tasks/task_types.c
new file mode 100644
index 0000000..1522635
--- /dev/null
+++ b/final/runtime/test/ompt/tasks/task_types.c
@@ -0,0 +1,220 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+#include <math.h>
+
+int main() {
+  //initialize the OpenMP runtime
+  omp_get_num_threads();
+
+  // initial task
+  print_ids(0);
+  // x is only ever incremented below; start it from a defined value.
+  int x = 0;
+// implicit task
+#pragma omp parallel num_threads(1)
+  {
+    print_ids(0);
+    x++;
+  }
+
+#pragma omp parallel num_threads(2)
+  {
+// explicit task
+#pragma omp single
+#pragma omp task
+    {
+      print_ids(0);
+      x++;
+    }
+// explicit task with undeferred
+#pragma omp single
+#pragma omp task if (0)
+    {
+      print_ids(0);
+      x++;
+    }
+
+// explicit task with untied
+#pragma omp single
+#pragma omp task untied
+    {
+      // Output of thread_id is needed to know on which thread task is executed
+      printf("%" PRIu64 ": explicit_untied\n", ompt_get_thread_data()->value);
+      print_ids(0);
+      print_frame(1);
+      x++;
+#pragma omp taskyield
+      printf("%" PRIu64 ": explicit_untied(2)\n",
+             ompt_get_thread_data()->value);
+      print_ids(0);
+      print_frame(1);
+      x++;
+#pragma omp taskwait
+      printf("%" PRIu64 ": explicit_untied(3)\n",
+             ompt_get_thread_data()->value);
+      print_ids(0);
+      print_frame(1);
+      x++;
+    }
+// explicit task with final
+#pragma omp single
+#pragma omp task final(1)
+    {
+      print_ids(0);
+      x++;
+// nested explicit task with final and undeferred
+#pragma omp task
+      {
+        print_ids(0);
+        x++;
+      }
+    }
+
+    // Mergeable task test deactivated for now
+    // explicit task with mergeable
+    /*
+    #pragma omp task mergeable if((int)sin(0))
+    {
+      print_ids(0);
+      x++;
+    }
+    */
+
+    // TODO: merged task
+  }
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_initial_task_begin: parallel_id={{[0-9]+}}
+  // CHECK-SAME: task_id=[[INITIAL_TASK_ID:[0-9]+]], actual_parallelism=1, index=1, flags=1
+
+  // CHECK-NOT: 0: parallel_data initially not null
+
+  // initial task
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id={{[0-9]+}}
+  // CHECK-SAME: task_id=[[INITIAL_TASK_ID]], exit_frame=[[NULL]]
+  // CHECK-SAME: reenter_frame=[[NULL]]
+  // CHECK-SAME: task_type=ompt_task_initial=1, thread_num=0
+
+  // implicit task
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id={{[0-9]+}}
+  // CHECK-SAME: task_id={{[0-9]+}}, exit_frame={{0x[0-f]+}}
+  // CHECK-SAME: reenter_frame=[[NULL]]
+  // CHECK-SAME: task_type=ompt_task_implicit|ompt_task_undeferred=134217730
+  // CHECK-SAME: thread_num=0
+
+  // explicit task
+  // CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}
+  // CHECK-SAME: parent_task_frame.exit={{0x[0-f]+}}
+  // CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}}
+  // CHECK-SAME: new_task_id=[[EXPLICIT_TASK_ID:[0-9]+]]
+  // CHECK-SAME: codeptr_ra={{0x[0-f]+}}
+  // CHECK-SAME: task_type=ompt_task_explicit=4
+  // CHECK-SAME: has_dependences=no
+
+  // CHECK: [[THREAD_ID_1:[0-9]+]]: ompt_event_task_schedule:
+  // CHECK-SAME: second_task_id=[[EXPLICIT_TASK_ID]]
+
+  // CHECK: [[THREAD_ID_1]]: task level 0: parallel_id=[[PARALLEL_ID:[0-9]+]]
+  // CHECK-SAME: task_id=[[EXPLICIT_TASK_ID]], exit_frame={{0x[0-f]+}}
+  // CHECK-SAME: reenter_frame=[[NULL]], task_type=ompt_task_explicit=4
+  // CHECK-SAME: thread_num={{[01]}}
+
+  // explicit task with undeferred
+  // CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}
+  // CHECK-SAME: parent_task_frame.exit={{0x[0-f]+}}
+  // CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}}
+  // CHECK-SAME: new_task_id=[[EXPLICIT_UNDEFERRED_TASK_ID:[0-9]+]]
+  // CHECK-SAME: codeptr_ra={{0x[0-f]+}}
+  // CHECK-SAME: task_type=ompt_task_explicit|ompt_task_undeferred=134217732
+  // CHECK-SAME: has_dependences=no
+
+  // CHECK: [[THREAD_ID_2:[0-9]+]]: ompt_event_task_schedule:
+  // CHECK-SAME: second_task_id=[[EXPLICIT_UNDEFERRED_TASK_ID]]
+
+  // CHECK: [[THREAD_ID_2]]: task level 0: parallel_id=[[PARALLEL_ID]]
+  // CHECK-SAME: task_id=[[EXPLICIT_UNDEFERRED_TASK_ID]]
+  // CHECK-SAME: exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // CHECK-SAME: task_type=ompt_task_explicit|ompt_task_undeferred=134217732
+  // CHECK-SAME: thread_num={{[01]}}
+
+  // explicit task with untied
+  // CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}
+  // CHECK-SAME: parent_task_frame.exit={{0x[0-f]+}}
+  // CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}}
+  // CHECK-SAME: new_task_id=[[EXPLICIT_UNTIED_TASK_ID:[0-9]+]]
+  // CHECK-SAME: codeptr_ra={{0x[0-f]+}}
+  // CHECK-SAME: task_type=ompt_task_explicit|ompt_task_untied=268435460
+  // CHECK-SAME: has_dependences=no
+
+  // Here the thread_id cannot be taken from a schedule event as there
+  // may be multiple of those
+  // CHECK: [[THREAD_ID_3:[0-9]+]]: explicit_untied
+  // CHECK: [[THREAD_ID_3]]: task level 0: parallel_id=[[PARALLEL_ID]]
+  // CHECK-SAME: task_id=[[EXPLICIT_UNTIED_TASK_ID]]
+  // CHECK-SAME: exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // CHECK-SAME: task_type=ompt_task_explicit|ompt_task_untied=268435460
+  // CHECK-SAME: thread_num={{[01]}}
+
+  // after taskyield
+  // CHECK: [[THREAD_ID_3_2:[0-9]+]]: explicit_untied(2)
+  // CHECK: [[THREAD_ID_3_2]]: task level 0: parallel_id=[[PARALLEL_ID]]
+  // CHECK-SAME: task_id=[[EXPLICIT_UNTIED_TASK_ID]]
+  // CHECK-SAME: exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // CHECK-SAME: task_type=ompt_task_explicit|ompt_task_untied=268435460
+  // CHECK-SAME: thread_num={{[01]}}
+
+  // after taskwait
+  // CHECK: [[THREAD_ID_3_3:[0-9]+]]: explicit_untied(3)
+  // CHECK: [[THREAD_ID_3_3]]: task level 0: parallel_id=[[PARALLEL_ID]]
+  // CHECK-SAME: task_id=[[EXPLICIT_UNTIED_TASK_ID]]
+  // CHECK-SAME: exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // CHECK-SAME: task_type=ompt_task_explicit|ompt_task_untied=268435460
+  // CHECK-SAME: thread_num={{[01]}}
+
+  // explicit task with final
+  // CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}
+  // CHECK-SAME: parent_task_frame.exit={{0x[0-f]+}}
+  // CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}}
+  // CHECK-SAME: new_task_id=[[EXPLICIT_FINAL_TASK_ID:[0-9]+]]
+  // CHECK-SAME: codeptr_ra={{0x[0-f]+}}
+  // CHECK-SAME: task_type=ompt_task_explicit|ompt_task_final=536870916
+  // CHECK-SAME: has_dependences=no
+
+  // CHECK: [[THREAD_ID_4:[0-9]+]]: ompt_event_task_schedule:
+  // CHECK-SAME: second_task_id=[[EXPLICIT_FINAL_TASK_ID]]
+
+  // CHECK: [[THREAD_ID_4]]: task level 0: parallel_id=[[PARALLEL_ID]]
+  // CHECK-SAME: task_id=[[EXPLICIT_FINAL_TASK_ID]]
+  // CHECK-SAME: exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // CHECK-SAME: task_type=ompt_task_explicit|ompt_task_final=536870916
+  // CHECK-SAME: thread_num={{[01]}}
+
+  // nested explicit task with final and undeferred
+  // CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}
+  // CHECK-SAME: parent_task_frame.exit={{0x[0-f]+}}
+  // CHECK-SAME: parent_task_frame.reenter={{0x[0-f]+}}
+  // CHECK-SAME: new_task_id=[[NESTED_FINAL_UNDEFERRED_TASK_ID:[0-9]+]]
+  // CHECK-SAME: codeptr_ra={{0x[0-f]+}}
+  // CHECK-SAME: task_type=ompt_task_explicit|ompt_task_undeferred
+  // CHECK-SAME:|ompt_task_final=671088644
+  // CHECK-SAME: has_dependences=no
+
+  // CHECK: [[THREAD_ID_5:[0-9]+]]: ompt_event_task_schedule:
+  // CHECK-SAME: second_task_id=[[NESTED_FINAL_UNDEFERRED_TASK_ID]]
+
+  // CHECK: [[THREAD_ID_5]]: task level 0: parallel_id=[[PARALLEL_ID]]
+  // CHECK-SAME: task_id=[[NESTED_FINAL_UNDEFERRED_TASK_ID]]
+  // CHECK-SAME: exit_frame={{0x[0-f]+}}, reenter_frame=[[NULL]]
+  // CHECK-SAME: task_type=ompt_task_explicit|ompt_task_undeferred
+  // CHECK-SAME:|ompt_task_final=671088644
+  // CHECK-SAME: thread_num={{[01]}}
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/tasks/task_types_serialized.c b/final/runtime/test/ompt/tasks/task_types_serialized.c
new file mode 100644
index 0000000..3fe163e
--- /dev/null
+++ b/final/runtime/test/ompt/tasks/task_types_serialized.c
@@ -0,0 +1,114 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// REQUIRES: ompt
+
+#include "callback.h"
+#include <omp.h>
+
+__attribute__ ((noinline)) // workaround for bug in icc
+void print_task_type(int id)
+{
+  #pragma omp critical
+  {
+    int task_type = 0; // defined value even if the query below fills nothing in
+    char buffer[2048];
+    ompt_get_task_info(0, &task_type, NULL, NULL, NULL, NULL); // only the type word is requested
+    format_task_type(task_type, buffer);
+    printf("%" PRIu64 ": id=%d task_type=%s=%d\n", ompt_get_thread_data()->value, id, buffer, task_type);
+  }
+}
+
+int main()
+{
+  //initial task
+  print_task_type(0);
+
+  int x = 0; // only ever incremented below; start from a defined value
+  //implicit task
+  #pragma omp parallel num_threads(1)
+  {
+    print_task_type(1);
+    x++;
+  }
+
+  #pragma omp parallel num_threads(1)
+  #pragma omp master
+  {
+    //explicit task
+    #pragma omp task
+    {
+      print_task_type(2);
+      x++;
+    }
+
+    //explicit task with undeferred
+    #pragma omp task if(0)
+    {
+      print_task_type(3);
+      x++;
+    }
+
+    //explicit task with untied
+    #pragma omp task untied
+    {
+      print_task_type(4);
+      x++;
+    }
+
+    //explicit task with final
+    #pragma omp task final(1)
+    {
+      print_task_type(5);
+      x++;
+      //nested explicit task with final and undeferred
+      #pragma omp task
+      {
+        print_task_type(6);
+        x++;
+      }
+    }
+
+/*
+    //TODO:not working
+    //explicit task with mergeable
+    #pragma omp task mergeable
+    {
+      print_task_type(7);
+      x++;
+    }
+*/
+
+    //TODO: merged task
+  }
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+
+
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_initial_task_begin: parallel_id={{[0-9]+}}, task_id={{[0-9]+}}, actual_parallelism=1, index=1, flags=1
+  // CHECK: {{^}}[[MASTER_ID]]: id=0 task_type=ompt_task_initial=1
+  // CHECK: {{^}}[[MASTER_ID]]: id=1 task_type=ompt_task_implicit|ompt_task_undeferred=134217730
+
+  // CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit|ompt_task_undeferred=134217732, has_dependences=no
+  // CHECK: {{^[0-9]+}}: id=2 task_type=ompt_task_explicit|ompt_task_undeferred=134217732
+
+  // CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit|ompt_task_undeferred=134217732, has_dependences=no
+  // CHECK: {{^[0-9]+}}: id=3 task_type=ompt_task_explicit|ompt_task_undeferred=134217732
+
+  // CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit|ompt_task_undeferred|ompt_task_untied=402653188, has_dependences=no
+  // CHECK: {{^[0-9]+}}: id=4 task_type=ompt_task_explicit|ompt_task_undeferred|ompt_task_untied=402653188
+
+  // CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit|ompt_task_undeferred|ompt_task_final=671088644, has_dependences=no
+  // CHECK: {{^[0-9]+}}: id=5 task_type=ompt_task_explicit|ompt_task_undeferred|ompt_task_final=671088644
+
+  // CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit|ompt_task_undeferred|ompt_task_final=671088644, has_dependences=no
+  // CHECK: {{^[0-9]+}}: id=6 task_type=ompt_task_explicit|ompt_task_undeferred|ompt_task_final=671088644
+
+  // ___CHECK: {{^[0-9]+}}: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id={{[0-9]+}}, codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit|ompt_task_undeferred=134217732, has_dependences=no
+  // ___CHECK: {{^[0-9]+}}: id=7 task_type=ompt_task_explicit|ompt_task_undeferred=134217732
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/tasks/taskloop.c b/final/runtime/test/ompt/tasks/taskloop.c
new file mode 100644
index 0000000..af7f778
--- /dev/null
+++ b/final/runtime/test/ompt/tasks/taskloop.c
@@ -0,0 +1,80 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// RUN: %libomp-compile-and-run | FileCheck --check-prefix=TASKS %s
+// REQUIRES: ompt
+
+// These compilers don't support the taskloop construct
+// UNSUPPORTED: gcc-4, gcc-5, icc-16
+// GCC 6 has support for taskloops, but at least 6.3.0 is crashing on this test
+// UNSUPPORTED: gcc-6
+
+#include "callback.h"
+#include <omp.h>
+
+int main() {
+  unsigned int i, x = 0; // x is incremented by the taskloop body; start at 0
+
+#pragma omp parallel num_threads(2)
+  {
+#pragma omp barrier
+
+#pragma omp master
+#pragma omp taskloop
+    for (i = 0; i < 5; i += 3) { // two iterations: i = 0 and i = 3
+      x++;
+    }
+  }
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin:
+  // CHECK-SAME: parent_task_id={{[0-9]+}}
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]]
+  // CHECK-SAME: requested_team_size=2
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]]
+  // CHECK-SAME: task_id=[[IMPLICIT_TASK_ID1:[0-9]+]]
+  // CHECK-SAME: team_size=2, thread_num=0
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_taskgroup_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID1]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_taskloop_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]]
+  // CHECK-SAME: parent_task_id=[[IMPLICIT_TASK_ID1]]
+  // CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS:0x[0-f]+]], count=2
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create:
+  // CHECK-SAME: parent_task_id=[[IMPLICIT_TASK_ID1]]
+  // CHECK-SAME: new_task_id=[[TASK_ID1:[0-9]+]]
+  // CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK-SAME: task_type=ompt_task_explicit=4
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create:
+  // CHECK-SAME: parent_task_id=[[IMPLICIT_TASK_ID1]]
+  // CHECK-SAME: new_task_id=[[TASK_ID2:[0-9]+]]
+  // CHECK-SAME: codeptr_ra=[[RETURN_ADDRESS]]
+  // CHECK-SAME: task_type=ompt_task_explicit=4
+  // CHECK-NOT: {{^}}[[MASTER_ID]]: ompt_event_task_create:
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_taskloop_end:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]]
+  // CHECK-SAME: parent_task_id=[[IMPLICIT_TASK_ID1]]
+  // CHECK-SAME: count=2
+  // CHECK-DAG: {{^}}[[MASTER_ID]]: ompt_event_wait_taskgroup_begin:
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_wait_taskgroup_end:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID1]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_taskgroup_end:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID1]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id=0
+  // CHECK-SAME: task_id=[[IMPLICIT_TASK_ID1]], team_size=2, thread_num=0
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID]]
+
+  // TASKS: ompt_event_initial_task_begin:{{.*}} task_id={{[0-9]+}}
+  // TASKS: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_taskloop_begin:
+  // TASKS: ompt_event_task_create:{{.*}} new_task_id=[[TASK_ID1:[0-9]+]]
+  // TASKS-SAME: task_type=ompt_task_explicit
+  // TASKS-DAG: ompt_event_task_create:{{.*}} new_task_id=[[TASK_ID2:[0-9]+]]
+  // Schedule events:
+  // TASKS-DAG: {{^.*}}first_task_id={{[0-9]+}}, second_task_id=[[TASK_ID1]]
+  // TASKS-DAG: {{^.*}}first_task_id=[[TASK_ID1]], second_task_id={{[0-9]+}}
+  // TASKS-DAG: {{^.*}}first_task_id={{[0-9]+}}, second_task_id=[[TASK_ID2]]
+  // TASKS-DAG: {{^.*}}first_task_id=[[TASK_ID2]], second_task_id={{[0-9]+}}
+  // TASKS-NOT: ompt_event_task_schedule
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/tasks/taskyield.c b/final/runtime/test/ompt/tasks/taskyield.c
new file mode 100644
index 0000000..2dd0fa1
--- /dev/null
+++ b/final/runtime/test/ompt/tasks/taskyield.c
@@ -0,0 +1,62 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// Current GOMP interface implements taskyield as stub
+// XFAIL: gcc
+
+#include "callback.h"
+#include <omp.h>   
+#include <unistd.h>
+
+int main()
+{
+  int condition=0, x=0;
+  #pragma omp parallel num_threads(2)
+  {
+    #pragma omp master
+    {
+        #pragma omp task shared(condition)
+        {
+          OMPT_SIGNAL(condition);
+          OMPT_WAIT(condition,2);
+        }
+        OMPT_WAIT(condition,1); // presumably waits for the first task's signal; see callback.h
+        #pragma omp task shared(x)
+        {
+          x++;
+        }
+        printf("%" PRIu64 ": before yield\n", ompt_get_thread_data()->value);
+        #pragma omp taskyield
+        printf("%" PRIu64 ": after yield\n", ompt_get_thread_data()->value);
+        OMPT_SIGNAL(condition);
+    }
+  }
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+
+
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: new_task_data initially not null
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID:[0-9]+]], team_size={{[0-9]+}}, thread_num={{[0-9]+}}
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id=[[WORKER_TASK:[0-9]+]], codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit=4, has_dependences=no
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id={{[0-9]+}}, parent_task_frame.exit={{0x[0-f]+}}, parent_task_frame.reenter={{0x[0-f]+}}, new_task_id=[[MAIN_TASK:[0-9]+]], codeptr_ra={{0x[0-f]+}}, task_type=ompt_task_explicit=4, has_dependences=no
+  // The master is expected to switch to the second task with status ompt_task_yield.
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[IMPLICIT_TASK_ID]], second_task_id=[[MAIN_TASK]], prior_task_status=ompt_task_yield=2
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_schedule: first_task_id=[[MAIN_TASK]], second_task_id=[[IMPLICIT_TASK_ID]], prior_task_status=ompt_task_complete=1
+  // The first created task is matched on a separately captured thread id.
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_task_schedule: first_task_id={{[0-9]+}}, second_task_id=[[WORKER_TASK]], prior_task_status=ompt_task_switch=7
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[WORKER_TASK]], second_task_id={{[0-9]+}}, prior_task_status=ompt_task_complete=1
+
+
+
+
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/tasks/untied_task.c b/final/runtime/test/ompt/tasks/untied_task.c
new file mode 100644
index 0000000..4ee3f11
--- /dev/null
+++ b/final/runtime/test/ompt/tasks/untied_task.c
@@ -0,0 +1,108 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+#define TEST_NEED_PRINT_FRAME_FROM_OUTLINED_FN
+#include "callback.h"
+#include <omp.h> 
+
+int main()
+{
+  int condition=0;
+  omp_set_nested(0);
+  print_frame(0);
+  #pragma omp parallel num_threads(2)
+  {
+    print_frame_from_outlined_fn(1);
+    print_ids(0);
+    print_ids(1);
+    print_frame(0);
+    #pragma omp master
+    {
+      print_ids(0);
+      #pragma omp task untied shared(condition)
+      {
+        OMPT_SIGNAL(condition);
+        print_frame(1);
+        print_ids(0);
+        print_ids(1);
+        print_ids(2);
+        #pragma omp task if(0)
+        {
+          print_ids(0);
+          print_ids(1);
+          print_ids(2);
+        }
+        print_ids(0);
+        print_ids(1);
+        print_ids(2);
+      }
+      OMPT_WAIT(condition,1);
+      print_ids(0);
+    }
+    #pragma omp barrier
+    print_ids(0);
+  }
+
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_create'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_task_schedule'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquire'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_acquired'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_mutex_released'
+
+
+  // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]]
+
+  // make sure initial data pointers are null
+  // CHECK-NOT: 0: new_task_data initially not null
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: __builtin_frame_address(0)=[[MAIN_REENTER:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter=0x{{[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=2, codeptr_ra=0x{{[0-f]+}}, invoker=[[PARALLEL_INVOKER:[0-9]+]]
+  // master thread of the parallel region
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address({{.}})=[[EXIT:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID:[0-9]+]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=0x{{[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // creation of the untied task is reported on the master thread
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_task_create: parent_task_id=[[IMPLICIT_TASK_ID]], parent_task_frame.exit=[[EXIT]], parent_task_frame.reenter=0x{{[0-f]+}}, new_task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[TASK_FUNCTION:0x[0-f]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // explicit barrier after master
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=0x{{[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // implicit barrier parallel
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address({{.}})=[[EXIT:0x[0-f]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=0x{{[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(0)=[[REENTER:0x[0-f]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=0x{{[0-f]+}}
+  // the untied task is expected to be scheduled on this second thread, inside its explicit barrier:
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[IMPLICIT_TASK_ID]], second_task_id=[[TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: __builtin_frame_address(1)=[[TASK_EXIT:0x[0-f]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], exit_frame=[[TASK_EXIT]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 1: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[EXIT]], reenter_frame=0x{{[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: task level 2: parallel_id=[[IMPLICIT_PARALLEL_ID]], task_id=[[PARENT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=0x{{[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_schedule: first_task_id=[[TASK_ID]], second_task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_task_end: task_id=[[TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: task level 0: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]], exit_frame=[[NULL]], reenter_frame=[[NULL]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_barrier_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/worksharing/for/auto.c b/final/runtime/test/ompt/worksharing/for/auto.c
new file mode 100644
index 0000000..17d26f5
--- /dev/null
+++ b/final/runtime/test/ompt/worksharing/for/auto.c
@@ -0,0 +1,7 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base.h
+// REQUIRES: ompt
+// GCC doesn't call runtime for auto = static schedule
+// XFAIL: gcc
+
+#define SCHEDULE auto
+#include "base.h"
diff --git a/final/runtime/test/ompt/worksharing/for/auto_serialized.c b/final/runtime/test/ompt/worksharing/for/auto_serialized.c
new file mode 100644
index 0000000..f756166
--- /dev/null
+++ b/final/runtime/test/ompt/worksharing/for/auto_serialized.c
@@ -0,0 +1,7 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base_serialized.h
+// REQUIRES: ompt
+// GCC doesn't call runtime for auto = static schedule
+// XFAIL: gcc
+
+#define SCHEDULE auto
+#include "base_serialized.h"
diff --git a/final/runtime/test/ompt/worksharing/for/auto_split.c b/final/runtime/test/ompt/worksharing/for/auto_split.c
new file mode 100644
index 0000000..d82e3fd
--- /dev/null
+++ b/final/runtime/test/ompt/worksharing/for/auto_split.c
@@ -0,0 +1,8 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base_split.h
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=CHECK-LOOP %S/base_split.h
+// REQUIRES: ompt
+// GCC doesn't call runtime for auto = static schedule
+// XFAIL: gcc
+
+#define SCHEDULE auto
+#include "base_split.h"
diff --git a/final/runtime/test/ompt/worksharing/for/base.h b/final/runtime/test/ompt/worksharing/for/base.h
new file mode 100644
index 0000000..8a496d9
--- /dev/null
+++ b/final/runtime/test/ompt/worksharing/for/base.h
@@ -0,0 +1,43 @@
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  unsigned int i;
+  // SCHEDULE is defined by the including .c file before this header is pulled in.
+  #pragma omp parallel for num_threads(4) schedule(SCHEDULE)
+  for (i = 0; i < 4; i++) { // empty body: the expectations below only look at ompt_event_* output
+  }
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_work'
+
+  // Expected event sequence, thread by thread (the RUN lines pipe the output through %sort-threads).
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=0x{{[0-f]+}}, invoker={{[0-9]+}}
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id=[[IMPLICIT_TASK_ID]], codeptr_ra=
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+  // Three further threads repeat the same pattern; THREAD_ID and IMPLICIT_TASK_ID are rebound each time.
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id=[[IMPLICIT_TASK_ID]], codeptr_ra=
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id=[[IMPLICIT_TASK_ID]], codeptr_ra=
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id=[[IMPLICIT_TASK_ID]], codeptr_ra=
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/worksharing/for/base_serialized.h b/final/runtime/test/ompt/worksharing/for/base_serialized.h
new file mode 100644
index 0000000..3376b37
--- /dev/null
+++ b/final/runtime/test/ompt/worksharing/for/base_serialized.h
@@ -0,0 +1,28 @@
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  unsigned int i;
+  // One-thread team running a one-iteration loop; SCHEDULE comes from the including .c file.
+  #pragma omp parallel for num_threads(1) schedule(SCHEDULE)
+  for (i = 0; i < 1; i++) {
+  }
+
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_work'
+
+  // Expected event sequence for the single (master) thread.
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id=[[PARENT_TASK_ID:[0-9]+]], parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=1, codeptr_ra=0x{{[0-f]+}}, invoker={{[0-9]+}}
+  // On implicit_task_end the parallel_id is matched as any number, the same way base.h does it.
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID:[0-9]+]], task_id=[[IMPLICIT_TASK_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id=[[IMPLICIT_TASK_ID]], codeptr_ra=0x{{[0-f]+}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id=[[IMPLICIT_TASK_ID]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_end: parallel_id={{[0-9]+}}, task_id=[[IMPLICIT_TASK_ID]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/worksharing/for/base_split.h b/final/runtime/test/ompt/worksharing/for/base_split.h
new file mode 100644
index 0000000..0f1fed3
--- /dev/null
+++ b/final/runtime/test/ompt/worksharing/for/base_split.h
@@ -0,0 +1,66 @@
+#include "callback.h"
+#include <omp.h>
+
+/* With the combined parallel-for construct (base.h), the return addresses are hard to compare.
+   With separate parallel and for-nowait constructs, the addresses become more predictable,
+   but the beginning of the for-loop still generates additional code, so the offset from
+   loop-begin to the label is more than 4 bytes.
+*/
+
+int main()
+{
+  unsigned int i;
+  // SCHEDULE is supplied by the including .c file.
+  #pragma omp parallel num_threads(4) 
+  {
+    print_current_address(0);
+    #pragma omp for schedule(SCHEDULE) nowait
+    for (i = 0; i < 4; i++) {
+      print_fuzzy_address(1);
+    }
+    print_fuzzy_address(2);
+  }
+  print_fuzzy_address(3);
+  // NOTE(review): the print_* helpers come from callback.h (not visible here); presumably print_fuzzy_address emits the fuzzy_address= lines matched below - confirm.
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_implicit_task'
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_work'
+
+  // Default-prefix expectations: the master thread first, then three further threads.
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra=[[PARALLEL_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}, invoker={{[0-9]+}}
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id={{[0-9]+}}, codeptr_ra=[[LOOP_BEGIN_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}, codeptr_ra=[[LOOP_END_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[LOOP_END_RETURN_ADDRESS]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}, invoker={{[0-9]+}}, codeptr_ra=[[PARALLEL_RETURN_ADDRESS]]
+  // CHECK: {{^}}[[MASTER_ID]]: fuzzy_address={{.*}}[[PARALLEL_RETURN_ADDRESS]]
+  
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}, codeptr_ra=[[LOOP_END_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[LOOP_END_RETURN_ADDRESS]]
+
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}, codeptr_ra=[[LOOP_END_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[LOOP_END_RETURN_ADDRESS]]
+
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id={{[0-9]+}}, codeptr_ra=0x{{[0-f]+}}
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_loop_end: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}, codeptr_ra=[[LOOP_END_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK: {{^}}[[THREAD_ID]]: fuzzy_address={{.*}}[[LOOP_END_RETURN_ADDRESS]]
+
+  // Second pass (the alternate prefix named on the RUN lines): compares the loop-begin return address.
+  // CHECK-LOOP: 0: NULL_POINTER=[[NULL:.*$]]
+  // CHECK-LOOP: 0: ompt_event_runtime_shutdown
+  // CHECK-LOOP: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: parent_task_id={{[0-9]+}}, parent_task_frame.exit=[[NULL]], parent_task_frame.reenter={{0x[0-f]+}}, parallel_id=[[PARALLEL_ID:[0-9]+]], requested_team_size=4, codeptr_ra={{0x[0-f]+}}, invoker={{[0-9]+}}
+  // CHECK-LOOP: {{^}}[[MASTER_ID]]: ompt_event_loop_begin: parallel_id=[[PARALLEL_ID]], parent_task_id={{[0-9]+}}, codeptr_ra=[[LOOP_BEGIN_RETURN_ADDRESS:0x[0-f]+]]{{[0-f][0-f]}}
+  // CHECK-LOOP: {{^}}{{[0-9]+}}: fuzzy_address={{.*}}[[LOOP_BEGIN_RETURN_ADDRESS]]
+  // CHECK-LOOP: {{^}}{{[0-9]+}}: fuzzy_address={{.*}}[[LOOP_BEGIN_RETURN_ADDRESS]]
+  // CHECK-LOOP: {{^}}{{[0-9]+}}: fuzzy_address={{.*}}[[LOOP_BEGIN_RETURN_ADDRESS]]
+  // CHECK-LOOP: {{^}}{{[0-9]+}}: fuzzy_address={{.*}}[[LOOP_BEGIN_RETURN_ADDRESS]]
+
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/worksharing/for/dynamic.c b/final/runtime/test/ompt/worksharing/for/dynamic.c
new file mode 100644
index 0000000..ca5ae10
--- /dev/null
+++ b/final/runtime/test/ompt/worksharing/for/dynamic.c
@@ -0,0 +1,5 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base.h
+// REQUIRES: ompt
+
+#define SCHEDULE dynamic
+#include "base.h"
diff --git a/final/runtime/test/ompt/worksharing/for/dynamic_serialized.c b/final/runtime/test/ompt/worksharing/for/dynamic_serialized.c
new file mode 100644
index 0000000..0f80929
--- /dev/null
+++ b/final/runtime/test/ompt/worksharing/for/dynamic_serialized.c
@@ -0,0 +1,5 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base_serialized.h
+// REQUIRES: ompt
+
+#define SCHEDULE dynamic
+#include "base_serialized.h"
diff --git a/final/runtime/test/ompt/worksharing/for/dynamic_split.c b/final/runtime/test/ompt/worksharing/for/dynamic_split.c
new file mode 100644
index 0000000..cf14971
--- /dev/null
+++ b/final/runtime/test/ompt/worksharing/for/dynamic_split.c
@@ -0,0 +1,7 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base_split.h
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=CHECK-LOOP %S/base_split.h
+// REQUIRES: ompt
+// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7
+
+#define SCHEDULE dynamic
+#include "base_split.h"
diff --git a/final/runtime/test/ompt/worksharing/for/guided.c b/final/runtime/test/ompt/worksharing/for/guided.c
new file mode 100644
index 0000000..01bff4e
--- /dev/null
+++ b/final/runtime/test/ompt/worksharing/for/guided.c
@@ -0,0 +1,5 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base.h
+// REQUIRES: ompt
+
+#define SCHEDULE guided
+#include "base.h"
diff --git a/final/runtime/test/ompt/worksharing/for/guided_serialized.c b/final/runtime/test/ompt/worksharing/for/guided_serialized.c
new file mode 100644
index 0000000..4b5096d
--- /dev/null
+++ b/final/runtime/test/ompt/worksharing/for/guided_serialized.c
@@ -0,0 +1,5 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base_serialized.h
+// REQUIRES: ompt
+
+#define SCHEDULE guided
+#include "base_serialized.h"
diff --git a/final/runtime/test/ompt/worksharing/for/guided_split.c b/final/runtime/test/ompt/worksharing/for/guided_split.c
new file mode 100644
index 0000000..7d560c2
--- /dev/null
+++ b/final/runtime/test/ompt/worksharing/for/guided_split.c
@@ -0,0 +1,7 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base_split.h
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=CHECK-LOOP %S/base_split.h
+// REQUIRES: ompt
+// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7
+
+#define SCHEDULE guided
+#include "base_split.h"
diff --git a/final/runtime/test/ompt/worksharing/for/runtime.c b/final/runtime/test/ompt/worksharing/for/runtime.c
new file mode 100644
index 0000000..bcf160f
--- /dev/null
+++ b/final/runtime/test/ompt/worksharing/for/runtime.c
@@ -0,0 +1,5 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base.h
+// REQUIRES: ompt
+
+#define SCHEDULE runtime
+#include "base.h"
diff --git a/final/runtime/test/ompt/worksharing/for/runtime_serialized.c b/final/runtime/test/ompt/worksharing/for/runtime_serialized.c
new file mode 100644
index 0000000..231d67d
--- /dev/null
+++ b/final/runtime/test/ompt/worksharing/for/runtime_serialized.c
@@ -0,0 +1,5 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base_serialized.h
+// REQUIRES: ompt
+
+#define SCHEDULE runtime
+#include "base_serialized.h"
diff --git a/final/runtime/test/ompt/worksharing/for/runtime_split.c b/final/runtime/test/ompt/worksharing/for/runtime_split.c
new file mode 100644
index 0000000..7a677ed
--- /dev/null
+++ b/final/runtime/test/ompt/worksharing/for/runtime_split.c
@@ -0,0 +1,7 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base_split.h
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=CHECK-LOOP %S/base_split.h
+// REQUIRES: ompt
+// UNSUPPORTED: gcc-4, gcc-5, gcc-6, gcc-7
+
+#define SCHEDULE runtime
+#include "base_split.h"
diff --git a/final/runtime/test/ompt/worksharing/for/static.c b/final/runtime/test/ompt/worksharing/for/static.c
new file mode 100644
index 0000000..4d99059
--- /dev/null
+++ b/final/runtime/test/ompt/worksharing/for/static.c
@@ -0,0 +1,7 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base.h
+// REQUIRES: ompt
+// GCC doesn't call runtime for static schedule
+// XFAIL: gcc
+
+#define SCHEDULE static
+#include "base.h"
diff --git a/final/runtime/test/ompt/worksharing/for/static_serialized.c b/final/runtime/test/ompt/worksharing/for/static_serialized.c
new file mode 100644
index 0000000..4860d49
--- /dev/null
+++ b/final/runtime/test/ompt/worksharing/for/static_serialized.c
@@ -0,0 +1,7 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base_serialized.h
+// REQUIRES: ompt
+// GCC doesn't call runtime for static schedule
+// XFAIL: gcc
+
+#define SCHEDULE static
+#include "base_serialized.h"
diff --git a/final/runtime/test/ompt/worksharing/for/static_split.c b/final/runtime/test/ompt/worksharing/for/static_split.c
new file mode 100644
index 0000000..d8c88dd
--- /dev/null
+++ b/final/runtime/test/ompt/worksharing/for/static_split.c
@@ -0,0 +1,8 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %S/base_split.h
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck --check-prefix=CHECK-LOOP %S/base_split.h
+// REQUIRES: ompt
+// GCC doesn't call runtime for static schedule
+// XFAIL: gcc
+
+#define SCHEDULE static
+#include "base_split.h"
diff --git a/final/runtime/test/ompt/worksharing/sections.c b/final/runtime/test/ompt/worksharing/sections.c
new file mode 100644
index 0000000..bafb743
--- /dev/null
+++ b/final/runtime/test/ompt/worksharing/sections.c
@@ -0,0 +1,36 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// Some compilers generate code that does not distinguish between sections and loops
+// XFAIL: gcc, clang-3, clang-4, clang-5, icc-16, icc-17
+// UNSUPPORTED: icc-18
+
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  #pragma omp parallel sections num_threads(2)
+  {
+    #pragma omp section
+    {
+      printf("%" PRIu64 ": section 1\n", ompt_get_thread_data()->value);
+    }
+    #pragma omp section
+    {
+      printf("%" PRIu64 ": section 2\n", ompt_get_thread_data()->value);
+    }
+  }
+  // The thread-data value is printed with PRIu64, as in single.c; %lu is only right where unsigned long is 64 bits wide.
+  // Check if libomp supports the callbacks for this test.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_work'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_sections_begin: parallel_id=[[PARALLEL_ID:[0-9]+]], parent_task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[SECT_BEGIN:0x[0-f]+]], count=2
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_sections_end: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}, codeptr_ra=[[SECT_END:0x[0-f]+]]
+
+  // CHECK: {{^}}[[THREAD_ID:[0-9]+]]: ompt_event_sections_begin: parallel_id=[[PARALLEL_ID]], parent_task_id=[[TASK_ID:[0-9]+]], codeptr_ra=[[SECT_BEGIN]], count=2
+  // CHECK: {{^}}[[THREAD_ID]]: ompt_event_sections_end: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}}, codeptr_ra=[[SECT_END]]
+
+  return 0;
+}
diff --git a/final/runtime/test/ompt/worksharing/single.c b/final/runtime/test/ompt/worksharing/single.c
new file mode 100644
index 0000000..6b24f2d
--- /dev/null
+++ b/final/runtime/test/ompt/worksharing/single.c
@@ -0,0 +1,36 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+// GCC generates code that does not call the runtime for the single construct
+// XFAIL: gcc
+
+#include "callback.h"
+#include <omp.h>
+
+int main()
+{
+  int single_runs = 0; /* times the body of the single block executed */
+  #pragma omp parallel num_threads(2)
+  {
+    #pragma omp single
+    {
+      printf("%" PRIu64 ": in single\n", ompt_get_thread_data()->value);
+      single_runs += 1;
+    }
+  }
+
+  printf("x=%d\n", single_runs);
+
+  // The run must not report a failure to register the work callback.
+  // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_work'
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+  // Events of the thread that executes the single block.
+  // CHECK-DAG: {{^}}[[THREAD_ID_1:[0-9]+]]: ompt_event_single_in_block_begin: parallel_id=[[PARALLEL_ID:[0-9]+]], parent_task_id=[[TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]+}}, count=1
+  // CHECK-DAG: {{^}}[[THREAD_ID_1]]: in single
+  // CHECK-DAG: {{^}}[[THREAD_ID_1]]: ompt_event_single_in_block_end: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra={{0x[0-f]+}}, count=1
+  // Events of the other thread, which does not execute the block.
+  // CHECK-DAG: {{^}}[[THREAD_ID_2:[0-9]+]]: ompt_event_single_others_begin: parallel_id=[[PARALLEL_ID:[0-9]+]], task_id=[[TASK_ID:[0-9]+]], codeptr_ra={{0x[0-f]+}}, count=1
+  // CHECK-DAG: {{^}}[[THREAD_ID_2]]: ompt_event_single_others_end: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra={{0x[0-f]+}}, count=1
+
+  return 0;
+}
diff --git a/final/runtime/test/parallel/omp_nested.c b/final/runtime/test/parallel/omp_nested.c
new file mode 100644
index 0000000..b5a3fbd
--- /dev/null
+++ b/final/runtime/test/parallel/omp_nested.c
@@ -0,0 +1,49 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+/*
+ * Test if the compiler supports nested parallelism
+ * By Chunhua Liao, University of Houston
+ * Oct. 2005
+ */
+int test_omp_nested()
+{
+  int counter = 0;
+#ifdef _OPENMP
+  /* Keep the requested team size within [2, 4]. */
+  if (omp_get_max_threads() > 4)
+    omp_set_num_threads(4);
+  if (omp_get_max_threads() < 2)
+    omp_set_num_threads(2);
+  /* Switch nested parallelism on, up to the number of levels supported. */
+  omp_set_nested(1);
+  omp_set_max_active_levels(omp_get_supported_active_levels());
+#endif
+  /* Outer threads add one each; every thread of each inner team takes one
+   * away.  A nonzero result means the two head counts differed. */
+  #pragma omp parallel shared(counter)
+  {
+    #pragma omp critical
+    ++counter;
+    #pragma omp parallel
+    {
+      #pragma omp critical
+      --counter;
+    }
+  }
+  return counter != 0;
+}
+
+int main()
+{
+  int failures = 0;
+  int rep;
+
+  /* Repeat the test; the exit status is the number of failed runs. */
+  for (rep = 0; rep < REPETITIONS; ++rep) {
+    if (test_omp_nested() == 0)
+      ++failures;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/parallel/omp_parallel_copyin.c b/final/runtime/test/parallel/omp_parallel_copyin.c
new file mode 100644
index 0000000..600f9b7
--- /dev/null
+++ b/final/runtime/test/parallel/omp_parallel_copyin.c
@@ -0,0 +1,47 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+
+static int sum1 = 789;
+#pragma omp threadprivate(sum1)
+
+int test_omp_parallel_copyin()
+{
+  int total = 0;        /* combined per-thread results */
+  int team_size = 0;    /* threads that reached the critical section */
+  int expected;
+
+  /* Master value that copyin must hand to every thread's copy of sum1. */
+  sum1 = 7;
+
+  #pragma omp parallel copyin(sum1)
+  {
+    int k;
+    /* Each thread adds its share of 1..999 on top of the copied-in 7. */
+    #pragma omp for
+    for (k = 1; k < 1000; ++k)
+      sum1 += k;
+    #pragma omp critical
+    {
+      total += sum1;
+      ++team_size;
+    }
+  }
+  /* 1 + 2 + ... + 999, plus one initial 7 per participating thread. */
+  expected = (999 * 1000) / 2 + 7 * team_size;
+  return expected == total;
+}
+
+int main()
+{
+  int failures = 0;
+  int rep;
+
+  /* Repeat the test; the exit status is the number of failed runs. */
+  for (rep = 0; rep < REPETITIONS; ++rep) {
+    if (test_omp_parallel_copyin() == 0)
+      ++failures;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/parallel/omp_parallel_default.c b/final/runtime/test/parallel/omp_parallel_default.c
new file mode 100644
index 0000000..0a8e09e
--- /dev/null
+++ b/final/runtime/test/parallel/omp_parallel_default.c
@@ -0,0 +1,43 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int test_omp_parallel_default()
+{
+  /* Closed form of 1 + 2 + ... + LOOPCOUNT. */
+  int expected = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+  int total = 0; /* shared through default(shared) */
+  int partial;   /* per-thread subtotal */
+  int k;
+
+  #pragma omp parallel default(shared) private(k) private(partial)
+  {
+    partial = 0;
+    #pragma omp for
+    for (k = 1; k <= LOOPCOUNT; ++k)
+      partial += k;
+    /* Fold this thread's subtotal into the shared total, one thread at a time. */
+    #pragma omp critical
+    {
+      total += partial;
+    }
+  }
+  if (expected != total) {
+    fprintf(stderr, "KNOWN_SUM = %d; SUM = %d\n", expected, total);
+    return 0;
+  }
+  return 1;
+}
+
+int main()
+{
+  int failures = 0;
+  int rep;
+
+  /* Repeat the test; the exit status is the number of failed runs. */
+  for (rep = 0; rep < REPETITIONS; ++rep) {
+    if (test_omp_parallel_default() == 0)
+      ++failures;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/parallel/omp_parallel_firstprivate.c b/final/runtime/test/parallel/omp_parallel_firstprivate.c
new file mode 100644
index 0000000..dbee76c
--- /dev/null
+++ b/final/runtime/test/parallel/omp_parallel_firstprivate.c
@@ -0,0 +1,46 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+
+//static int sum1 = 789;
+
+int test_omp_parallel_firstprivate()
+{
+  int total = 0;     /* combined per-thread results */
+  int team_size = 0; /* threads that reached the critical section */
+  int seed = 7;      /* value each thread's private copy must start from */
+  int expected;
+
+  #pragma omp parallel firstprivate(seed)
+  {
+    int k;
+    /* Each thread adds its share of 1..999 on top of its own copy of 7. */
+    #pragma omp for
+    for (k = 1; k < 1000; ++k)
+      seed += k;
+    #pragma omp critical
+    {
+      total += seed;
+      ++team_size;
+    }
+  }
+
+  /* 1 + 2 + ... + 999, plus one initial 7 per participating thread. */
+  expected = (999 * 1000) / 2 + 7 * team_size;
+
+  return expected == total;
+}
+
+int main()
+{
+  int failures = 0;
+  int rep;
+
+  /* Repeat the test; the exit status is the number of failed runs. */
+  for (rep = 0; rep < REPETITIONS; ++rep) {
+    if (test_omp_parallel_firstprivate() == 0)
+      ++failures;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/parallel/omp_parallel_if.c b/final/runtime/test/parallel/omp_parallel_if.c
new file mode 100644
index 0000000..abbf3cd
--- /dev/null
+++ b/final/runtime/test/parallel/omp_parallel_if.c
@@ -0,0 +1,40 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int test_omp_parallel_if()
+{
+  int control = 1; /* makes the if() clause below evaluate to false */
+  int expected = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+  int total = 0;
+  int partial;
+  int k;
+
+  /* No worksharing inside: every thread that runs the region computes the
+   * whole 1..LOOPCOUNT sum, so the totals agree only for a single thread. */
+  #pragma omp parallel private(k) if(control==0)
+  {
+    partial = 0;
+    for (k = 1; k <= LOOPCOUNT; ++k)
+      partial += k;
+    #pragma omp critical
+    {
+      total += partial;
+    }
+  }
+
+  return expected == total;
+}
+
+int main()
+{
+  int failures = 0;
+  int rep;
+
+  /* Repeat the test; the exit status is the number of failed runs. */
+  for (rep = 0; rep < REPETITIONS; ++rep) {
+    if (test_omp_parallel_if() == 0)
+      ++failures;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/parallel/omp_parallel_num_threads.c b/final/runtime/test/parallel/omp_parallel_num_threads.c
new file mode 100644
index 0000000..8af1f9d
--- /dev/null
+++ b/final/runtime/test/parallel/omp_parallel_num_threads.c
@@ -0,0 +1,46 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int test_omp_parallel_num_threads()
+{
+  int mismatches = 0; /* failed expectations, summed across threads */
+  int team_limit = 0; /* default team size reported by the runtime */
+  int requested;      /* team size asked for in the current round */
+  int arrived;        /* threads that actually entered the region */
+
+  /* Probe the default team size; only the master thread records it. */
+  #pragma omp parallel
+  {
+    #pragma omp master
+    team_limit = omp_get_num_threads();
+  }
+
+  /* Ask for every team size from 1 up to that limit. */
+  for (requested = 1; requested <= team_limit; ++requested) {
+    arrived = 0;
+    #pragma omp parallel reduction(+:mismatches) num_threads(requested)
+    {
+      /* Each thread compares the reported team size with the request ... */
+      mismatches += (requested != omp_get_num_threads());
+      #pragma omp atomic
+      arrived += 1;
+    }
+    /* ... and the head count must agree with the request as well. */
+    mismatches += (arrived != requested);
+  }
+  return mismatches == 0;
+}
+
+int main()
+{
+  int failures = 0;
+  int rep;
+
+  /* Repeat the test; the exit status is the number of failed runs. */
+  for (rep = 0; rep < REPETITIONS; ++rep) {
+    if (test_omp_parallel_num_threads() == 0)
+      ++failures;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/parallel/omp_parallel_private.c b/final/runtime/test/parallel/omp_parallel_private.c
new file mode 100644
index 0000000..238e806
--- /dev/null
+++ b/final/runtime/test/parallel/omp_parallel_private.c
@@ -0,0 +1,46 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+
+//static int sum1 = 789;
+
+int test_omp_parallel_private()
+{
+  int total = 0;     /* combined per-thread results */
+  int team_size = 0; /* threads that reached the critical section */
+  int local;         /* listed as private: every thread gets its own copy */
+  int expected;
+
+  #pragma omp parallel private(local)
+  {
+    int k;
+    /* A private copy starts uninitialized, so each thread seeds its own. */
+    local = 7;
+    #pragma omp for
+    for (k = 1; k < 1000; ++k)
+      local += k;
+    #pragma omp critical
+    {
+      total += local;
+      ++team_size;
+    }
+  }
+
+  /* 1 + 2 + ... + 999, plus one initial 7 per participating thread. */
+  expected = (999 * 1000) / 2 + 7 * team_size;
+  return expected == total;
+}
+
+int main()
+{
+  int failures = 0;
+  int rep;
+
+  /* Repeat the test; the exit status is the number of failed runs. */
+  for (rep = 0; rep < REPETITIONS; ++rep) {
+    if (test_omp_parallel_private() == 0)
+      ++failures;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/parallel/omp_parallel_reduction.c b/final/runtime/test/parallel/omp_parallel_reduction.c
new file mode 100644
index 0000000..bb00939
--- /dev/null
+++ b/final/runtime/test/parallel/omp_parallel_reduction.c
@@ -0,0 +1,254 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+#define DOUBLE_DIGITS 20    /* dt^DOUBLE_DIGITS */
+#define MAX_FACTOR 10
+#define KNOWN_PRODUCT 3628800  /* 10! */
+
+int test_omp_parallel_reduction()
+{
+  int sum;
+  int known_sum;
+  double dsum;
+  double dknown_sum;
+  double dt=0.5; /* base of the geometric series for the + and - tests; reassigned to 1/3 below */
+  double rounding_error= 1.E-9; /* tolerance for the floating-point comparisons */
+  int diff;
+  double ddiff;
+  int product;
+  int known_product;
+  int logic_and;
+  int logic_or;
+  int bit_and;
+  int bit_or;
+  int exclusiv_bit_or;
+  int logics[LOOPCOUNT]; /* operand array for the logical and bitwise tests */
+  int i;
+  double dpt;
+  int result; /* number of failed sub-tests; the function returns 1 only if it stays 0 */
+
+  sum =0;
+  dsum=0;
+  product=1;
+  logic_and=1;
+  logic_or=0;
+  bit_and=1;
+  bit_or=0;
+  exclusiv_bit_or=0;
+  result=0;
+  dt = 1./3.;
+  known_sum = (LOOPCOUNT*(LOOPCOUNT+1))/2;
+
+  /* Integer sum: 1 + 2 + ... + LOOPCOUNT must equal the closed form in known_sum. */
+  #pragma omp parallel for schedule(dynamic,1) private(i) reduction(+:sum)
+  for (i=1;i<=LOOPCOUNT;i++) {
+    sum=sum+i;
+  }
+
+  if(known_sum!=sum) {
+    result++;
+    fprintf(stderr,"Error in sum with integers: Result was %d instead of %d\n",sum,known_sum);
+  }
+
+  diff = (LOOPCOUNT*(LOOPCOUNT+1))/2; /* start from the full sum so that subtracting 1..LOOPCOUNT leaves 0 */
+  #pragma omp parallel for schedule(dynamic,1) private(i) reduction(-:diff)
+  for (i=1;i<=LOOPCOUNT;++i) {
+    diff=diff-i;
+  }
+
+  if(diff != 0) {
+    result++;
+    fprintf(stderr,"Error in difference with integers: Result was %d instead of 0.\n",diff);
+  }
+
+  /* Double sum: partial geometric series in dt, compared within rounding_error. */
+  dsum=0;
+  dpt=1;
+  for (i=0;i<DOUBLE_DIGITS;++i) {
+    dpt*=dt;
+  }
+  dknown_sum = (1-dpt)/(1-dt); /* closed form; dpt is dt^DOUBLE_DIGITS from the loop above */
+  #pragma omp parallel for schedule(dynamic,1) private(i) reduction(+:dsum)
+  for (i=0;i<DOUBLE_DIGITS;++i) {
+    dsum += pow(dt,i);
+  }
+
+  if( fabs(dsum-dknown_sum) > rounding_error ) {
+    result++;
+    fprintf(stderr,"Error in sum with doubles: Result was %f instead of %f (Difference: %E)\n",dsum,dknown_sum, dsum-dknown_sum);
+  }
+
+  dpt=1;
+
+  for (i=0;i<DOUBLE_DIGITS;++i) {
+    dpt*=dt;
+  }
+  fprintf(stderr,"\n"); /* writes a bare newline to stderr on every call */
+  ddiff = (1-dpt)/(1-dt);
+  #pragma omp parallel for schedule(dynamic,1) private(i) reduction(-:ddiff)
+  for (i=0;i<DOUBLE_DIGITS;++i) {
+    ddiff -= pow(dt,i);
+  }
+  if( fabs(ddiff) > rounding_error) {
+    result++;
+    fprintf(stderr,"Error in Difference with doubles: Result was %E instead of 0.0\n",ddiff);
+  }
+
+  /* Integer product: 1 * 2 * ... * MAX_FACTOR must equal KNOWN_PRODUCT. */
+  #pragma omp parallel for schedule(dynamic,1) private(i) reduction(*:product)
+  for(i=1;i<=MAX_FACTOR;i++) {
+    product *= i;
+  }
+
+  known_product = KNOWN_PRODUCT;
+  if(known_product != product) {
+    result++;
+    fprintf(stderr,"Error in Product with integers: Result was %d instead of %d\n\n",product,known_product);
+  }
+
+  /* Logical AND: all ones must give true (part 1); a single zero must give false (part 2). */
+  for(i=0;i<LOOPCOUNT;i++) {
+    logics[i]=1;
+  }
+
+  #pragma omp parallel for schedule(dynamic,1) private(i) reduction(&&:logic_and)
+  for(i=0;i<LOOPCOUNT;++i) {
+    logic_and = (logic_and && logics[i]);
+  }
+  if(!logic_and) {
+    result++;
+    fprintf(stderr,"Error in logic AND part 1.\n");
+  }
+
+  logic_and = 1;
+  logics[LOOPCOUNT/2]=0;
+
+  #pragma omp parallel for schedule(dynamic,1) private(i) reduction(&&:logic_and)
+  for(i=0;i<LOOPCOUNT;++i) {
+    logic_and = logic_and && logics[i];
+  }
+  if(logic_and) {
+    result++;
+    fprintf(stderr,"Error in logic AND part 2.\n");
+  }
+
+  /* Logical OR: all zeros must give false (part 1); a single one must give true (part 2). */
+  for(i=0;i<LOOPCOUNT;i++) {
+    logics[i]=0;
+  }
+
+  #pragma omp parallel for schedule(dynamic,1) private(i) reduction(||:logic_or)
+  for(i=0;i<LOOPCOUNT;++i) {
+    logic_or = logic_or || logics[i];
+  }
+  if(logic_or) {
+    result++;
+    fprintf(stderr,"Error in logic OR part 1.\n");
+  }
+  logic_or = 0;
+  logics[LOOPCOUNT/2]=1;
+
+  #pragma omp parallel for schedule(dynamic,1) private(i) reduction(||:logic_or)
+  for(i=0;i<LOOPCOUNT;++i) {
+    logic_or = logic_or || logics[i];
+  }
+  if(!logic_or) {
+    result++;
+    fprintf(stderr,"Error in logic OR part 2.\n");
+  }
+
+  /* Bitwise AND: all ones must give nonzero (part 1); a single zero must give 0 (part 2). */
+  for(i=0;i<LOOPCOUNT;++i) {
+    logics[i]=1;
+  }
+
+  #pragma omp parallel for schedule(dynamic,1) private(i) reduction(&:bit_and)
+  for(i=0;i<LOOPCOUNT;++i) {
+    bit_and = (bit_and & logics[i]);
+  }
+  if(!bit_and) {
+    result++;
+    fprintf(stderr,"Error in BIT AND part 1.\n");
+  }
+
+  bit_and = 1;
+  logics[LOOPCOUNT/2]=0;
+
+  #pragma omp parallel for schedule(dynamic,1) private(i) reduction(&:bit_and)
+  for(i=0;i<LOOPCOUNT;++i) {
+    bit_and = bit_and & logics[i];
+  }
+  if(bit_and) {
+    result++;
+    fprintf(stderr,"Error in BIT AND part 2.\n");
+  }
+
+  for(i=0;i<LOOPCOUNT;i++) {
+    logics[i]=0;
+  }
+
+  /* Bitwise OR: all zeros must give 0 (part 1); a single one must give nonzero (part 2). */
+  #pragma omp parallel for schedule(dynamic,1) private(i) reduction(|:bit_or)
+  for(i=0;i<LOOPCOUNT;++i) {
+    bit_or = bit_or | logics[i];
+  }
+  if(bit_or) {
+    result++;
+    fprintf(stderr,"Error in BIT OR part 1\n");
+  }
+  bit_or = 0;
+  logics[LOOPCOUNT/2]=1;
+
+  #pragma omp parallel for schedule(dynamic,1) private(i) reduction(|:bit_or)
+  for(i=0;i<LOOPCOUNT;++i) {
+    bit_or = bit_or | logics[i];
+  }
+  if(!bit_or) {
+    result++;
+    fprintf(stderr,"Error in BIT OR part 2\n");
+  }
+
+  for(i=0;i<LOOPCOUNT;i++) {
+    logics[i]=0;
+  }
+
+  /* Bitwise XOR: all zeros must give 0 (part 1); exactly one 1 must give nonzero (part 2). */
+  #pragma omp parallel for schedule(dynamic,1) private(i) reduction(^:exclusiv_bit_or)
+  for(i=0;i<LOOPCOUNT;++i) {
+    exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+  }
+  if(exclusiv_bit_or) {
+    result++;
+    fprintf(stderr,"Error in EXCLUSIV BIT OR part 1\n");
+  }
+
+  exclusiv_bit_or = 0;
+  logics[LOOPCOUNT/2]=1;
+
+  #pragma omp parallel for schedule(dynamic,1) private(i) reduction(^:exclusiv_bit_or)
+  for(i=0;i<LOOPCOUNT;++i) {
+    exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+  }
+  if(!exclusiv_bit_or) {
+    result++;
+    fprintf(stderr,"Error in EXCLUSIV BIT OR part 2\n");
+  }
+
+  /*printf("\nResult:%d\n",result);*/
+  return (result==0);
+}
+
+int main()
+{
+  int failures = 0;
+  int rep;
+
+  /* Repeat the test; the exit status is the number of failed runs. */
+  for (rep = 0; rep < REPETITIONS; ++rep) {
+    if (test_omp_parallel_reduction() == 0)
+      ++failures;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/parallel/omp_parallel_shared.c b/final/runtime/test/parallel/omp_parallel_shared.c
new file mode 100644
index 0000000..3146ca6
--- /dev/null
+++ b/final/runtime/test/parallel/omp_parallel_shared.c
@@ -0,0 +1,46 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int test_omp_parallel_shared()
+{
+  int i;
+  int sum = 0;
+  /* Closed form of 1 + 2 + ... + LOOPCOUNT. */
+  int known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+
+  /*
+   * Every thread adds up its share of the iterations in a block-local
+   * variable and then folds that partial result into the shared total
+   * inside a critical section.
+   */
+  #pragma omp parallel private(i) shared(sum)
+  {
+    int partial = 0;
+
+    #pragma omp for
+    for (i = 1; i <= LOOPCOUNT; i++)
+      partial += i;
+
+    #pragma omp critical
+    sum += partial;
+  }
+
+  if (sum == known_sum)
+    return 1;
+  fprintf(stderr, "KNOWN_SUM = %d; SUM = %d\n", known_sum, sum);
+  return 0;
+}
+
+int main()
+{
+  int failures = 0;
+  int rep;
+
+  /* Repeat the test REPETITIONS times; exit status = number of failed runs. */
+  for (rep = 0; rep < REPETITIONS; ++rep)
+    if (test_omp_parallel_shared() == 0)
+      ++failures;
+
+  return failures;
+}
diff --git a/final/runtime/test/tasking/bug_36720.c b/final/runtime/test/tasking/bug_36720.c
new file mode 100644
index 0000000..684d675
--- /dev/null
+++ b/final/runtime/test/tasking/bug_36720.c
@@ -0,0 +1,36 @@
+// RUN: %libomp-compile-and-run
+
+/*
+Bugzilla: https://bugs.llvm.org/show_bug.cgi?id=36720
+
+Assertion failure at kmp_runtime.cpp(1715): nthreads > 0.
+OMP: Error #13: Assertion failure at kmp_runtime.cpp(1715).
+
+The assertion fails even with OMP_NUM_THREADS=1. If the second task is removed,
+everything runs to completion. If the "omp parallel for" directives are removed
+from inside the tasks, once again everything runs fine.
+*/
+
+#define N 1024
+
+int main() { // regression reproducer: the test passes if the run completes
+  #pragma omp task
+  { // first task, generated outside of any parallel region
+    int i;
+    #pragma omp parallel for
+    for (i = 0; i < N; i++)
+      (void)0; // empty body: only the nested parallel-for construct matters
+  }
+
+  #pragma omp task
+  { // second task; per the bug report above, removing it hides the failure
+    int i;
+    #pragma omp parallel for
+    for (i = 0; i < N; ++i)
+      (void)0;
+  }
+
+  #pragma omp taskwait
+
+  return 0; // reaching this point without a runtime assertion is the check
+}
diff --git a/final/runtime/test/tasking/bug_nested_proxy_task.c b/final/runtime/test/tasking/bug_nested_proxy_task.c
new file mode 100644
index 0000000..f70e904
--- /dev/null
+++ b/final/runtime/test/tasking/bug_nested_proxy_task.c
@@ -0,0 +1,131 @@
+// RUN: %libomp-compile-and-run
+// The runtime currently does not get dependency information from GCC.
+// UNSUPPORTED: gcc
+
+#include <stdio.h>
+#include <omp.h>
+#include <pthread.h>
+#include "omp_my_sleep.h"
+
+/*
+ With task dependencies one can generate proxy tasks from an explicit task
+ being executed by a serial task team. The OpenMP runtime library didn't
+ expect that and tries to free the explicit task that is the parent of the
+ proxy task still working in background. It therefore has incomplete children
+ which triggers a debugging assertion.
+*/
+
+// Compiler-generated code (emulation)
+typedef long kmp_intptr_t;
+typedef int kmp_int32;
+
+typedef char bool;
+
+typedef struct ident {
+    kmp_int32 reserved_1;   /**<  might be used in Fortran; see above  */
+    kmp_int32 flags;        /**<  also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC identifies this union member  */
+    kmp_int32 reserved_2;   /**<  not really used in Fortran any more; see above */
+#if USE_ITT_BUILD
+                            /*  but currently used for storing region-specific ITT */
+                            /*  contextual information. */
+#endif /* USE_ITT_BUILD */
+    kmp_int32 reserved_3;   /**< source[4] in Fortran, do not use for C++  */
+    char const *psource;    /**< String describing the source location.
+                            The string is composed of semi-colon separated fields which describe the source file,
+                            the function and a pair of line numbers that delimit the construct.
+                             */
+} ident_t;
+
+typedef struct kmp_depend_info { // one dependence entry passed to __kmpc_omp_task_with_deps
+     kmp_intptr_t               base_addr; // address of the dependence object, stored as an integer
+     size_t                     len;       // size of the dependence object in bytes
+     struct {
+         bool                   in:1;      // "in" dependence bit
+         bool                   out:1;     // "out" dependence bit (main sets both bits for an out dependence)
+     } flags;
+} kmp_depend_info_t;
+
+struct kmp_task;
+typedef kmp_int32 (* kmp_routine_entry_t)( kmp_int32, struct kmp_task * );
+
+typedef struct kmp_task {                   /* GEH: Shouldn't this be aligned somehow? */
+    void *              shareds;            /**< pointer to block of pointers to shared vars   */
+    kmp_routine_entry_t routine;            /**< pointer to routine to call for executing task */
+    kmp_int32           part_id;            /**< part id for the task                          */
+} kmp_task_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+kmp_int32  __kmpc_global_thread_num  ( ident_t * );
+kmp_task_t*
+__kmpc_omp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
+                       size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+                       kmp_routine_entry_t task_entry );
+void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask );
+kmp_int32 __kmpc_omp_task_with_deps ( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task,
+                                      kmp_int32 ndeps, kmp_depend_info_t *dep_list,
+                                      kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list );
+kmp_int32
+__kmpc_omp_task( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task );
+#ifdef __cplusplus
+}
+#endif
+
+void *target(void *task) // pthread start routine; 'task' is the proxy task handed over by task_entry
+{
+    my_sleep( 0.1 ); // stand-in for the offloaded work (see the "Corresponds to" notes in main)
+    __kmpc_proxy_task_completed_ooo((kmp_task_t*) task); // report completion of the proxy task from this helper thread
+    return NULL;
+}
+
+pthread_t target_thread;
+
+// User's code
+int task_entry(kmp_int32 gtid, kmp_task_t *task) // task routine registered through __kmpc_omp_task_alloc
+{
+    pthread_create(&target_thread, NULL, &target, task); // target() completes the task later; result is unchecked and the thread is never joined
+    return 0; // returns right away, before the helper thread has finished
+}
+
+int main()
+{
+    int dep; // only its address is used, as the dependence object
+
+#pragma omp taskgroup
+{ // the tasks below are generated inside this taskgroup region
+/*
+ *  Corresponds to:
+    #pragma omp target nowait depend(out: dep)
+    {
+        my_sleep( 0.1 );
+    }
+*/
+    kmp_depend_info_t dep_info;
+    dep_info.base_addr = (long) &dep;
+    dep_info.len = sizeof(int);
+    // out = inout per spec and runtime expects this
+    dep_info.flags.in = 1;
+    dep_info.flags.out = 1;
+
+    kmp_int32 gtid = __kmpc_global_thread_num(NULL);
+    kmp_task_t *proxy_task = __kmpc_omp_task_alloc(NULL,gtid,17,sizeof(kmp_task_t),0,&task_entry); // NOTE(review): flags 17 presumably means "tied | proxy" - confirm against the runtime's task flag layout
+    __kmpc_omp_task_with_deps(NULL,gtid,proxy_task,1,&dep_info,0,NULL); // one dependence, no noalias dependences
+
+    #pragma omp task depend(in: dep)
+    { // explicit task that depends on 'dep', the object the proxy task above declares as out
+/*
+ *      Corresponds to:
+        #pragma omp target nowait
+        {
+            my_sleep( 0.1 );
+        }
+*/
+        kmp_task_t *nested_proxy_task = __kmpc_omp_task_alloc(NULL,gtid,17,sizeof(kmp_task_t),0,&task_entry); // second proxy task, created from inside the explicit task
+        __kmpc_omp_task(NULL,gtid,nested_proxy_task); // no dependences on the nested proxy task
+    }
+}
+
+    // only check that it didn't crash
+    return 0;
+}
diff --git a/final/runtime/test/tasking/bug_proxy_task_dep_waiting.c b/final/runtime/test/tasking/bug_proxy_task_dep_waiting.c
new file mode 100644
index 0000000..c07f399
--- /dev/null
+++ b/final/runtime/test/tasking/bug_proxy_task_dep_waiting.c
@@ -0,0 +1,134 @@
+// RUN: %libomp-compile-and-run
+// The runtime currently does not get dependency information from GCC.
+// UNSUPPORTED: gcc
+
+#include <stdio.h>
+#include <omp.h>
+#include <pthread.h>
+#include "omp_my_sleep.h"
+
+/*
+ An explicit task can have a dependency on a target task. If it is not
+ directly satisfied, the runtime should not wait but resume execution.
+*/
+
+// Compiler-generated code (emulation)
+typedef long kmp_intptr_t;
+typedef int kmp_int32;
+
+typedef char bool;
+
+typedef struct ident {
+    kmp_int32 reserved_1;   /**<  might be used in Fortran; see above  */
+    kmp_int32 flags;        /**<  also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC identifies this union member  */
+    kmp_int32 reserved_2;   /**<  not really used in Fortran any more; see above */
+#if USE_ITT_BUILD
+                            /*  but currently used for storing region-specific ITT */
+                            /*  contextual information. */
+#endif /* USE_ITT_BUILD */
+    kmp_int32 reserved_3;   /**< source[4] in Fortran, do not use for C++  */
+    char const *psource;    /**< String describing the source location.
+                            The string is composed of semi-colon separated fields which describe the source file,
+                            the function and a pair of line numbers that delimit the construct.
+                             */
+} ident_t;
+
+typedef struct kmp_depend_info { // one dependence entry passed to __kmpc_omp_task_with_deps
+     kmp_intptr_t               base_addr; // address of the dependence object, stored as an integer
+     size_t                     len;       // size of the dependence object in bytes
+     struct {
+         bool                   in:1;      // "in" dependence bit
+         bool                   out:1;     // "out" dependence bit (main sets both bits for an out dependence)
+     } flags;
+} kmp_depend_info_t;
+
+struct kmp_task;
+typedef kmp_int32 (* kmp_routine_entry_t)( kmp_int32, struct kmp_task * );
+
+typedef struct kmp_task {                   /* GEH: Shouldn't this be aligned somehow? */
+    void *              shareds;            /**< pointer to block of pointers to shared vars   */
+    kmp_routine_entry_t routine;            /**< pointer to routine to call for executing task */
+    kmp_int32           part_id;            /**< part id for the task                          */
+} kmp_task_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+kmp_int32  __kmpc_global_thread_num  ( ident_t * );
+kmp_task_t*
+__kmpc_omp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
+                       size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+                       kmp_routine_entry_t task_entry );
+void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask );
+kmp_int32 __kmpc_omp_task_with_deps ( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task,
+                                      kmp_int32 ndeps, kmp_depend_info_t *dep_list,
+                                      kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list );
+kmp_int32
+__kmpc_omp_task( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task );
+#ifdef __cplusplus
+}
+#endif
+
+void *target(void *task) // pthread start routine; 'task' is the proxy task handed over by task_entry
+{
+    my_sleep( 0.1 ); // stand-in for the offloaded work (see the "Corresponds to" note in main)
+    __kmpc_proxy_task_completed_ooo((kmp_task_t*) task); // report completion of the proxy task from this helper thread
+    return NULL;
+}
+
+pthread_t target_thread;
+
+// User's code
+int task_entry(kmp_int32 gtid, kmp_task_t *task) // task routine registered through __kmpc_omp_task_alloc
+{
+    pthread_create(&target_thread, NULL, &target, task); // target() completes the task later; result is unchecked and the thread is never joined
+    return 0; // returns right away, before the helper thread has finished
+}
+
+int main()
+{
+    int dep; // only its address is used, as the dependence object
+
+/*
+ *  Corresponds to:
+    #pragma omp target nowait depend(out: dep)
+    {
+        my_sleep( 0.1 );
+    }
+*/
+    kmp_depend_info_t dep_info;
+    dep_info.base_addr = (long) &dep;
+    dep_info.len = sizeof(int);
+    // out = inout per spec and runtime expects this
+    dep_info.flags.in = 1;
+    dep_info.flags.out = 1;
+
+    kmp_int32 gtid = __kmpc_global_thread_num(NULL);
+    kmp_task_t *proxy_task = __kmpc_omp_task_alloc(NULL,gtid,17,sizeof(kmp_task_t),0,&task_entry); // NOTE(review): flags 17 presumably means "tied | proxy" - confirm against the runtime's task flag layout
+    __kmpc_omp_task_with_deps(NULL,gtid,proxy_task,1,&dep_info,0,NULL); // one dependence, no noalias dependences
+
+    int first_task_finished = 0;
+    #pragma omp task shared(first_task_finished) depend(inout: dep)
+    { // depends on 'dep', which the proxy task above declares as out
+        first_task_finished = 1;
+    }
+
+    int second_task_finished = 0;
+    #pragma omp task shared(second_task_finished) depend(in: dep)
+    { // depends on 'dep' as well, after the inout task above
+        second_task_finished = 1;
+    }
+
+    // check that execution has been resumed and the runtime has not waited
+    // for the dependencies to be satisfied.
+    int error = (first_task_finished == 1); // a flag already set here counts as one error
+    error += (second_task_finished == 1);
+
+    #pragma omp taskwait
+
+    // by now all tasks should have finished
+    error += (first_task_finished != 1); // a flag still unset here counts as one error
+    error += (second_task_finished != 1);
+
+    return error; // 0 on success; otherwise the number of failed checks
+}
diff --git a/final/runtime/test/tasking/bug_serial_taskgroup.c b/final/runtime/test/tasking/bug_serial_taskgroup.c
new file mode 100644
index 0000000..850bc90
--- /dev/null
+++ b/final/runtime/test/tasking/bug_serial_taskgroup.c
@@ -0,0 +1,16 @@
+// RUN: %libomp-compile-and-run
+
+/*
+ GCC failed this test because __kmp_get_gtid() instead of __kmp_entry_gtid()
+ was called in xexpand(KMP_API_NAME_GOMP_TASKGROUP_START)(void).
+ __kmp_entry_gtid() will initialize the runtime if not yet done which does not
+ happen with __kmp_get_gtid().
+ */
+
+int main()
+{
+    #pragma omp taskgroup
+    { } // empty region: entering and leaving the taskgroup is the whole test
+
+    return 0; // success means the taskgroup above did not crash
+}
diff --git a/final/runtime/test/tasking/kmp_detach_tasks_t1.c b/final/runtime/test/tasking/kmp_detach_tasks_t1.c
new file mode 100644
index 0000000..f1763ec
--- /dev/null
+++ b/final/runtime/test/tasking/kmp_detach_tasks_t1.c
@@ -0,0 +1,113 @@
+// RUN: %libomp-compile && env OMP_NUM_THREADS='3' %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS='1' %libomp-run
+
+#include <stdio.h>
+#include <omp.h>
+#include "omp_my_sleep.h"
+
+// detached untied
+#define PTASK_FLAG_DETACHABLE 0x40
+
+// OpenMP RTL interfaces
+typedef unsigned long long kmp_uint64;
+typedef long long kmp_int64;
+
+typedef struct ID {
+  int reserved_1;
+  int flags;
+  int reserved_2;
+  int reserved_3;
+  char *psource;
+} id;
+
+// Compiler-generated code (emulation)
+typedef struct ident {
+  void* dummy; // not used in the library
+} ident_t;
+
+typedef enum kmp_event_type_t {
+  KMP_EVENT_UNINITIALIZED = 0,
+  KMP_EVENT_ALLOW_COMPLETION = 1
+} kmp_event_type_t;
+
+typedef struct {
+  kmp_event_type_t type;
+  union {
+    void *task;
+  } ed;
+} kmp_event_t;
+
+typedef struct shar { // shareds used in the task
+} *pshareds;
+
+typedef struct task {
+  pshareds shareds;
+  int(*routine)(int,struct task*);
+  int part_id;
+// void *destructor_thunk; // optional, needs flag setting if provided
+// int priority; // optional, needs flag setting if provided
+// ------------------------------
+// privates used in the task:
+  omp_event_handle_t evt;
+} *ptask, kmp_task_t;
+
+typedef int(*task_entry_t)(int, ptask);
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern int __kmpc_global_thread_num(void *id_ref); // global thread id of the caller; this test passes NULL
+extern ptask __kmpc_omp_task_alloc(id *loc, int gtid, int flags,
+                                   size_t sz, size_t shar, task_entry_t rtn); // allocates a task whose routine is 'rtn'
+extern int __kmpc_omp_task(id *loc, int gtid, ptask task); // hands an allocated task to the runtime
+extern omp_event_handle_t __kmpc_task_allow_completion_event(
+                              ident_t *loc_ref, int gtid, ptask task); // event handle later passed to omp_fulfill_event
+#ifdef __cplusplus
+}
+#endif
+
+int volatile checker;
+
+// User's code, outlined into task entry
+int task_entry(int gtid, ptask task) { // task routine passed to __kmpc_omp_task_alloc; both arguments are unused
+  checker = 1; // records that the task body ran; main checks this flag
+  return 0;
+}
+
+int main() {
+  int i, j, gtid = __kmpc_global_thread_num(NULL); // i and j are unused; this gtid is shadowed inside the master block
+  int nt = omp_get_max_threads(); // value is not used afterwards in this test
+  ptask task;
+  pshareds psh;
+  checker = 0;
+  omp_set_dynamic(0);
+  #pragma omp parallel //num_threads(N)
+  {
+    #pragma omp master
+    {
+      int gtid = __kmpc_global_thread_num(NULL);
+      omp_event_handle_t evt;
+/* The calls below emulate what a compiler would generate for:
+      #pragma omp task detach(evt)
+      {}
+*/
+      task = (ptask)__kmpc_omp_task_alloc(NULL,gtid,PTASK_FLAG_DETACHABLE,sizeof(struct task),sizeof(struct shar),&task_entry);
+      psh = task->shareds; // psh is not read again in this test
+      evt = (omp_event_handle_t)__kmpc_task_allow_completion_event(NULL,gtid,task);
+      task->evt = evt; // keep the handle in the task's private area
+
+      __kmpc_omp_task(NULL, gtid, task); // submit the detachable task
+      my_sleep(2.0); // delay before the event is fulfilled
+      omp_fulfill_event(evt);
+
+    } // end master
+  } // end parallel
+
+  // check results
+  if (checker == 1) {
+    printf("passed\n");
+    return 0;
+  } else {
+    printf("failed\n");
+    return 1;
+  }
+}
diff --git a/final/runtime/test/tasking/kmp_detach_tasks_t2.c b/final/runtime/test/tasking/kmp_detach_tasks_t2.c
new file mode 100644
index 0000000..66fcb8f
--- /dev/null
+++ b/final/runtime/test/tasking/kmp_detach_tasks_t2.c
@@ -0,0 +1,116 @@
+// RUN: %libomp-compile && env OMP_NUM_THREADS='3' %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS='1' %libomp-run
+
+#include <stdio.h>
+#include <omp.h>
+#include "omp_my_sleep.h"
+
+// detached tied
+#define PTASK_FLAG_DETACHABLE 0x41
+
+// OpenMP RTL interfaces
+typedef unsigned long long kmp_uint64;
+typedef long long kmp_int64;
+
+typedef struct ID {
+  int reserved_1;
+  int flags;
+  int reserved_2;
+  int reserved_3;
+  char *psource;
+} id;
+
+// Compiler-generated code (emulation)
+typedef struct ident {
+  void* dummy; // not used in the library
+} ident_t;
+
+typedef enum kmp_event_type_t {
+  KMP_EVENT_UNINITIALIZED = 0,
+  KMP_EVENT_ALLOW_COMPLETION = 1
+} kmp_event_type_t;
+
+typedef struct {
+  kmp_event_type_t type;
+  union {
+    void *task;
+  } ed;
+} kmp_event_t;
+
+typedef struct shar { // shareds used in the task
+} *pshareds;
+
+typedef struct task {
+  pshareds shareds;
+  int(*routine)(int,struct task*);
+  int part_id;
+// void *destructor_thunk; // optional, needs flag setting if provided
+// int priority; // optional, needs flag setting if provided
+// ------------------------------
+// privates used in the task:
+  omp_event_handle_t evt;
+} *ptask, kmp_task_t;
+
+typedef int(* task_entry_t)( int, ptask );
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern int  __kmpc_global_thread_num(void *id_ref); // global thread id of the caller; this test passes NULL
+extern ptask __kmpc_omp_task_alloc(id *loc, int gtid, int flags,
+                                   size_t sz, size_t shar, task_entry_t rtn); // allocates a task whose routine is 'rtn'
+extern int __kmpc_omp_task(id *loc, int gtid, kmp_task_t *task); // hands an allocated task to the runtime
+extern omp_event_handle_t __kmpc_task_allow_completion_event(
+                              ident_t *loc_ref, int gtid, kmp_task_t *task); // event handle later passed to omp_fulfill_event
+#ifdef __cplusplus
+}
+#endif
+
+int volatile checker;
+
+// User's code, outlined into task entry
+int task_entry(int gtid, ptask task) { // task routine passed to __kmpc_omp_task_alloc; both arguments are unused
+  my_sleep(2.0); // main fulfills the event right after submitting the task, without a delay of its own
+  checker = 1; // records that the task body ran to the end; main checks this flag
+  return 0;
+}
+
+int main() {
+  int i, j, gtid = __kmpc_global_thread_num(NULL); // i and j are unused; this gtid is shadowed inside the master block
+  int nt = omp_get_max_threads(); // value is not used afterwards in this test
+  ptask task;
+  pshareds psh;
+  checker = 0;
+  omp_set_dynamic(0);
+  #pragma omp parallel //num_threads(N)
+  {
+    #pragma omp master
+    {
+      int gtid = __kmpc_global_thread_num(NULL);
+      omp_event_handle_t evt;
+/* The calls below emulate what a compiler would generate for:
+      #pragma omp task detach(evt)
+      {}
+*/
+      task = (ptask)__kmpc_omp_task_alloc(NULL,gtid,PTASK_FLAG_DETACHABLE,
+                        sizeof(struct task),sizeof(struct shar),&task_entry);
+      psh = task->shareds; // psh is not read again in this test
+      evt = (omp_event_handle_t)__kmpc_task_allow_completion_event(NULL,gtid,task);
+      task->evt = evt; // keep the handle in the task's private area
+      __kmpc_omp_task(NULL, gtid, task); // submit the detachable task
+      omp_fulfill_event(evt); // fulfilled immediately; the task body itself sleeps for 2 seconds
+      #pragma omp taskwait
+      ;
+//      printf("after tw %d\n", omp_get_thread_num());
+    } // end master
+  } // end parallel
+
+  // check results
+  if (checker == 1) {
+    printf("passed\n");
+    return 0;
+  } else {
+    printf("failed\n");
+    return 1;
+  }
+}
diff --git a/final/runtime/test/tasking/kmp_detach_tasks_t3.c b/final/runtime/test/tasking/kmp_detach_tasks_t3.c
new file mode 100644
index 0000000..e14bab6
--- /dev/null
+++ b/final/runtime/test/tasking/kmp_detach_tasks_t3.c
@@ -0,0 +1,138 @@
+// RUN: %libomp-compile && env OMP_NUM_THREADS='3' %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS='1' %libomp-run
+// The runtime currently does not get dependency information from GCC.
+// UNSUPPORTED: gcc
+
+#include <stdio.h>
+#include <omp.h>
+#include "omp_my_sleep.h"
+
+// detached untied
+#define PTASK_FLAG_DETACHABLE 0x40
+
+// OpenMP RTL interfaces
+typedef unsigned long long kmp_uint64;
+typedef long long kmp_int64;
+
+typedef struct ID {
+  int reserved_1;
+  int flags;
+  int reserved_2;
+  int reserved_3;
+  char *psource;
+} id;
+
+// Compiler-generated code (emulation)
+typedef struct ident {
+  void* dummy; // not used in the library
+} ident_t;
+
+typedef enum kmp_event_type_t {
+  KMP_EVENT_UNINITIALIZED = 0,
+  KMP_EVENT_ALLOW_COMPLETION = 1
+} kmp_event_type_t;
+
+typedef struct {
+  kmp_event_type_t type;
+  union {
+    void *task;
+  } ed;
+} kmp_event_t;
+
+typedef struct shar { // shareds used in the task
+} *pshareds;
+
+typedef struct task {
+  pshareds shareds;
+  int(*routine)(int,struct task*);
+  int part_id;
+// void *destructor_thunk; // optional, needs flag setting if provided
+// int priority; // optional, needs flag setting if provided
+// ------------------------------
+// privates used in the task:
+  omp_event_handle_t evt;
+} *ptask, kmp_task_t;
+
+typedef struct DEP { // dependence entry passed to __kmpc_omp_task_with_deps
+  size_t addr; // address of the dependence object (main stores &nt here)
+  size_t len; // main sets this to 0
+  int flags; // main sets 3 - NOTE(review): presumably the in and out bits together; confirm
+} dep;
+
+typedef int(* task_entry_t)( int, ptask );
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern int  __kmpc_global_thread_num(void *id_ref); // global thread id of the caller; this test passes NULL
+extern ptask __kmpc_omp_task_alloc(id *loc, int gtid, int flags,
+                                   size_t sz, size_t shar, task_entry_t rtn); // allocates a task whose routine is 'rtn'
+extern int __kmpc_omp_task_with_deps(id *loc, int gtid, ptask task, int nd,
+               dep *dep_lst, int nd_noalias, dep *noalias_dep_lst); // submits a task together with its dependence lists
+extern int __kmpc_omp_task(id *loc, int gtid, kmp_task_t *task); // not called in this test (its only call is commented out)
+extern omp_event_handle_t __kmpc_task_allow_completion_event(
+                              ident_t *loc_ref, int gtid, kmp_task_t *task); // event handle later passed to omp_fulfill_event
+#ifdef __cplusplus
+}
+#endif
+
+int volatile checker;
+
+// User's code, outlined into task entry
+int task_entry(int gtid, ptask task) { // task routine passed to __kmpc_omp_task_alloc; both arguments are unused
+  checker = 1; // records that the task body ran; main checks this flag
+  return 0;
+}
+
+int main() {
+  int i, j, gtid = __kmpc_global_thread_num(NULL); // i and j are unused; this gtid is shadowed inside the master block
+  int nt = omp_get_max_threads(); // nt doubles as the dependence object for both tasks below
+  ptask task;
+  pshareds psh;
+  checker = 0;
+  omp_set_dynamic(0);
+  #pragma omp parallel //num_threads(N)
+  {
+    #pragma omp master
+    {
+      #pragma omp task depend(inout:nt)
+      { // predecessor task on 'nt'
+        my_sleep(2.0);
+      }
+      int gtid = __kmpc_global_thread_num(NULL);
+      omp_event_handle_t evt;
+/* The calls below emulate what a compiler would generate for:
+      #pragma omp task detach(evt)
+      {}
+*/
+      task = (ptask)__kmpc_omp_task_alloc(NULL,gtid,PTASK_FLAG_DETACHABLE,
+                        sizeof(struct task),sizeof(struct shar),&task_entry);
+      psh = task->shareds; // psh is not read again in this test
+      evt = (omp_event_handle_t)__kmpc_task_allow_completion_event(NULL,gtid,task);
+      task->evt = evt; // keep the handle in the task's private area
+
+      dep sdep;
+      sdep.addr = (size_t)&nt; // same object as in depend(inout:nt) above
+      sdep.len = 0L;
+      sdep.flags = 3; // NOTE(review): presumably in|out, i.e. an inout dependence - confirm
+
+      __kmpc_omp_task_with_deps(NULL,gtid,task,1,&sdep,0,0); // one dependence, no noalias dependences
+      //__kmpc_omp_task(NULL, gtid, task);
+
+      omp_fulfill_event(evt); // fulfilled right after submission, while the predecessor task sleeps for 2 seconds
+
+      #pragma omp taskwait
+      ;
+//      printf("after tw %d\n", omp_get_thread_num());
+    } // end master
+  } // end parallel
+
+  // check results
+  if (checker == 1) {
+    printf("passed\n");
+    return 0;
+  } else {
+    printf("failed\n");
+    return 1;
+  }
+}
diff --git a/final/runtime/test/tasking/kmp_task_modifier_simple_par_new.cpp b/final/runtime/test/tasking/kmp_task_modifier_simple_par_new.cpp
new file mode 100644
index 0000000..f2dea9d
--- /dev/null
+++ b/final/runtime/test/tasking/kmp_task_modifier_simple_par_new.cpp
@@ -0,0 +1,99 @@
+// RUN: %libomp-cxx-compile-and-run
+
+#include <stdio.h>
+#include <omp.h>
+
+#define NT 4
+#define INIT 10
+
+/*
+The test emulates code generation needed for reduction with task modifier on
+parallel construct.
+
+Note: tasks could just use in_reduction clause, but compiler does not accept
+this because of bug: it mistakenly requires reduction item to be shared, which
+is only true for reduction on worksharing and wrong for task reductions.
+*/
+
+//------------------------------------------------
+// OpenMP runtime library routines
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void *item);
+// extern void* __kmpc_task_reduction_modifier_init(void *loc, int gtid, int
+// is_ws, int num, void* data);
+extern void *__kmpc_taskred_modifier_init(void *loc, int gtid, int is_ws,
+                                          int num, void *data);
+extern void __kmpc_task_reduction_modifier_fini(void *loc, int gtid, int is_ws);
+extern int __kmpc_global_thread_num(void *);
+#ifdef __cplusplus
+}
+#endif
+
+//------------------------------------------------
+// Compiler-generated code
+
+typedef struct red_input {
+  void *reduce_shar; /**< item shared between tasks, to reduce into */
+  void *reduce_orig; /**< original reduction item used for initialization */
+  size_t reduce_size; /**< size of data item in bytes */
+  // three compiler-generated routines (init, fini are optional):
+  void *reduce_init; /**< data initialization routine (single parameter) */
+  void *reduce_fini; /**< data finalization routine */
+  void *reduce_comb; /**< data combiner routine */
+  unsigned flags; /**< flags for additional info from compiler */
+} red_input_t;
+
+void i_comb(void *lhs, void *rhs) { *(int *)lhs += *(int *)rhs; } // combiner: adds the int at rhs to the int at lhs
+
+int main() {
+  int var = INIT;
+  int *p_var_orig = &var;
+  omp_set_dynamic(0);
+  omp_set_num_threads(NT);
+//  #pragma omp parallel reduction(task,+:var)
+#pragma omp parallel reduction(+ : var) shared(p_var_orig)
+  {
+    int gtid = __kmpc_global_thread_num(NULL);
+    void *tg; // pointer to taskgroup (optional)
+    red_input_t r_var = {}; // zero-initialize: 'flags' is never assigned below but the struct is read by the runtime
+    r_var.reduce_shar = &var;
+    r_var.reduce_orig =
+        p_var_orig; // not used in this test but illustrates codegen
+    r_var.reduce_size = sizeof(var);
+    r_var.reduce_init = NULL;
+    r_var.reduce_fini = NULL;
+    r_var.reduce_comb = (void *)&i_comb;
+    tg = __kmpc_taskred_modifier_init(
+        NULL, // ident_t loc;
+        gtid,
+        0, // 1 - worksharing construct, 0 - parallel
+        1, // number of reduction objects
+        &r_var // related data
+        );
+    var++; // contribution 1: every thread, NT in total
+#pragma omp task /*in_reduction(+:var)*/ shared(var)
+    { // contribution 2: one task per thread, NT in total
+      int gtid = __kmpc_global_thread_num(NULL);
+      int *p_var = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &var);
+      *p_var += 1;
+    }
+    if (omp_get_thread_num() > 0) {
+#pragma omp task /*in_reduction(+:var)*/ shared(var)
+      { // contribution 3: every thread except thread 0, NT - 1 in total
+        int gtid = __kmpc_global_thread_num(NULL);
+        int *p_var = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &var);
+        *p_var += 1;
+      }
+    }
+    __kmpc_task_reduction_modifier_fini(NULL, gtid, 0);
+  }
+  if (var == INIT + NT * 3 - 1) { // INIT + NT + NT + (NT - 1)
+    printf("passed\n");
+    return 0;
+  } else {
+    printf("failed: var = %d (!= %d)\n", var, INIT + NT * 3 - 1);
+    return 1;
+  }
+}
diff --git a/final/runtime/test/tasking/kmp_task_modifier_simple_par_old.cpp b/final/runtime/test/tasking/kmp_task_modifier_simple_par_old.cpp
new file mode 100644
index 0000000..2526d4e
--- /dev/null
+++ b/final/runtime/test/tasking/kmp_task_modifier_simple_par_old.cpp
@@ -0,0 +1,93 @@
+// RUN: %libomp-cxx-compile-and-run
+
+#include <stdio.h>
+#include <omp.h>
+
+#define NT 4
+#define INIT 10
+
+/*
+The test emulates code generation needed for reduction with task modifier on
+parallel construct.
+
+Note: tasks could just use in_reduction clause, but compiler does not accept
+this because of bug: it mistakenly requires reduction item to be shared, which
+is only true for reduction on worksharing and wrong for task reductions.
+*/
+
+//------------------------------------------------
+// OpenMP runtime library routines
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void *item);
+extern void *__kmpc_task_reduction_modifier_init(void *loc, int gtid, int is_ws,
+                                                 int num, void *data);
+extern void __kmpc_task_reduction_modifier_fini(void *loc, int gtid, int is_ws);
+extern int __kmpc_global_thread_num(void *);
+#ifdef __cplusplus
+}
+#endif
+
+//------------------------------------------------
+// Compiler-generated code
+
+typedef struct red_input {
+  void *reduce_shar; /**< item shared between tasks, to reduce into */
+  size_t reduce_size; /**< size of data item in bytes */
+  // three compiler-generated routines (init, fini are optional):
+  void *reduce_init; /**< data initialization routine (single parameter) */
+  void *reduce_fini; /**< data finalization routine */
+  void *reduce_comb; /**< data combiner routine */
+  unsigned flags; /**< flags for additional info from compiler */
+} red_input_t;
+
+void i_comb(void *lhs, void *rhs) { *(int *)lhs += *(int *)rhs; } // combiner: adds the int at rhs to the int at lhs
+
+int main() {
+  int var = INIT;
+  omp_set_dynamic(0);
+  omp_set_num_threads(NT);
+//  #pragma omp parallel reduction(task,+:var)
+#pragma omp parallel reduction(+ : var)
+  {
+    int gtid = __kmpc_global_thread_num(NULL);
+    void *tg; // pointer to taskgroup (optional)
+    red_input_t r_var = {}; // zero-initialize: 'flags' is never assigned below but the struct is read by the runtime
+    r_var.reduce_shar = &var;
+    r_var.reduce_size = sizeof(var);
+    r_var.reduce_init = NULL;
+    r_var.reduce_fini = NULL;
+    r_var.reduce_comb = (void *)&i_comb;
+    tg = __kmpc_task_reduction_modifier_init(
+        NULL, // ident_t loc;
+        gtid,
+        0, // 1 - worksharing construct, 0 - parallel
+        1, // number of reduction objects
+        &r_var // related data
+        );
+    var++; // contribution 1: every thread, NT in total
+#pragma omp task /*in_reduction(+:var)*/ shared(var)
+    { // contribution 2: one task per thread, NT in total
+      int gtid = __kmpc_global_thread_num(NULL);
+      int *p_var = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &var);
+      *p_var += 1;
+    }
+    if (omp_get_thread_num() > 0) {
+#pragma omp task /*in_reduction(+:var)*/ shared(var)
+      { // contribution 3: every thread except thread 0, NT - 1 in total
+        int gtid = __kmpc_global_thread_num(NULL);
+        int *p_var = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &var);
+        *p_var += 1;
+      }
+    }
+    __kmpc_task_reduction_modifier_fini(NULL, gtid, 0);
+  }
+  if (var == INIT + NT * 3 - 1) { // INIT + NT + NT + (NT - 1)
+    printf("passed\n");
+    return 0;
+  } else {
+    printf("failed: var = %d (!= %d)\n", var, INIT + NT * 3 - 1);
+    return 1;
+  }
+}
diff --git a/final/runtime/test/tasking/kmp_task_modifier_simple_ws_new.cpp b/final/runtime/test/tasking/kmp_task_modifier_simple_ws_new.cpp
new file mode 100644
index 0000000..e66cda9
--- /dev/null
+++ b/final/runtime/test/tasking/kmp_task_modifier_simple_ws_new.cpp
@@ -0,0 +1,114 @@
+// RUN: %libomp-cxx-compile-and-run
+
+#include <stdio.h>
+#include <omp.h>
+
+#define NT 4
+#define INIT 10
+
+/*
+The test emulates code generation needed for reduction with task modifier on
+a worksharing (for) construct.
+
+Note: tasks could just use in_reduction clause, but compiler does not accept
+this because of bug: it mistakenly requires reduction item to be shared, which
+is only true for reduction on worksharing and wrong for task reductions.
+*/
+
+//------------------------------------------------
+// OpenMP runtime library routines
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void *item);
+// extern void* __kmpc_task_reduction_modifier_init(void *loc, int gtid, int
+// flags, int num, void* data);
+extern void *__kmpc_taskred_modifier_init(void *loc, int gtid, int is_ws,
+                                          int num, void *data);
+extern void __kmpc_task_reduction_modifier_fini(void *loc, int gtid, int is_ws);
+extern int __kmpc_global_thread_num(void *);
+#ifdef __cplusplus
+}
+#endif
+
+//------------------------------------------------
+// Compiler-generated code
+
+typedef struct red_input {
+  void *reduce_shar; /**< item shared between tasks, to reduce into */
+  void *reduce_orig; /**< original reduction item used for initialization */
+  size_t reduce_size; /**< size of data item in bytes */
+  // three compiler-generated routines (init, fini are optional):
+  void *reduce_init; /**< data initialization routine (single parameter) */
+  void *reduce_fini; /**< data finalization routine */
+  void *reduce_comb; /**< data combiner routine */
+  unsigned flags; /**< flags for additional info from compiler */
+} red_input_t;
+
+void i_comb(void *lhs, void *rhs) { *(int *)lhs += *(int *)rhs; } // combiner: adds the int at rhs to the int at lhs
+
+int main() {
+  int var = INIT;
+  int *p_var_orig = &var;
+  int i;
+  omp_set_dynamic(0);
+  omp_set_num_threads(NT);
+#pragma omp parallel private(i) shared(p_var_orig)
+//  #pragma omp for reduction(task,+:var)
+#pragma omp for reduction(+ : var)
+  for (i = 0; i < NT; ++i) // single iteration per thread
+  {
+    // generated code, which actually should be placed before
+    // loop iterations distribution, but placed here just to show the idea,
+    // and to keep correctness the loop count is equal to number of threads
+    int gtid = __kmpc_global_thread_num(NULL);
+    void *tg; // pointer to taskgroup (optional)
+    red_input_t r_var = {}; // zero-initialize: 'flags' is never assigned below but the struct is read by the runtime
+    r_var.reduce_shar = &var;
+    r_var.reduce_orig =
+        p_var_orig; // not used in this test but illustrates codegen
+    r_var.reduce_size = sizeof(var);
+    r_var.reduce_init = NULL;
+    r_var.reduce_fini = NULL;
+    r_var.reduce_comb = (void *)&i_comb;
+    tg = __kmpc_taskred_modifier_init(
+        NULL, // ident_t loc;
+        gtid,
+        1, // 1 - worksharing construct, 0 - parallel
+        1, // number of reduction objects
+        &r_var // related data
+        );
+    // end of generated code
+    var++; // contribution 1: once per iteration, NT in total
+#pragma omp task /*in_reduction(+:var)*/ shared(var)
+    {
+      // emulate task reduction here because of compiler bug:
+      // it mistakenly declines to accept in_reduction because var is private
+      // outside.
+      int gtid = __kmpc_global_thread_num(NULL);
+      int *p_var = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &var);
+      *p_var += 1; // contribution 2: one task per iteration, NT in total
+    }
+    if (omp_get_thread_num() > 0) {
+#pragma omp task /*in_reduction(+:var)*/ shared(var)
+      { // contribution 3: skipped on thread 0, NT - 1 in total
+        int gtid = __kmpc_global_thread_num(NULL);
+        int *p_var = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &var);
+        *p_var += 1;
+      }
+    }
+    // generated code, which actually should be placed after loop completion
+    // but before barrier and before loop reduction. It is placed here just to
+    // show the idea,
+    // and to keep correctness the loop count is equal to number of threads
+    __kmpc_task_reduction_modifier_fini(NULL, gtid, 1);
+    // end of generated code
+  }
+  if (var == INIT + NT * 3 - 1) { // INIT + NT + NT + (NT - 1)
+    printf("passed\n");
+    return 0;
+  } else {
+    printf("failed: var = %d (!= %d)\n", var, INIT + NT * 3 - 1);
+    return 1;
+  }
+}
diff --git a/final/runtime/test/tasking/kmp_task_modifier_simple_ws_old.cpp b/final/runtime/test/tasking/kmp_task_modifier_simple_ws_old.cpp
new file mode 100644
index 0000000..97d5cb5
--- /dev/null
+++ b/final/runtime/test/tasking/kmp_task_modifier_simple_ws_old.cpp
@@ -0,0 +1,108 @@
+// RUN: %libomp-cxx-compile-and-run
+
+#include <stdio.h>
+#include <omp.h>
+
+#define NT 4
+#define INIT 10
+
+/*
+The test emulates code generation needed for reduction with task modifier on
+worksharing (for) construct.
+
+Note: tasks could just use in_reduction clause, but compiler does not accept
+this because of bug: it mistakenly requires reduction item to be shared, which
+is only true for reduction on worksharing and wrong for task reductions.
+*/
+
+//------------------------------------------------
+// OpenMP runtime library routines
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void *item);
+extern void *__kmpc_task_reduction_modifier_init(void *loc, int gtid, int is_ws,
+                                                 int num, void *data);
+extern void __kmpc_task_reduction_modifier_fini(void *loc, int gtid, int is_ws);
+extern int __kmpc_global_thread_num(void *);
+#ifdef __cplusplus
+}
+#endif
+
+//------------------------------------------------
+// Compiler-generated code
+
+typedef struct red_input {
+  void *reduce_shar; /**< shared between tasks item to reduce into */
+  size_t reduce_size; /**< size of data item in bytes */
+  // three compiler-generated routines (init, fini are optional):
+  void *reduce_init; /**< data initialization routine (single parameter) */
+  void *reduce_fini; /**< data finalization routine */
+  void *reduce_comb; /**< data combiner routine */
+  unsigned flags; /**< flags for additional info from compiler */
+} red_input_t;
+
+void i_comb(void *lhs, void *rhs) { *(int *)lhs += *(int *)rhs; } // int "+" combiner: *lhs += *rhs
+
+int main() {
+  int var = INIT;
+  int i;
+  omp_set_dynamic(0);
+  omp_set_num_threads(NT);
+#pragma omp parallel private(i)
+//  #pragma omp for reduction(task,+:var)
+#pragma omp for reduction(+ : var)
+  for (i = 0; i < NT; ++i) // single iteration per thread
+  {
+    // generated code, which actually should be placed before
+    // loop iterations distribution, but is placed here just to show the idea,
+    // and to keep correctness the loop count is equal to number of threads
+    int gtid = __kmpc_global_thread_num(NULL);
+    void *tg; // pointer to taskgroup (optional)
+    red_input_t r_var = {0}; // zero all fields: flags was otherwise never set
+    r_var.reduce_shar = &var;
+    r_var.reduce_size = sizeof(var);
+    r_var.reduce_init = NULL;
+    r_var.reduce_fini = NULL;
+    r_var.reduce_comb = (void *)&i_comb;
+    tg = __kmpc_task_reduction_modifier_init(
+        NULL, // ident_t loc;
+        gtid,
+        1, // 1 - worksharing construct, 0 - parallel
+        1, // number of reduction objects
+        &r_var // related data
+        );
+    // end of generated code
+    var++;
+#pragma omp task /*in_reduction(+:var)*/ shared(var)
+    {
+      // emulate task reduction here because of compiler bug:
+      // it mistakenly declines to accept in_reduction because var is private
+      // outside.
+      int gtid = __kmpc_global_thread_num(NULL);
+      int *p_var = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &var);
+      *p_var += 1;
+    }
+    if (omp_get_thread_num() > 0) {
+#pragma omp task /*in_reduction(+:var)*/ shared(var)
+      {
+        int gtid = __kmpc_global_thread_num(NULL);
+        int *p_var = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &var);
+        *p_var += 1;
+      }
+    }
+    // generated code, which actually should be placed after loop completion
+    // but before barrier and before loop reduction. It is placed here just to
+    // show the idea,
+    // and to keep correctness the loop count is equal to number of threads
+    __kmpc_task_reduction_modifier_fini(NULL, gtid, 1);
+    // end of generated code
+  }
+  if (var == INIT + NT * 3 - 1) {
+    printf("passed\n");
+    return 0;
+  } else {
+    printf("failed: var = %d (!= %d)\n", var, INIT + NT * 3 - 1);
+    return 1;
+  }
+}
diff --git a/final/runtime/test/tasking/kmp_task_reduction_nest.cpp b/final/runtime/test/tasking/kmp_task_reduction_nest.cpp
new file mode 100644
index 0000000..63dffe4
--- /dev/null
+++ b/final/runtime/test/tasking/kmp_task_reduction_nest.cpp
@@ -0,0 +1,376 @@
+// RUN: %libomp-cxx-compile-and-run
+// RUN: %libomp-cxx-compile -DFLG=1 && %libomp-run
+// GCC-5 is needed for OpenMP 4.0 support (taskgroup)
+// XFAIL: gcc-4
+#include <cstdio>
+#include <cmath>
+#include <cassert>
+#include <omp.h>
+
+// Total number of loop iterations, should be multiple of T for this test
+#define N 10000
+
+// Flag to request lazy (1) or eager (0) allocation of reduction objects
+#ifndef FLG
+#define FLG 0
+#endif
+
+/*
+  // initial user's code that corresponds to pseudo code of the test
+  #pragma omp taskgroup task_reduction(+:i,j) task_reduction(*:x)
+  {
+    for( int l = 0; l < N; ++l ) {
+      #pragma omp task firstprivate(l) in_reduction(+:i) in_reduction(*:x)
+      {
+        i += l;
+        if( l%2 )
+          x *= 1.0 / (l + 1);
+        else
+          x *= (l + 1);
+      }
+    }
+
+    #pragma omp taskgroup task_reduction(-:i,k) task_reduction(+:y)
+    {
+      for( int l = 0; l < N; ++l ) {
+        #pragma omp task firstprivate(l) in_reduction(+:j,y) \
+            in_reduction(*:x) in_reduction(-:k)
+        {
+          j += l;
+          k -= l;
+          y += (double)l;
+          if( l%2 )
+            x *= 1.0 / (l + 1);
+          else
+            x *= (l + 1);
+        }
+        #pragma omp task firstprivate(l) in_reduction(+:y) in_reduction(-:i,k)
+        {
+          i -= l;
+          k -= l;
+          y += (double)l;
+        }
+        #pragma omp task firstprivate(l) in_reduction(+:j) in_reduction(*:x)
+        {
+          j += l;
+          if( l%2 )
+            x *= 1.0 / (l + 1);
+          else
+            x *= (l + 1);
+        }
+      }
+    } // inner reduction
+
+    for( int l = 0; l < N; ++l ) {
+      #pragma omp task firstprivate(l) in_reduction(+:j)
+        j += l;
+    }
+  } // outer reduction
+*/
+
+//------------------------------------------------
+// OpenMP runtime library routines
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern void* __kmpc_task_reduction_get_th_data(int gtid, void* tg, void* item);
+extern void* __kmpc_task_reduction_init(int gtid, int num, void* data);
+extern int __kmpc_global_thread_num(void*);
+#ifdef __cplusplus
+}
+#endif
+
+//------------------------------------------------
+// Compiler-generated code
+
+typedef struct _task_red_item {
+    void       *shar; // shared reduction item
+    size_t      size; // size of data item
+    void       *f_init; // data initialization routine
+    void       *f_fini; // data finalization routine
+    void       *f_comb; // data combiner routine
+    unsigned    flags;
+} _task_red_item_t;
+
+// int:+   no need in init/fini callbacks, valid for subtraction
+void __red_int_add_comb(void *lhs, void *rhs) // combiner
+{ *(int*)lhs += *(int*)rhs; }
+
+// long long:+   no need in init/fini callbacks, valid for subtraction
+void __red_llong_add_comb(void *lhs, void *rhs) // combiner
+{ *(long long*)lhs += *(long long*)rhs; }
+
+// double:*   no need in fini callback
+void __red_dbl_mul_init(void *data) // initializer
+{ *(double*)data = 1.0; }
+void __red_dbl_mul_comb(void *lhs, void *rhs) // combiner
+{ *(double*)lhs *= *(double*)rhs; }
+
+// double:+   no need in init/fini callbacks
+void __red_dbl_add_comb(void *lhs, void *rhs) // combiner
+{ *(double*)lhs += *(double*)rhs; }
+
+// ==============================
+
+void calc_serial(int *pi, long long *pj, double *px, long long *pk, double *py)
+{ // serial reference: applies the same updates as the task pseudo code above
+    for( int l = 0; l < N; ++l ) { // first loop of the outer taskgroup
+        *pi += l;
+        if( l%2 ) // odd l divides, even l multiplies
+          *px *= 1.0 / (l + 1);
+        else
+          *px *= (l + 1);
+    }
+    for( int l = 0; l < N; ++l ) { // inner taskgroup: three tasks per l
+        *pj += l; // first inner task
+        *pk -= l;
+        *py += (double)l;
+        if( l%2 )
+            *px *= 1.0 / (l + 1);
+        else
+            *px *= (l + 1);
+
+        *pi -= l; // second inner task
+        *pk -= l;
+        *py += (double)l;
+
+        *pj += l; // third inner task
+        if( l%2 )
+            *px *= 1.0 / (l + 1);
+        else
+            *px *= (l + 1);
+    }
+    for( int l = 0; l < N; ++l ) { // trailing loop of the outer taskgroup
+        *pj += l;
+    }
+}
+
+//------------------------------------------------
+// Test case
+int main()
+{
+  int nthreads = omp_get_max_threads();
+  int err = 0; // address-lookup mismatches counted by tasks; checked at exit
+  void** ptrs = (void**)malloc(nthreads*sizeof(void*)); // per-thread xp copies
+
+  // user's code ======================================
+  // variables for serial calculations:
+  int is = 3;
+  long long js = -9999999;
+  double xs = 99999.0;
+  long long ks = 99999999;
+  double ys = -99999999.0;
+  // variables for parallel calculations:
+  int ip = 3;
+  long long jp = -9999999;
+  double xp = 99999.0;
+  long long kp = 99999999;
+  double yp = -99999999.0;
+
+  calc_serial(&is, &js, &xs, &ks, &ys);
+  // ==================================================
+  for (int i = 0; i < nthreads; ++i)
+    ptrs[i] = NULL;
+  #pragma omp parallel
+  {
+    #pragma omp single nowait
+    {
+      // outer taskgroup reduces (i,j,x)
+      #pragma omp taskgroup // task_reduction(+:i,j) task_reduction(*:x)
+      {
+        _task_red_item_t red_data[3];
+        red_data[0].shar = &ip;
+        red_data[0].size = sizeof(ip);
+        red_data[0].f_init = NULL; // RTL will zero thread-specific objects
+        red_data[0].f_fini = NULL; // no destructors needed
+        red_data[0].f_comb = (void*)&__red_int_add_comb;
+        red_data[0].flags = FLG;
+        red_data[1].shar = &jp;
+        red_data[1].size = sizeof(jp);
+        red_data[1].f_init = NULL; // RTL will zero thread-specific objects
+        red_data[1].f_fini = NULL; // no destructors needed
+        red_data[1].f_comb = (void*)&__red_llong_add_comb;
+        red_data[1].flags = FLG;
+        red_data[2].shar = &xp;
+        red_data[2].size = sizeof(xp);
+        red_data[2].f_init = (void*)&__red_dbl_mul_init;
+        red_data[2].f_fini = NULL; // no destructors needed
+        red_data[2].f_comb = (void*)&__red_dbl_mul_comb;
+        red_data[2].flags = FLG;
+        int gtid = __kmpc_global_thread_num(NULL);
+        void* tg1 = __kmpc_task_reduction_init(gtid, 3, red_data);
+
+        for( int l = 0; l < N; l += 2 ) {
+          // 2 iterations per task to get correct x value; actually any even
+          // number of iters per task will work, otherwise x loses precision
+          #pragma omp task firstprivate(l) //in_reduction(+:i) in_reduction(*:x)
+          {
+            int gtid = __kmpc_global_thread_num(NULL);
+            int *p_ip = (int*)__kmpc_task_reduction_get_th_data(gtid, tg1, &ip);
+            double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
+                                        gtid, tg1, &xp);
+            if (!ptrs[gtid]) ptrs[gtid] = p_xp;
+
+            // user's pseudo-code ==============================
+            *p_ip += l;
+            *p_xp *= (l + 1);
+
+            *p_ip += l + 1;
+            *p_xp *= 1.0 / (l + 2);
+            // ==================================================
+          }
+        }
+        // inner taskgroup reduces (i,k,y), i is same object as in outer one
+        #pragma omp taskgroup // task_reduction(-:i,k) task_reduction(+:y)
+        {
+          _task_red_item_t red_data[3];
+          red_data[0].shar = &ip;
+          red_data[0].size = sizeof(ip);
+          red_data[0].f_init = NULL; // RTL will zero thread-specific objects
+          red_data[0].f_fini = NULL; // no destructors needed
+          red_data[0].f_comb = (void*)&__red_int_add_comb;
+          red_data[0].flags = FLG;
+          red_data[1].shar = &kp;
+          red_data[1].size = sizeof(kp);
+          red_data[1].f_init = NULL; // RTL will zero thread-specific objects
+          red_data[1].f_fini = NULL; // no destructors needed
+          red_data[1].f_comb = (void*)&__red_llong_add_comb; // same for + and -
+          red_data[1].flags = FLG;
+          red_data[2].shar = &yp;
+          red_data[2].size = sizeof(yp);
+          red_data[2].f_init = NULL; // RTL will zero thread-specific objects
+          red_data[2].f_fini = NULL; // no destructors needed
+          red_data[2].f_comb = (void*)&__red_dbl_add_comb;
+          red_data[2].flags = FLG;
+          int gtid = __kmpc_global_thread_num(NULL);
+          void* tg2 = __kmpc_task_reduction_init(gtid, 3, red_data);
+
+          for( int l = 0; l < N; l += 2 ) {
+            #pragma omp task firstprivate(l)
+            // in_reduction(+:j,y) in_reduction(*:x) in_reduction(-:k)
+            {
+              int gtid = __kmpc_global_thread_num(NULL);
+              long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
+                                                gtid, tg1, &jp);
+              long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
+                                                gtid, tg2, &kp);
+              double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
+                                          gtid, tg1, &xp);
+              double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
+                                          gtid, tg2, &yp);
+              // user's pseudo-code ==============================
+              *p_jp += l;
+              *p_kp -= l;
+              *p_yp += (double)l;
+              *p_xp *= (l + 1);
+
+              *p_jp += l + 1;
+              *p_kp -= l + 1;
+              *p_yp += (double)(l + 1);
+              *p_xp *= 1.0 / (l + 2);
+              // =================================================
+{
+  // the following code is here just to check __kmpc_task_reduction_get_th_data:
+  int tid = omp_get_thread_num();
+  void *addr1;
+  void *addr2;
+  addr1 = __kmpc_task_reduction_get_th_data(gtid, tg1, &xp); // from shared
+  addr2 = __kmpc_task_reduction_get_th_data(gtid, tg1, addr1); // from private
+  if (addr1 != addr2) {
+    #pragma omp atomic
+      ++err;
+    printf("Wrong thread-specific addresses %d s:%p p:%p\n", tid, addr1, addr2);
+  }
+  // from neighbour w/o taskgroup (should start lookup from current tg2)
+  if (tid > 0) {
+    if (ptrs[tid-1]) {
+      addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[tid-1]);
+      if (addr1 != addr2) {
+        #pragma omp atomic
+          ++err;
+        printf("Wrong thread-specific addresses %d s:%p n:%p\n",
+               tid, addr1, addr2);
+      }
+    }
+  } else {
+    if (ptrs[nthreads-1]) {
+      addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[nthreads-1]);
+      if (addr1 != addr2) {
+        #pragma omp atomic
+          ++err;
+        printf("Wrong thread-specific addresses %d s:%p n:%p\n",
+               tid, addr1, addr2);
+      }
+    }
+  }
+  // ----------------------------------------------
+}
+            }
+            #pragma omp task firstprivate(l)
+            // in_reduction(+:y) in_reduction(-:i,k)
+            {
+              int gtid = __kmpc_global_thread_num(NULL);
+              int *p_ip = (int*)__kmpc_task_reduction_get_th_data(
+                                    gtid, tg2, &ip);
+              long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
+                                                gtid, tg2, &kp);
+              double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
+                                          gtid, tg2, &yp);
+
+              // user's pseudo-code ==============================
+              *p_ip -= l;
+              *p_kp -= l;
+              *p_yp += (double)l;
+
+              *p_ip -= l + 1;
+              *p_kp -= l + 1;
+              *p_yp += (double)(l + 1);
+              // =================================================
+            }
+            #pragma omp task firstprivate(l)
+            // in_reduction(+:j) in_reduction(*:x)
+            {
+              int gtid = __kmpc_global_thread_num(NULL);
+              long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
+                                                gtid, tg1, &jp);
+              double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
+                                          gtid, tg1, &xp);
+              // user's pseudo-code ==============================
+              *p_jp += l;
+              *p_xp *= (l + 1);
+
+              *p_jp += l + 1;
+              *p_xp *= 1.0 / (l + 2);
+              // =================================================
+            }
+          }
+        } // inner reduction
+
+        for( int l = 0; l < N; l += 2 ) {
+          #pragma omp task firstprivate(l) // in_reduction(+:j)
+          {
+            int gtid = __kmpc_global_thread_num(NULL);
+            long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
+                                              gtid, tg1, &jp);
+            // user's pseudo-code ==============================
+            *p_jp += l;
+            *p_jp += l + 1;
+            // =================================================
+          }
+        }
+      } // outer reduction
+    } // end single
+  } // end parallel
+  // check results; the exit code (not the printed text) reports the outcome
+#if _DEBUG
+  printf("reduction flags = %u\n", FLG);
+#endif
+  if (err == 0 && ip == is && jp == js && ks == kp &&
+      fabs(xp - xs) < 0.01 && fabs(yp - ys) < 0.01) {
+    printf("passed\n");
+    return 0;
+  }
+  printf("failed,\n ser:(%d %lld %f %lld %f)\n par:(%d %lld %f %lld %f)\n",
+         is, js, xs, ks, ys, ip, jp, xp, kp, yp);
+  return 1; // value mismatch or a wrong thread-specific address was seen
+}
diff --git a/final/runtime/test/tasking/kmp_taskloop.c b/final/runtime/test/tasking/kmp_taskloop.c
new file mode 100644
index 0000000..4b13793
--- /dev/null
+++ b/final/runtime/test/tasking/kmp_taskloop.c
@@ -0,0 +1,159 @@
+// RUN: %libomp-compile-and-run
+// RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run
+#include <stdio.h>
+#include <omp.h>
+#include "omp_my_sleep.h"
+
+#define N 4
+#define GRAIN 10
+#define STRIDE 3
+
+// globals
+int th_counter[N];
+int counter;
+
+
+// Compiler-generated code (emulation)
+typedef struct ident {
+    void* dummy;
+} ident_t;
+
+typedef struct shar {
+    int(*pth_counter)[N];
+    int *pcounter;
+    int *pj;
+} *pshareds;
+
+typedef struct task {
+    pshareds shareds;
+    int(* routine)(int,struct task*);
+    int part_id;
+// privates:
+    unsigned long long lb; // library always uses ULONG
+    unsigned long long ub;
+    int st;
+    int last;
+    int i;
+    int j;
+    int th;
+} *ptask, kmp_task_t;
+
+typedef int(* task_entry_t)( int, ptask );
+
+void
+__task_dup_entry(ptask task_dst, ptask task_src, int lastpriv)
+{
+// store the lastprivate flag in the destination task (task_src is unused)
+    task_dst->last = lastpriv;
+// could be constructor calls here...
+}
+
+
+// OpenMP RTL interfaces
+typedef unsigned long long kmp_uint64;
+typedef long long kmp_int64;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+void
+__kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
+                kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+                int nogroup, int sched, kmp_int64 grainsize, void *task_dup );
+ptask
+__kmpc_omp_task_alloc( ident_t *loc, int gtid, int flags,
+                  size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+                  task_entry_t task_entry );
+void __kmpc_atomic_fixed4_add(void *id_ref, int gtid, int * lhs, int rhs);
+int  __kmpc_global_thread_num(void *id_ref);
+#ifdef __cplusplus
+}
+#endif
+
+
+// User's code
+int task_entry(int gtid, ptask task)
+{
+    pshareds pshar = task->shareds;
+    for( task->i = task->lb; task->i <= (int)task->ub; task->i += task->st ) {
+        task->th = omp_get_thread_num();
+        __kmpc_atomic_fixed4_add(NULL,gtid,pshar->pcounter,1); // counter++
+        __kmpc_atomic_fixed4_add(NULL,gtid,&((*pshar->pth_counter)[task->th]),1);
+        task->j = task->i; // remember the last iteration this task ran
+    }
+    my_sleep( 0.1 ); // sleep 100 ms in order to allow other threads to steal tasks
+    if( task->last ) { // flag stored by __task_dup_entry
+        *(pshar->pj) = task->j; // lastprivate
+    }
+    return 0;
+}
+
+int main()
+{
+    int i, j, gtid = __kmpc_global_thread_num(NULL);
+    ptask task;
+    pshareds psh;
+    omp_set_dynamic(0);
+    counter = 0;
+    for( i=0; i<N; ++i )
+        th_counter[i] = 0;
+    #pragma omp parallel num_threads(N)
+    {
+      #pragma omp master
+      {
+        int gtid = __kmpc_global_thread_num(NULL); // shadows the outer gtid
+/*
+ *  This is what the OpenMP runtime calls correspond to:
+    #pragma omp taskloop num_tasks(N) lastprivate(j)
+    for( i=0; i<N*GRAIN*STRIDE-1; i+=STRIDE )
+    {
+        int th = omp_get_thread_num();
+        #pragma omp atomic
+            counter++;
+        #pragma omp atomic
+            th_counter[th]++;
+        j = i;
+    }
+*/
+    task = __kmpc_omp_task_alloc(NULL,gtid,1,sizeof(struct task),sizeof(struct shar),&task_entry);
+    psh = task->shareds;
+    psh->pth_counter = &th_counter;
+    psh->pcounter = &counter;
+    psh->pj = &j;
+    task->lb = 0;
+    task->ub = N*GRAIN*STRIDE-2; // inclusive bound, i.e. i < N*GRAIN*STRIDE-1
+    task->st = STRIDE;
+
+    __kmpc_taskloop(
+        NULL,             // location
+        gtid,             // gtid
+        task,             // task structure
+        1,                // if clause value
+        &task->lb,        // lower bound
+        &task->ub,        // upper bound
+        STRIDE,           // loop increment
+        0,                // 1 if nogroup specified
+        2,                // schedule type: 0-none, 1-grainsize, 2-num_tasks
+        N,                // schedule value (ignored for type 0)
+        (void*)&__task_dup_entry // tasks duplication routine
+        );
+      } // end master
+    } // end parallel
+// check results
+    if( j != N*GRAIN*STRIDE-STRIDE ) { // value of i in the last iteration
+        printf("Error in lastprivate, %d != %d\n",j,N*GRAIN*STRIDE-STRIDE);
+        return 1;
+    }
+    if( counter != N*GRAIN ) { // total number of loop iterations
+        printf("Error, counter %d != %d\n",counter,N*GRAIN);
+        return 1;
+    }
+    for( i=0; i<N; ++i ) {
+        if( th_counter[i] % GRAIN ) { // per-thread count must be a multiple of GRAIN
+            printf("Error, th_counter[%d] = %d\n",i,th_counter[i]);
+            return 1;
+        }
+    }
+    printf("passed\n");
+    return 0;
+}
diff --git a/final/runtime/test/tasking/nested_parallel_tasking.c b/final/runtime/test/tasking/nested_parallel_tasking.c
new file mode 100644
index 0000000..4374d6e
--- /dev/null
+++ b/final/runtime/test/tasking/nested_parallel_tasking.c
@@ -0,0 +1,32 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <omp.h>
+
+/*
+ * This test would hang when level instead of active level
+ * used to push task state.
+ */
+
+int main()
+{
+  // Outer region uses one thread on purpose: with num_threads > 1 the test
+  // passes, i.e. the hang described above does not show up
+  #pragma omp parallel num_threads(1)
+  {
+    #pragma omp parallel
+    printf("Hello World from thread %d\n", omp_get_thread_num());
+  }
+
+  printf("omp_num_threads: %d\n", omp_get_max_threads());
+
+  #pragma omp parallel
+  {
+    #pragma omp master
+    #pragma omp task default(none)
+    {
+      printf("%d is executing this task\n", omp_get_thread_num());
+    }
+  }
+
+  printf("pass\n"); // reached only if the tasking region above completes
+  return 0;
+}
diff --git a/final/runtime/test/tasking/nested_task_creation.c b/final/runtime/test/tasking/nested_task_creation.c
new file mode 100644
index 0000000..c7c25fc
--- /dev/null
+++ b/final/runtime/test/tasking/nested_task_creation.c
@@ -0,0 +1,35 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <omp.h>
+#include "omp_my_sleep.h"
+
+/*
+ * This test creates tasks that themselves create a new task.
+ * The runtime has to take care that they are correctly freed.
+ */
+
+int main()
+{
+  #pragma omp task
+  { // case 1: nested task creation outside of any parallel region
+    #pragma omp task
+    {
+      my_sleep( 0.1 );
+    }
+  }
+
+  #pragma omp parallel num_threads(2)
+  {
+    #pragma omp single
+    #pragma omp task
+    { // case 2: one thread of a two-thread team creates the nested tasks
+      #pragma omp task
+      {
+        my_sleep( 0.1 );
+      }
+    }
+  }
+
+  printf("pass\n"); // no value is checked; reaching this line is the test
+  return 0;
+}
diff --git a/final/runtime/test/tasking/omp50_task_depend_mtx.c b/final/runtime/test/tasking/omp50_task_depend_mtx.c
new file mode 100644
index 0000000..79c270e
--- /dev/null
+++ b/final/runtime/test/tasking/omp50_task_depend_mtx.c
@@ -0,0 +1,152 @@
+// RUN: %libomp-compile-and-run
+
+// Tests OMP 5.0 task dependences "mutexinoutset", emulates compiler codegen
+// Mutually exclusive tasks get same input dependency info array
+//
+// Task tree created:
+//      task0 task1
+//         \    / \
+//         task2   task5
+//           / \
+//       task3  task4
+//       /   \
+//  task6 <-->task7  (these two are mutually exclusive)
+//       \    /
+//       task8
+//
+#include <stdio.h>
+#include <omp.h>
+
+#ifdef _WIN32
+#include <windows.h>
+#define mysleep(n) Sleep(n)
+#else
+#include <unistd.h>
+#define mysleep(n) usleep((n)*1000)
+#endif
+
+static int checker = 0; // to check if two tasks run simultaneously
+static int err = 0;
+#ifndef DELAY
+#define DELAY 100
+#endif
+
+// ---------------------------------------------------------------------------
+// internal data to emulate compiler codegen
+typedef int(*entry_t)(int, int**);
+typedef struct DEP {
+  size_t addr;
+  size_t len;
+  int flags;
+} dep;
+typedef struct ID {
+  int reserved_1;
+  int flags;
+  int reserved_2;
+  int reserved_3;
+  char *psource;
+} id;
+
+int thunk(int gtid, int** pshareds) {
+  int t = **pshareds; // task id that main() stored through **ptr
+  int th = omp_get_thread_num();
+  #pragma omp atomic
+    ++checker; // one more thunk is running
+  printf("task __%d, th %d\n", t, th);
+  if (checker != 1) { // another thunk was already running on entry
+    err++;
+    printf("Error1, checker %d != 1\n", checker);
+  }
+  mysleep(DELAY); // stay inside long enough for an overlap to be observed
+  if (checker != 1) { // another thunk entered during the sleep
+    err++;
+    printf("Error2, checker %d != 1\n", checker);
+  }
+  #pragma omp atomic
+    --checker; // this thunk is leaving
+  return 0;
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+int __kmpc_global_thread_num(id*);
+extern int** __kmpc_omp_task_alloc(id *loc, int gtid, int flags,
+                                   size_t sz, size_t shar, entry_t rtn);
+int
+__kmpc_omp_task_with_deps(id *loc, int gtid, int **task, int nd, dep *dep_lst,
+                          int nd_noalias, dep *noalias_dep_lst);
+static id loc = {0, 2, 0, 0, ";file;func;0;0;;"};
+#ifdef __cplusplus
+} // extern "C"
+#endif
+// End of internal data
+// ---------------------------------------------------------------------------
+
+int main()
+{
+  int i1,i2,i3,i4; // never read or written: used only as dependence objects
+  omp_set_num_threads(2);
+  #pragma omp parallel
+  {
+    #pragma omp single nowait
+    {
+      dep sdep[2];
+      int **ptr;
+      int gtid = __kmpc_global_thread_num(&loc);
+      int t = omp_get_thread_num();
+      #pragma omp task depend(in: i1, i2)
+      { int th = omp_get_thread_num();
+        printf("task 0_%d, th %d\n", t, th);
+        mysleep(DELAY); }
+      #pragma omp task depend(in: i1, i3)
+      { int th = omp_get_thread_num();
+        printf("task 1_%d, th %d\n", t, th);
+        mysleep(DELAY); }
+      #pragma omp task depend(in: i2) depend(out: i1)
+      { int th = omp_get_thread_num();
+        printf("task 2_%d, th %d\n", t, th);
+        mysleep(DELAY); }
+      #pragma omp task depend(in: i1)
+      { int th = omp_get_thread_num();
+        printf("task 3_%d, th %d\n", t, th);
+        mysleep(DELAY); }
+      #pragma omp task depend(out: i2)
+      { int th = omp_get_thread_num();
+        printf("task 4_%d, th %d\n", t, th);
+        mysleep(DELAY+5); } // wait a bit longer than task 3
+      #pragma omp task depend(out: i3)
+      { int th = omp_get_thread_num();
+        printf("task 5_%d, th %d\n", t, th);
+        mysleep(DELAY); }
+// compiler codegen start
+      // first mutexinoutset task (task6 in the tree above), runs thunk()
+      ptr = __kmpc_omp_task_alloc(&loc, gtid, 0, 28, 16, thunk);
+      sdep[0].addr = (size_t)&i1;
+      sdep[0].len = 0;   // not used
+      sdep[0].flags = 4; // mx
+      sdep[1].addr = (size_t)&i4;
+      sdep[1].len = 0;   // not used
+      sdep[1].flags = 4; // mx
+      **ptr = t + 10; // init single shared variable
+      __kmpc_omp_task_with_deps(&loc, gtid, ptr, 2, sdep, 0, 0);
+
+      // second mutexinoutset task (task7), reuses the same sdep array
+      ptr = __kmpc_omp_task_alloc(&loc, gtid, 0, 28, 16, thunk);
+      **ptr = t + 20; // init single shared variable
+      __kmpc_omp_task_with_deps(&loc, gtid, ptr, 2, sdep, 0, 0);
+// compiler codegen end
+      #pragma omp task depend(in: i1)
+      { int th = omp_get_thread_num();
+        printf("task 8_%d, th %d\n", t, th);
+        mysleep(DELAY); }
+    } // single
+  } // parallel
+  if (err == 0) { // err is bumped by thunk() when two thunks overlap
+    printf("passed\n");
+    return 0;
+  } else {
+    printf("failed\n");
+    return 1;
+  }
+}
diff --git a/final/runtime/test/tasking/omp50_task_depend_mtx2.c b/final/runtime/test/tasking/omp50_task_depend_mtx2.c
new file mode 100644
index 0000000..ec8a7d1
--- /dev/null
+++ b/final/runtime/test/tasking/omp50_task_depend_mtx2.c
@@ -0,0 +1,155 @@
+// RUN: %libomp-compile-and-run
+
+// Tests OMP 5.0 task dependences "mutexinoutset", emulates compiler codegen
+// Mutually exclusive tasks get input dependency info array sorted differently
+//
+// Task tree created:
+//      task0 task1
+//         \    / \
+//         task2   task5
+//           / \
+//       task3  task4
+//       /   \
+//  task6 <-->task7  (these two are mutually exclusive)
+//       \    /
+//       task8
+//
+#include <stdio.h>
+#include <omp.h>
+
+#ifdef _WIN32
+#include <windows.h>
+#define mysleep(n) Sleep(n)
+#else
+#include <unistd.h>
+#define mysleep(n) usleep((n)*1000)
+#endif
+
+static int checker = 0; // to check if two tasks run simultaneously
+static int err = 0;
+#ifndef DELAY
+#define DELAY 100
+#endif
+
+// ---------------------------------------------------------------------------
+// internal data to emulate compiler codegen
+typedef int(*entry_t)(int, int**);
+typedef struct DEP {
+  size_t addr;
+  size_t len;
+  int flags;
+} dep;
+typedef struct ID {
+  int reserved_1;
+  int flags;
+  int reserved_2;
+  int reserved_3;
+  char *psource;
+} id;
+
+int thunk(int gtid, int** pshareds) {
+  int t = **pshareds; // task id that main() stored through **ptr
+  int th = omp_get_thread_num();
+  #pragma omp atomic
+    ++checker; // one more thunk is running
+  printf("task __%d, th %d\n", t, th);
+  if (checker != 1) { // another thunk was already running on entry
+    err++;
+    printf("Error1, checker %d != 1\n", checker);
+  }
+  mysleep(DELAY); // stay inside long enough for an overlap to be observed
+  if (checker != 1) { // another thunk entered during the sleep
+    err++;
+    printf("Error2, checker %d != 1\n", checker);
+  }
+  #pragma omp atomic
+    --checker; // this thunk is leaving
+  return 0;
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+int __kmpc_global_thread_num(id*);
+extern int** __kmpc_omp_task_alloc(id *loc, int gtid, int flags,
+                                   size_t sz, size_t shar, entry_t rtn);
+int
+__kmpc_omp_task_with_deps(id *loc, int gtid, int **task, int nd, dep *dep_lst,
+                          int nd_noalias, dep *noalias_dep_lst);
+static id loc = {0, 2, 0, 0, ";file;func;0;0;;"};
+#ifdef __cplusplus
+} // extern "C"
+#endif
+// End of internal data
+// ---------------------------------------------------------------------------
+
+int main()
+{
+  int i1,i2,i3,i4; // never read or written: used only as dependence objects
+  omp_set_num_threads(2);
+  #pragma omp parallel
+  {
+    #pragma omp single nowait
+    {
+      dep sdep[2];
+      int **ptr;
+      int gtid = __kmpc_global_thread_num(&loc);
+      int t = omp_get_thread_num();
+      #pragma omp task depend(in: i1, i2)
+      { int th = omp_get_thread_num();
+        printf("task 0_%d, th %d\n", t, th);
+        mysleep(DELAY); }
+      #pragma omp task depend(in: i1, i3)
+      { int th = omp_get_thread_num();
+        printf("task 1_%d, th %d\n", t, th);
+        mysleep(DELAY); }
+      #pragma omp task depend(in: i2) depend(out: i1)
+      { int th = omp_get_thread_num();
+        printf("task 2_%d, th %d\n", t, th);
+        mysleep(DELAY); }
+      #pragma omp task depend(in: i1)
+      { int th = omp_get_thread_num();
+        printf("task 3_%d, th %d\n", t, th);
+        mysleep(DELAY); }
+      #pragma omp task depend(out: i2)
+      { int th = omp_get_thread_num();
+        printf("task 4_%d, th %d\n", t, th);
+        mysleep(DELAY+5); } // wait a bit longer than task 3
+      #pragma omp task depend(out: i3)
+      { int th = omp_get_thread_num();
+        printf("task 5_%d, th %d\n", t, th);
+        mysleep(DELAY); }
+// compiler codegen start
+      // first mutexinoutset task (task6 in the tree above), runs thunk()
+      ptr = __kmpc_omp_task_alloc(&loc, gtid, 0, 28, 16, thunk);
+      sdep[0].addr = (size_t)&i1;
+      sdep[0].len = 0;   // not used
+      sdep[0].flags = 4; // mx
+      sdep[1].addr = (size_t)&i4;
+      sdep[1].len = 0;   // not used
+      sdep[1].flags = 4; // mx
+      **ptr = t + 10; // init single shared variable
+      __kmpc_omp_task_with_deps(&loc, gtid, ptr, 2, sdep, 0, 0);
+
+      // second mutexinoutset task (task7): same objects, listed in reverse
+      ptr = __kmpc_omp_task_alloc(&loc, gtid, 0, 28, 16, thunk);
+      // reverse pointers - library should sort them uniquely
+      sdep[0].addr = (size_t)&i4;
+      sdep[1].addr = (size_t)&i1;
+      **ptr = t + 20; // init single shared variable
+      __kmpc_omp_task_with_deps(&loc, gtid, ptr, 2, sdep, 0, 0);
+// compiler codegen end
+      #pragma omp task depend(in: i1)
+      { int th = omp_get_thread_num();
+        printf("task 8_%d, th %d\n", t, th);
+        mysleep(DELAY); }
+    } // single
+  } // parallel
+  if (err == 0) { // err is bumped by thunk() when two thunks overlap
+    printf("passed\n");
+    return 0;
+  } else {
+    printf("failed\n");
+    return 1;
+  }
+}
diff --git a/final/runtime/test/tasking/omp_fill_taskqueue.c b/final/runtime/test/tasking/omp_fill_taskqueue.c
new file mode 100644
index 0000000..e95f97a
--- /dev/null
+++ b/final/runtime/test/tasking/omp_fill_taskqueue.c
@@ -0,0 +1,60 @@
+// RUN: %libomp-compile && env KMP_ENABLE_TASK_THROTTLING=0 %libomp-run
+// RUN: %libomp-compile && env KMP_ENABLE_TASK_THROTTLING=1 %libomp-run
+
+#include<omp.h>
+#include<stdlib.h>
+#include<string.h>
+
+/**
+ * Test the task throttling behavior of the runtime.
+ * Unless OMP_NUM_THREADS is 1, the master thread pushes tasks to its own tasks
+ * queue until either of the following happens:
+ *   - the task queue is full, and it starts serializing tasks
+ *   - all tasks have been pushed, and it can begin execution
+ * The idea is to create a huge number of tasks whose execution is blocked
+ * until the master thread comes to execute tasks (they need to be blocking,
+ * otherwise the second thread will start emptying the queue).
+ * At this point we can check the number of enqueued tasks: iff all tasks have
+ * been enqueued, then there was no task throttling.
+ * Otherwise there has been some sort of task throttling.
+ * If what we detect doesn't match the value of the environment variable, the
+ * test is failed.
+ */
+
+
+#define NUM_TASKS 2000
+
+
+int main()
+{
+  int i;
+  volatile int block = 1; // spinning tasks must re-read it on every pass
+  // An unset variable is handled like any value other than "1".
+  const char *env = getenv("KMP_ENABLE_TASK_THROTTLING");
+  int throttling = env != NULL && strcmp(env, "1") == 0;
+  int enqueued = 0;
+  int failed = -1; // stays negative until the master thread runs a task
+
+  #pragma omp parallel num_threads(2)
+  #pragma omp master
+  {
+    for (i = 0; i < NUM_TASKS; i++) {
+      enqueued++;
+      #pragma omp task
+      {
+        int tid = omp_get_thread_num(); // task-local: a shared tid would race
+        if (tid == 0) {
+          // As soon as the master thread starts executing task we should unlock
+          // all tasks, and detect the test failure if it has not been done yet.
+          if (failed < 0)
+            failed = throttling ? enqueued == NUM_TASKS : enqueued < NUM_TASKS;
+          block = 0;
+        }
+        while (block)
+          ;
+      }
+    }
+    block = 0;
+  }
+  return failed; // 0 when the observed queueing matches the variable
+}
diff --git a/final/runtime/test/tasking/omp_task.c b/final/runtime/test/tasking/omp_task.c
new file mode 100644
index 0000000..5703225
--- /dev/null
+++ b/final/runtime/test/tasking/omp_task.c
@@ -0,0 +1,55 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+/* Returns 1 if the tasks were executed by more than one thread, else 0. */
+int test_omp_task()
+{
+  int tids[NUM_TASKS];
+  int i;
+
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (i = 0; i < NUM_TASKS; i++) {
+        /* Copy the loop index into a block-local variable so that each task
+         * works on its own value: the shared index i may already have been
+         * advanced by the generating thread when the task finally runs.
+         */
+        int myi;
+        myi = i;
+        #pragma omp task
+        {
+          my_sleep (SLEEPTIME);
+          tids[myi] = omp_get_thread_num();
+        } /* end of omp task */
+      } /* end of for */
+    } /* end of single */
+  } /*end of parallel */
+
+  /* Now we check if more than one thread executed the tasks. */
+  for (i = 1; i < NUM_TASKS; i++) {
+    if (tids[0] != tids[i])
+      return 1;
+  }
+  return 0;
+} /* end of test_omp_task */
+
+int main()
+{
+  int rep = 0;
+  int num_failed = 0;
+
+  if (omp_get_max_threads() < 2)
+    omp_set_num_threads(8);
+
+  /* The exit status is the number of repetitions that returned 0. */
+  while (rep < REPETITIONS) {
+    num_failed += !test_omp_task();
+    rep++;
+  }
+  return num_failed;
+}
diff --git a/final/runtime/test/tasking/omp_task_final.c b/final/runtime/test/tasking/omp_task_final.c
new file mode 100644
index 0000000..b531af6
--- /dev/null
+++ b/final/runtime/test/tasking/omp_task_final.c
@@ -0,0 +1,65 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+/* Returns 1 if every final task ran on the same thread as its included task. */
+int test_omp_task_final()
+{
+  int tids[NUM_TASKS];
+  int includedtids[NUM_TASKS];
+  int i;
+  int error = 0;
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (i = 0; i < NUM_TASKS; i++) {
+        /* Copy the loop index into a block-local variable so that each task
+         * works on its own value: the shared index i may already have been
+         * advanced by the generating thread when the task finally runs.
+         */
+        int myi;
+        myi = i;
+
+        #pragma omp task final(i>=10)
+        {
+          tids[myi] = omp_get_thread_num();
+          /* we generate included tasks for final tasks */
+          if(myi >= 10) {
+            int included = myi;
+            #pragma omp task
+            {
+              my_sleep (SLEEPTIME);
+              includedtids[included] = omp_get_thread_num();
+            } /* end of omp included task of the final task */
+            my_sleep (SLEEPTIME);
+          } /* end of if it is a final task*/
+        } /* end of omp task */
+      } /* end of for */
+    } /* end of single */
+  } /*end of parallel */
+
+  /* Now we check that each final task and its included task ran on the same thread. */
+  for (i = 10; i < NUM_TASKS; i++) {
+    if (tids[i] != includedtids[i]) {
+      error++;
+    }
+  }
+  return (error==0);
+} /* end of test_omp_task_final */
+
+int main()
+{
+  int remaining;
+  int failures = 0;
+  /* The exit status is the number of repetitions that returned 0. */
+  for (remaining = REPETITIONS; remaining > 0; remaining--) {
+    if (test_omp_task_final())
+      continue;
+    failures++;
+  }
+  return failures;
+}
+
diff --git a/final/runtime/test/tasking/omp_task_firstprivate.c b/final/runtime/test/tasking/omp_task_firstprivate.c
new file mode 100644
index 0000000..d1f7c35
--- /dev/null
+++ b/final/runtime/test/tasking/omp_task_firstprivate.c
@@ -0,0 +1,51 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+int test_omp_task_firstprivate() /* returns 1 if no task computed a wrong sum */
+{
+  int i;
+  int sum = 1234; /* start value that each task's firstprivate copy should carry */
+  int known_sum;
+  int result = 0; /* counts the wrong sums from tasks */
+
+  known_sum = 1234 + (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; /* 1234 + 0+1+...+LOOPCOUNT */
+
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (i = 0; i < NUM_TASKS; i++) {
+        #pragma omp task firstprivate(sum)
+        {
+          int j;
+          for (j = 0; j <= LOOPCOUNT; j++) {
+            #pragma omp flush
+            sum += j; /* accumulates into the task's own copy of sum */
+          }
+
+          /* check if calculated sum was right */
+          if (sum != known_sum) {
+            #pragma omp critical
+            { result++; }
+          }
+        } /* omp task */
+      } /* for loop */
+    } /* omp single */
+  } /* omp parallel */
+  return (result == 0);
+}
+
+int main()
+{
+  int i;
+  int num_failed=0;
+
+  for(i = 0; i < REPETITIONS; i++) {
+    if(!test_omp_task_firstprivate()) {
+      num_failed++;
+    }
+  }
+  return num_failed;
+}
diff --git a/final/runtime/test/tasking/omp_task_if.c b/final/runtime/test/tasking/omp_task_if.c
new file mode 100644
index 0000000..8b4728e
--- /dev/null
+++ b/final/runtime/test/tasking/omp_task_if.c
@@ -0,0 +1,43 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+int test_omp_task_if() /* returns the result flag computed inside the task */
+{
+  int condition_false;
+  int count;
+  int result;
+
+  count=0;
+  condition_false = (count == 1); /* always 0 here, since count was just set to 0 */
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      #pragma omp task if (condition_false) shared(count, result)
+      {
+        my_sleep (SLEEPTIME);
+        #pragma omp critical
+        result = (0 == count); /* 1 only if count has not been set to 1 yet */
+      } /* end of omp task */
+      #pragma omp critical
+      count = 1; /* reached by the generating thread after the task construct */
+    } /* end of single */
+  } /*end of parallel */
+  return result;
+}
+
+int main()
+{
+  int i;
+  int num_failed=0;
+
+  for(i = 0; i < REPETITIONS; i++) {
+    if(!test_omp_task_if()) {
+      num_failed++;
+    }
+  }
+  return num_failed;
+}
diff --git a/final/runtime/test/tasking/omp_task_imp_firstprivate.c b/final/runtime/test/tasking/omp_task_imp_firstprivate.c
new file mode 100644
index 0000000..905ab9a
--- /dev/null
+++ b/final/runtime/test/tasking/omp_task_imp_firstprivate.c
@@ -0,0 +1,47 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+/* Checks that i is implicitly firstprivate in the tasks; returns 1 on success. */
+int test_omp_task_imp_firstprivate()
+{
+  int i=5;
+  int k = 0;
+  int result = 0;
+  int task_result = 1; /* cleared by any task that does not see i == 5 */
+  #pragma omp parallel firstprivate(i)
+  {
+    #pragma omp single
+    {
+      for (k = 0; k < NUM_TASKS; k++) {
+        #pragma omp task shared(result , task_result)
+        {
+          int j;
+          //every task must start from i == 5, whatever other tasks did to their i
+          if(i != 5)
+            task_result = 0;
+          for(j = 0; j < NUM_TASKS; j++)
+            i++;
+          //i appears in no clause of the task: it should be firstprivate implicitly
+        }
+      }
+      #pragma omp taskwait
+      result = (task_result && i==5); /* the generating thread's i must be untouched */
+    }
+  }
+  return result;
+}
+
+int main()
+{
+  int i;
+  int num_failed=0;
+
+  for(i = 0; i < REPETITIONS; i++) {
+    if(!test_omp_task_imp_firstprivate()) {
+      num_failed++;
+    }
+  }
+  return num_failed;
+}
diff --git a/final/runtime/test/tasking/omp_task_priority.c b/final/runtime/test/tasking/omp_task_priority.c
new file mode 100644
index 0000000..7b62360
--- /dev/null
+++ b/final/runtime/test/tasking/omp_task_priority.c
@@ -0,0 +1,22 @@
+// RUN: %libomp-compile && env OMP_MAX_TASK_PRIORITY=42 %libomp-run
+// Test OMP 4.5 task priorities
+// Currently only the API function and environment variable parsing exist.
+// The RUN line sets OMP_MAX_TASK_PRIORITY=42, which is the value tested below.
+#include <stdio.h>
+#include <omp.h>
+
+int main (void) {
+    /* The value reported by the API is compared against the expected 42. */
+    const int expected = 42;
+    const int matches = (omp_get_max_task_priority() == expected);
+
+    printf("Got %d\n", omp_get_max_task_priority());
+    if (!matches) {
+       printf("failed\n");
+       return 1;
+    }
+
+    printf("passed\n");
+    return 0;
+}
+
diff --git a/final/runtime/test/tasking/omp_task_private.c b/final/runtime/test/tasking/omp_task_private.c
new file mode 100644
index 0000000..7a93716
--- /dev/null
+++ b/final/runtime/test/tasking/omp_task_private.c
@@ -0,0 +1,53 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+/* Checks that private(sum) gives each task its own sum; returns 1 on success. */
+int test_omp_task_private()
+{
+  int i;
+  int known_sum;
+  int sum = 0;
+  int result = 0; /* counts the wrong sums from tasks */
+
+  known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; /* 0 + 1 + ... + LOOPCOUNT */
+
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (i = 0; i < NUM_TASKS; i++) {
+        #pragma omp task private(sum) shared(result, known_sum)
+        {
+          int j;
+          //sum is private to the task here; give the copy a defined start value
+          sum = 0;
+          for (j = 0; j <= LOOPCOUNT; j++) {
+            #pragma omp flush
+            sum += j;
+          }
+          /* check if calculated sum was right */
+          if (sum != known_sum) {
+            #pragma omp critical
+            result++;
+          }
+        } /* end of omp task */
+      } /* end of for */
+    } /* end of single */
+  } /* end of parallel*/
+  return (result == 0);
+}
+
+int main()
+{
+  int i;
+  int num_failed=0;
+
+  for(i = 0; i < REPETITIONS; i++) {
+    if(!test_omp_task_private()) {
+      num_failed++;
+    }
+  }
+  return num_failed;
+}
diff --git a/final/runtime/test/tasking/omp_task_shared.c b/final/runtime/test/tasking/omp_task_shared.c
new file mode 100644
index 0000000..0304026
--- /dev/null
+++ b/final/runtime/test/tasking/omp_task_shared.c
@@ -0,0 +1,41 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+/* Checks that all tasks increment the same shared i; returns 1 on success. */
+int test_omp_task_imp_shared()
+{
+  int i;
+  int k = 0;
+  int result = 0;
+  i=0;
+
+  #pragma omp parallel
+  {
+    #pragma omp single
+    for (k = 0; k < NUM_TASKS; k++) {
+      #pragma omp task shared(i)
+      {
+        #pragma omp atomic
+        i++;
+        //i is shared through the clause above, so every increment must be visible
+      }
+    }
+  }
+  result = i; /* one increment per task is expected */
+  return ((result == NUM_TASKS));
+}
+
+int main()
+{
+  int i;
+  int num_failed=0;
+
+  for(i = 0; i < REPETITIONS; i++) {
+    if(!test_omp_task_imp_shared()) {
+      num_failed++;
+    }
+  }
+  return num_failed;
+}
diff --git a/final/runtime/test/tasking/omp_taskloop_grainsize.c b/final/runtime/test/tasking/omp_taskloop_grainsize.c
new file mode 100644
index 0000000..0833073
--- /dev/null
+++ b/final/runtime/test/tasking/omp_taskloop_grainsize.c
@@ -0,0 +1,113 @@
+// RUN: %libomp-compile-and-run
+// RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run
+
+// These compilers don't support the taskloop construct
+// UNSUPPORTED: gcc-4, gcc-5, icc-16
+// GCC 6 has support for taskloops, but at least 6.3.0 is crashing on this test
+// UNSUPPORTED: gcc-6
+
+/*
+ * Test for taskloop
+ * Method: calculate how many times the iteration space is dispatched
+ *     and judge if each dispatch has the requested grainsize
+ * It is possible for two adjacent chunks to be executed by the same thread
+ */
+#include <stdio.h>
+#include <omp.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+
+#define CFDMAX_SIZE 1120
+
+int test_omp_taskloop_grainsize()
+{
+  int result = 0;
+  int i, grainsize, count, tmp_count, num_off;
+  int *tmp, *tids, *tidsArray;
+
+  tidsArray = (int *)malloc(sizeof(int) * CFDMAX_SIZE);
+  tids = tidsArray;
+
+  for (grainsize = 1; grainsize < 48; ++grainsize) {
+    fprintf(stderr, "Grainsize %d\n", grainsize);
+    count = tmp_count = num_off = 0;
+
+    for (i = 0; i < CFDMAX_SIZE; ++i) {
+      tids[i] = -1;
+    }
+
+    #pragma omp parallel shared(tids)
+    {
+      #pragma omp master
+      #pragma omp taskloop grainsize(grainsize)
+      for (i = 0; i < CFDMAX_SIZE; i++) {
+        tids[i] = omp_get_thread_num();
+      }
+    }
+
+    for (i = 0; i < CFDMAX_SIZE; ++i) {
+      if (tids[i] == -1) {
+        fprintf(stderr, "  Iteration %d not touched!\n", i);
+        result++;
+      }
+    }
+
+    for (i = 0; i < CFDMAX_SIZE - 1; ++i) {
+      if (tids[i] != tids[i + 1]) {
+        count++;
+      }
+    }
+
+    tmp = (int *)malloc(sizeof(int) * (count + 1));
+    tmp[0] = 1;
+
+    for (i = 0; i < CFDMAX_SIZE - 1; ++i) {
+      if (tmp_count > count) {
+        printf("--------------------\nTestinternal Error: List too "
+               "small!!!\n--------------------\n");
+        break;
+      }
+      if (tids[i] != tids[i + 1]) {
+        tmp_count++;
+        tmp[tmp_count] = 1;
+      } else {
+        tmp[tmp_count]++;
+      }
+    }
+
+    // is grainsize statement working?
+    int num_tasks = CFDMAX_SIZE / grainsize;
+    int multiple1 = CFDMAX_SIZE / num_tasks;
+    int multiple2 = CFDMAX_SIZE / num_tasks + 1;
+    for (i = 0; i < count; i++) {
+      // it is possible for 2 adjacent chunks assigned to a same thread
+      if (tmp[i] % multiple1 != 0 && tmp[i] % multiple2 != 0) {
+        num_off++;
+      }
+    }
+
+    if (num_off > 1) {
+      fprintf(stderr, "  The number of bad chunks is %d\n", num_off);
+      result++;
+    } else {
+      fprintf(stderr, "  Everything ok\n");
+    }
+
+    free(tmp);
+  }
+  free(tidsArray);
+  return (result==0);
+}
+
+int main()
+{
+  int i;
+  int num_failed=0;
+
+  for (i = 0; i < REPETITIONS; i++) {
+    if (!test_omp_taskloop_grainsize()) {
+      num_failed++;
+    }
+  }
+  return num_failed;
+}
diff --git a/final/runtime/test/tasking/omp_taskloop_num_tasks.c b/final/runtime/test/tasking/omp_taskloop_num_tasks.c
new file mode 100644
index 0000000..75cc337
--- /dev/null
+++ b/final/runtime/test/tasking/omp_taskloop_num_tasks.c
@@ -0,0 +1,77 @@
+// This test is known to be fragile on NetBSD kernel at the moment.
+// UNSUPPORTED: netbsd
+// RUN: %libomp-compile-and-run
+// RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run
+
+// These compilers don't support the taskloop construct
+// UNSUPPORTED: gcc-4, gcc-5, icc-16
+
+// This test is known to be fragile on NetBSD kernel at the moment,
+// https://bugs.llvm.org/show_bug.cgi?id=42020.
+// UNSUPPORTED: netbsd
+
+/*
+ * Test for taskloop
+ * Method: count how many times the executing thread changes between adjacent
+ *     iterations and check that this does not exceed the requested num_tasks
+ * It is possible for two adjacent chunks to be executed by the same thread
+ */
+#include <stdio.h>
+#include <omp.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+
+#define CFDMAX_SIZE 1120
+
+/* Runs a taskloop with num_tasks(1..119); returns 1 if no run shows more thread
+ * changes between adjacent iterations than the requested number of tasks. */
+int test_omp_taskloop_num_tasks()
+{
+  int i, count, num_tasks;
+  int *tids, *tidsArray;
+  int result = 0;
+
+  for (num_tasks = 1; num_tasks < 120; ++num_tasks) {
+    count = 0;
+    tidsArray = (int *)malloc(sizeof(int) * CFDMAX_SIZE);
+    tids = tidsArray;
+
+    #pragma omp parallel shared(tids)
+    {
+      int i;
+      #pragma omp master
+      #pragma omp taskloop num_tasks(num_tasks)
+      for (i = 0; i < CFDMAX_SIZE; i++) {
+        tids[i] = omp_get_thread_num();
+      }
+    }
+
+    for (i = 0; i < CFDMAX_SIZE - 1; ++i) {
+      if (tids[i] != tids[i + 1]) {
+        count++;
+      }
+    }
+
+    if (count > num_tasks) {
+      fprintf(stderr, "counted too many tasks: (wanted %d, got %d)\n",
+              num_tasks, count);
+      result++;
+    }
+    free(tidsArray); /* allocated anew for every num_tasks value */
+  }
+
+  return (result==0);
+}
+
+int main()
+{
+  int i;
+  int num_failed=0;
+
+  for (i = 0; i < REPETITIONS; i++) {
+    if (!test_omp_taskloop_num_tasks()) {
+      num_failed++;
+    }
+  }
+  return num_failed;
+}
diff --git a/final/runtime/test/tasking/omp_taskwait.c b/final/runtime/test/tasking/omp_taskwait.c
new file mode 100644
index 0000000..584eceb
--- /dev/null
+++ b/final/runtime/test/tasking/omp_taskwait.c
@@ -0,0 +1,78 @@
+// RUN: %libomp-compile-and-run
+
+// This test is known to be fragile on NetBSD kernel at the moment,
+// https://bugs.llvm.org/show_bug.cgi?id=42020.
+// UNSUPPORTED: netbsd
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+int test_omp_taskwait()
+{
+  int result1 = 0;   /* Stores number of not finished tasks after the taskwait */
+  int result2 = 0;   /* Stores number of wrong array elements at the end */
+  int array[NUM_TASKS];
+  int i;
+
+  /* fill array */
+  for (i = 0; i < NUM_TASKS; i++)
+    array[i] = 0;
+
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (i = 0; i < NUM_TASKS; i++) {
+        /* First we have to store the value of the loop index in a new variable
+         * which will be private for each task because otherwise it will be overwritten
+         * if the execution of the task takes longer than the time which is needed to
+         * enter the next step of the loop!
+         */
+        int myi;
+        myi = i;
+        #pragma omp task
+        {
+          my_sleep (SLEEPTIME);
+          array[myi] = 1;
+        } /* end of omp task */
+      } /* end of for */
+      #pragma omp taskwait
+      /* check if all tasks were finished */
+      for (i = 0; i < NUM_TASKS; i++)
+        if (array[i] != 1)
+          result1++;
+
+      /* generate some more tasks which now shall overwrite
+       * the values in the array */
+      for (i = 0; i < NUM_TASKS; i++) {
+        int myi;
+        myi = i;
+        #pragma omp task
+        {
+          array[myi] = 2;
+        } /* end of omp task */
+      } /* end of for */
+    } /* end of single */
+  } /*end of parallel */
+
+  /* final check, if all array elements contain the right values: */
+  for (i = 0; i < NUM_TASKS; i++) {
+    if (array[i] != 2)
+      result2++;
+  }
+  return ((result1 == 0) && (result2 == 0));
+}
+
+int main()
+{
+  int i;
+  int num_failed=0;
+
+  for(i = 0; i < REPETITIONS; i++) {
+    if(!test_omp_taskwait()) {
+      num_failed++;
+    }
+  }
+  return num_failed;
+}
diff --git a/final/runtime/test/tasking/omp_taskyield.c b/final/runtime/test/tasking/omp_taskyield.c
new file mode 100644
index 0000000..7f85413
--- /dev/null
+++ b/final/runtime/test/tasking/omp_taskyield.c
@@ -0,0 +1,61 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+int test_omp_taskyield()
+{
+  int i;
+  int count = 0;
+  int start_tid[NUM_TASKS];
+  int current_tid[NUM_TASKS];
+
+  for (i=0; i< NUM_TASKS; i++) {
+    start_tid[i]=0;
+    current_tid[i]=0;
+  }
+
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (i = 0; i < NUM_TASKS; i++) {
+        int myi = i;
+        #pragma omp task untied
+        {
+          my_sleep(SLEEPTIME);
+          start_tid[myi] = omp_get_thread_num();
+          #pragma omp taskyield
+          if((start_tid[myi] %2) ==0){
+            my_sleep(SLEEPTIME);
+            current_tid[myi] = omp_get_thread_num();
+          } /*end of if*/
+        } /* end of omp task */
+      } /* end of for */
+    } /* end of single */
+  } /* end of parallel */
+  for (i=0;i<NUM_TASKS; i++) {
+    //printf("start_tid[%d]=%d, current_tid[%d]=%d\n",
+      //i, start_tid[i], i , current_tid[i]);
+    if (current_tid[i] == start_tid[i])
+      count++;
+  }
+  return (count<NUM_TASKS);
+}
+
+int main()
+{
+  int i;
+  int num_failed=0;
+
+  if (omp_get_max_threads() < 2)
+    omp_set_num_threads(8);
+
+  for(i = 0; i < REPETITIONS; i++) {
+    if(!test_omp_taskyield()) {
+      num_failed++;
+    }
+  }
+  return num_failed;
+}
diff --git a/final/runtime/test/threadprivate/omp_threadprivate.c b/final/runtime/test/threadprivate/omp_threadprivate.c
new file mode 100644
index 0000000..a3dd80d
--- /dev/null
+++ b/final/runtime/test/threadprivate/omp_threadprivate.c
@@ -0,0 +1,102 @@
+// RUN: %libomp-compile-and-run
+/*
+ * Threadprivate is tested in 2 ways:
+ * 1. The global variable declared as threadprivate should have
+ *  local copy for each thread. Otherwise race condition and
+ *  wrong result.
+ * 2. If the value of local copy is retained for the two adjacent
+ *  parallel regions
+ */
+#include "omp_testsuite.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+static int sum0=0;
+static int myvalue = 0;
+
+#pragma omp threadprivate(sum0)
+#pragma omp threadprivate(myvalue)
+
+int test_omp_threadprivate()
+{
+  int sum = 0;
+  int known_sum;
+  int i;
+  int iter;
+  int *data;
+  int size;
+  int num_failed = 0;
+  int my_random;
+  omp_set_dynamic(0);
+
+  #pragma omp parallel private(i)
+  {
+    sum0 = 0;
+    #pragma omp for
+    for (i = 1; i <= LOOPCOUNT; i++) {
+      sum0 = sum0 + i;
+    } /*end of for*/
+    #pragma omp critical
+    {
+      sum = sum + sum0;
+    } /*end of critical */
+  } /* end of parallel */
+  known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+  if (known_sum != sum ) {
+    fprintf (stderr, " known_sum = %d, sum = %d\n", known_sum, sum);
+  }
+
+  /* the next parallel region is just used to get the number of threads*/
+  omp_set_dynamic(0);
+  #pragma omp parallel
+  {
+    #pragma omp master
+    {
+      size=omp_get_num_threads();
+      data=(int*) malloc(size*sizeof(int));
+    }
+  }/* end parallel*/
+
+  srand(45);
+  for (iter = 0; iter < 100; iter++) {
+    my_random = rand(); /* random number generator is
+                 called inside serial region*/
+
+    /* the first parallel region is used to initialize myvalue
+       and the array with my_random+rank */
+    #pragma omp parallel
+    {
+      int rank;
+      rank = omp_get_thread_num ();
+      myvalue = data[rank] = my_random + rank;
+    }
+
+    /* the second parallel region verifies that the
+       value of "myvalue" is retained */
+    #pragma omp parallel reduction(+:num_failed)
+    {
+      int rank;
+      rank = omp_get_thread_num ();
+      num_failed = num_failed + (myvalue != data[rank]);
+      if(myvalue != data[rank]) {
+        fprintf (stderr, " myvalue = %d, data[rank]= %d\n",
+          myvalue, data[rank]);
+      }
+    }
+  }
+  free (data);
+  return (known_sum == sum) && !num_failed;
+} /* end of check_threadprivate*/
+
+int main()
+{
+  int i;
+  int num_failed=0;
+
+  for(i = 0; i < REPETITIONS; i++) {
+    if(!test_omp_threadprivate()) {
+      num_failed++;
+    }
+  }
+  return num_failed;
+}
diff --git a/final/runtime/test/threadprivate/omp_threadprivate_for.c b/final/runtime/test/threadprivate/omp_threadprivate_for.c
new file mode 100644
index 0000000..3342e63
--- /dev/null
+++ b/final/runtime/test/threadprivate/omp_threadprivate_for.c
@@ -0,0 +1,48 @@
+// RUN: %libomp-compile-and-run
+#include "omp_testsuite.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+static int i;
+#pragma omp threadprivate(i)
+
+int test_omp_threadprivate_for()
+{
+  int known_sum;
+  int sum;
+
+  known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+  sum = 0;
+
+  #pragma omp parallel
+  {
+    int sum0 = 0, i0;
+    #pragma omp for
+    for (i0 = 1; i0 <= LOOPCOUNT; i0++) {
+      i = i0;
+      sum0 = sum0 + i;
+    }
+    #pragma omp critical
+    {
+      sum = sum + sum0;
+    }
+  } /* end of parallel */
+
+  if (known_sum != sum ) {
+    fprintf(stderr, " known_sum = %d, sum = %d\n", known_sum, sum);
+  }
+  return (known_sum == sum);
+} /* end of check_threadprivate*/
+
+int main()
+{
+  int i;
+  int num_failed=0;
+
+  for(i = 0; i < REPETITIONS; i++) {
+    if(!test_omp_threadprivate_for()) {
+      num_failed++;
+    }
+  }
+  return num_failed;
+}
diff --git a/final/runtime/test/worksharing/for/bug_set_schedule_0.c b/final/runtime/test/worksharing/for/bug_set_schedule_0.c
new file mode 100644
index 0000000..889e239
--- /dev/null
+++ b/final/runtime/test/worksharing/for/bug_set_schedule_0.c
@@ -0,0 +1,40 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <omp.h>
+#include "omp_testsuite.h"
+
+/* Test that the chunk size is set to default (1) when
+   chunk size <= 0 is specified */
+int a = 0;
+
+int test_set_schedule_0() /* returns 1 if a was incremented exactly 10 times */
+{
+  int i;
+  a = 0;
+  omp_set_schedule(omp_sched_dynamic,0); /* chunk size 0 is the case under test */
+
+  #pragma omp parallel
+  {
+    #pragma omp for schedule(runtime)
+    for(i = 0; i < 10; i++) {
+      #pragma omp atomic
+      a++;
+      if(a > 10) /* NOTE(review): this read of a is not covered by the atomic */
+        exit(1); /* more increments than loop iterations: abort right away */
+    }
+  }
+  return a==10;
+}
+
+int main()
+{
+  int i;
+  int num_failed=0;
+
+  for(i = 0; i < REPETITIONS; i++) {
+    if(!test_set_schedule_0()) {
+      num_failed++;
+    }
+  }
+  return num_failed;
+}
diff --git a/final/runtime/test/worksharing/for/kmp_doacross_check.c b/final/runtime/test/worksharing/for/kmp_doacross_check.c
new file mode 100644
index 0000000..59b61e3
--- /dev/null
+++ b/final/runtime/test/worksharing/for/kmp_doacross_check.c
@@ -0,0 +1,62 @@
+// RUN: %libomp-compile-and-run
+// UNSUPPORTED: gcc
+// This test is incompatible with gcc because of the explicit call to
+// __kmpc_doacross_fini().  gcc relies on an implicit call to this function
+// when the last iteration is executed inside the GOMP_loop_*_next() functions.
+// Hence, in gcc, having the explicit call leads to __kmpc_doacross_fini()
+// being called twice.
+#include <stdio.h>
+
+#define N   1000
+
+struct dim {
+  long long lo; // lower
+  long long up; // upper
+  long long st; // stride
+};
+extern void __kmpc_doacross_init(void*, int, int, struct dim *);
+extern void __kmpc_doacross_wait(void*, int, long long*);
+extern void __kmpc_doacross_post(void*, int, long long*);
+extern void __kmpc_doacross_fini(void*, int);
+extern int __kmpc_global_thread_num(void*);
+
+int main()
+{
+  int i;
+  int iter[N];
+  struct dim dims;
+  for( i = 0; i < N; ++i )
+    iter[i] = 1;
+  dims.lo = 1;
+  dims.up = N-1;
+  dims.st = 1;
+  #pragma omp parallel num_threads(4)
+  {
+    int i, gtid;
+    long long vec;
+    gtid = __kmpc_global_thread_num(NULL);
+    __kmpc_doacross_init(NULL,gtid,1,&dims); // thread starts the loop
+    #pragma omp for nowait schedule(dynamic)
+    for( i = 1; i < N; ++i )
+    {
+      // runtime call corresponding to #pragma omp ordered depend(sink:i-1)
+      vec=i-1;
+      __kmpc_doacross_wait(NULL,gtid,&vec);
+      // user's code
+      iter[i] = iter[i-1] + 1;
+      // runtime call corresponding to #pragma omp ordered depend(source)
+      vec=i;
+      __kmpc_doacross_post(NULL,gtid,&vec);
+    }
+    // thread finishes the loop (should be before the loop barrier)
+    __kmpc_doacross_fini(NULL,gtid);
+  }
+  if( iter[N-1] == N ) {
+    printf("passed\n");
+  } else {
+    printf("failed %d != %d\n", iter[N-1], N);
+    return 1;
+  }
+  return 0;
+}
+
diff --git a/final/runtime/test/worksharing/for/kmp_sch_simd_guided.c b/final/runtime/test/worksharing/for/kmp_sch_simd_guided.c
new file mode 100644
index 0000000..5c6f94b
--- /dev/null
+++ b/final/runtime/test/worksharing/for/kmp_sch_simd_guided.c
@@ -0,0 +1,410 @@
+// RUN: %libomp-compile-and-run
+/*
+  Test for the 'schedule(simd:guided)' clause.
+  The compiler is expected to generate dynamic dispatch calls and to pass the
+  schedule value 46 to the OpenMP RTL. The test exercises numerous
+  combinations of loop parameters.
+*/
+#include <stdio.h>
+#include <omp.h>
+
+#if defined(WIN32) || defined(_WIN32)
+#include <windows.h>
+#define delay() Sleep(1);
+#else
+#include <unistd.h>
+#define delay() usleep(10);
+#endif
+
+// uncomment for debug diagnostics:
+//#define DEBUG
+
+#define SIMD_LEN 4
+
+// ---------------------------------------------------------------------------
+// Various definitions copied from OpenMP RTL
+// Schedule kinds passed as the third argument of __kmpc_dispatch_init_*().
+// The values are copied from the runtime and are meant to match it.
+enum sched {
+  kmp_sch_static_balanced_chunked = 45,
+  kmp_sch_guided_simd = 46,
+  kmp_sch_runtime_simd = 47,
+};
+// Short integer aliases; i64/u64 are used by the 64-bit loop variant.
+typedef unsigned u32;
+typedef long long i64;
+typedef unsigned long long u64;
+// Source-location descriptor handed to every __kmpc_* call in this file.
+// NOTE(review): presumably mirrors the runtime's ident_t layout -- confirm.
+typedef struct {
+  int reserved_1;
+  int flags;
+  int reserved_2;
+  int reserved_3;
+  char *psource;
+} id;
+
+// Runtime entry points this test calls by hand.
+extern int __kmpc_global_thread_num(id*);
+extern void __kmpc_barrier(id*, int gtid);
+extern void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int);
+extern void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64);
+extern int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*);
+extern int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*);
+// End of definitions copied from OpenMP RTL.
+// ---------------------------------------------------------------------------
+// Single location descriptor shared by all runtime calls below.
+static id loc = {0, 2, 0, 0, ";file;func;0;0;;"};
+
+// ---------------------------------------------------------------------------
+// Exercise __kmpc_dispatch_init_8()/__kmpc_dispatch_next_8() with the
+// kmp_sch_guided_simd schedule for one loop and validate the chunks handed out.
+//
+// Thread 0 drains every chunk and checks its bounds, stride and size; all
+// other threads wait until it is done and then verify that the dispatcher
+// reports no remaining work.  The function spins on a shared flag and calls
+// __kmpc_barrier(), so every thread of the team has to call it with the same
+// arguments.
+//
+//   loop_lb, loop_ub  loop bounds (both inclusive)
+//   loop_st           loop stride; may be negative
+//   loop_chunk        chunk value passed to the dispatcher
+//
+// Returns the number of failed checks seen by the calling thread.  Degenerate
+// loops (zero stride, or bounds that contradict the stride sign) are skipped
+// and yield 0.
+int run_loop_64(i64 loop_lb, i64 loop_ub, i64 loop_st, int loop_chunk) {
+  int err = 0;
+  static int volatile loop_sync = 0;
+  i64 lb;   // Chunk lower bound
+  i64 ub;   // Chunk upper bound
+  i64 st;   // Chunk stride
+  int rc;
+  int tid = omp_get_thread_num();
+  int gtid = tid;
+  int last;
+  // #ifdef rather than #if: the file enables diagnostics with a bare
+  // "#define DEBUG", which "#if DEBUG" cannot evaluate.
+#ifdef DEBUG
+  printf("run_loop_<%d>(gtid=%d, tid=%d, lb=%d, ub=%d, st=%d, ch=%d)\n",
+    (int)sizeof(i64), gtid, tid,
+    (int)loop_lb, (int)loop_ub, (int)loop_st, loop_chunk);
+#endif
+  // Don't test degenerate cases that should have been discovered by codegen
+  if (loop_st == 0)
+    return 0;
+  if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
+    return 0;
+
+  __kmpc_dispatch_init_8(&loc, gtid, kmp_sch_guided_simd,
+                         loop_lb, loop_ub, loop_st, loop_chunk);
+  if (tid == 0) {
+    // Let the master thread handle the chunks alone
+    int chunk = 0;            // No of current chunk
+    i64 next_lb = loop_lb;    // Lower bound of the next chunk
+    // Upper bound of the last processed chunk.  Starts at loop_ub so that the
+    // final checks never read an indeterminate value when no chunk arrives;
+    // that situation is reported by the "at least one chunk" check.
+    i64 last_ub = loop_ub;
+    u64 cur;                  // Number of iterations in current chunk
+    // Max allowed iterations for current chunk.
+    // The first chunk can consume all iterations
+    u64 max = (loop_ub - loop_lb) / loop_st + 1;
+    int undersized = 0;
+
+    while (__kmpc_dispatch_next_8(&loc, gtid, &last, &lb, &ub, &st)) {
+      ++chunk;
+#ifdef DEBUG
+      printf("chunk=%d, lb=%d, ub=%d\n", chunk, (int)lb, (int)ub);
+#endif
+      // Check if previous chunk (it is not the final chunk) is undersized
+      if (undersized) {
+        printf("Error with chunk %d\n", chunk);
+        err++;
+      }
+      // Check lower and upper bounds
+      if (lb != next_lb) {
+        printf("Error with lb %d, %d, ch %d\n", (int)lb, (int)next_lb, chunk);
+        err++;
+      }
+      if (loop_st > 0) {
+        if (!(ub <= loop_ub)) {
+          printf("Error with ub %d, %d, ch %d\n", (int)ub, (int)loop_ub, chunk);
+          err++;
+        }
+        if (!(lb <= ub)) {
+          printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk);
+          err++;
+        }
+      } else {
+        if (!(ub >= loop_ub)) {
+          printf("Error with ub %d, %d, %d\n", (int)ub, (int)loop_ub, chunk);
+          err++;
+        }
+        if (!(lb >= ub)) {
+          printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk);
+          err++;
+        }
+      }
+      // Stride should not change
+      if (!(st == loop_st)) {
+        printf("Error with st %d, %d, ch %d\n", (int)st, (int)loop_st, chunk);
+        err++;
+      }
+      cur = (ub - lb) / loop_st + 1;
+      // Guided scheduling uses FP computations, so current chunk may
+      // be a bit bigger (+1) than allowed maximum
+      if (!(cur <= max + 1)) {
+        // cur and max are 64-bit; narrow them explicitly to match %d
+        printf("Error with iter %d, %d\n", (int)cur, (int)max);
+        err++;
+      }
+      // Update maximum for the next chunk
+      if (cur < max)
+        max = cur;
+      next_lb = ub + loop_st;
+      last_ub = ub;
+      undersized = (cur < loop_chunk);
+    }
+    // Must have at least one chunk
+    if (!(chunk > 0)) {
+      printf("Error with chunk %d\n", chunk);
+      err++;
+    }
+    // Must have the right last iteration index
+    if (loop_st > 0) {
+      if (!(last_ub <= loop_ub)) {
+        printf("Error with last1 %d, %d, ch %d\n",
+               (int)last_ub, (int)loop_ub, chunk);
+        err++;
+      }
+      if (!(last_ub + loop_st > loop_ub)) {
+        printf("Error with last2 %d, %d, %d, ch %d\n",
+               (int)last_ub, (int)loop_st, (int)loop_ub, chunk);
+        err++;
+      }
+    } else {
+      if (!(last_ub >= loop_ub)) {
+        printf("Error with last1 %d, %d, ch %d\n",
+               (int)last_ub, (int)loop_ub, chunk);
+        err++;
+      }
+      if (!(last_ub + loop_st < loop_ub)) {
+        printf("Error with last2 %d, %d, %d, ch %d\n",
+               (int)last_ub, (int)loop_st, (int)loop_ub, chunk);
+        err++;
+      }
+    }
+    // Let non-master threads go
+    loop_sync = 1;
+  } else {
+    int i;
+    // Workers wait for master thread to finish, then call __kmpc_dispatch_next.
+    // Spin briefly first, then fall back to sleeping between polls.
+    for (i = 0; i < 1000000; ++i) {
+      if (loop_sync != 0) {
+        break;
+      }
+    }
+    while (loop_sync == 0) {
+      delay();
+    }
+    // At this moment we do not have any more chunks -- all the chunks already
+    // processed by master thread
+    rc = __kmpc_dispatch_next_8(&loc, gtid, &last, &lb, &ub, &st);
+    if (rc) {
+      printf("Error return value\n");
+      err++;
+    }
+  }
+
+  // First barrier: every worker has observed loop_sync != 0.  Second barrier:
+  // nobody leaves before the flag is reset for the next call.
+  __kmpc_barrier(&loc, gtid);
+  if (tid == 0) {
+    loop_sync = 0;    // Restore original state
+#ifdef DEBUG
+    printf("run_loop_64(): at the end\n");
+#endif
+  }
+  __kmpc_barrier(&loc, gtid);
+  return err;
+} // run_loop
+
+// ---------------------------------------------------------------------------
+// Exercise __kmpc_dispatch_init_4()/__kmpc_dispatch_next_4() with the
+// kmp_sch_guided_simd schedule for one loop and validate the chunks handed out.
+//
+// Thread 0 drains every chunk and checks its bounds, stride and size; all
+// other threads wait until it is done and then verify that the dispatcher
+// reports no remaining work.  The function spins on a shared flag and calls
+// __kmpc_barrier(), so every thread of the team has to call it with the same
+// arguments.
+//
+//   loop_lb, loop_ub  loop bounds (both inclusive)
+//   loop_st           loop stride; may be negative
+//   loop_chunk        chunk value passed to the dispatcher
+//
+// Returns the number of failed checks seen by the calling thread.  Degenerate
+// loops (zero stride, or bounds that contradict the stride sign) are skipped
+// and yield 0.
+int run_loop_32(int loop_lb, int loop_ub, int loop_st, int loop_chunk) {
+  int err = 0;
+  static int volatile loop_sync = 0;
+  int lb;   // Chunk lower bound
+  int ub;   // Chunk upper bound
+  int st;   // Chunk stride
+  int rc;
+  int tid = omp_get_thread_num();
+  int gtid = tid;
+  int last;
+  // #ifdef rather than #if: the file enables diagnostics with a bare
+  // "#define DEBUG", which "#if DEBUG" cannot evaluate.
+#ifdef DEBUG
+  printf("run_loop_<%d>(gtid=%d, tid=%d, lb=%d, ub=%d, st=%d, ch=%d)\n",
+    (int)sizeof(int), gtid, tid,
+    (int)loop_lb, (int)loop_ub, (int)loop_st, loop_chunk);
+#endif
+  // Don't test degenerate cases that should have been discovered by codegen
+  if (loop_st == 0)
+    return 0;
+  if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
+    return 0;
+
+  __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_guided_simd,
+                         loop_lb, loop_ub, loop_st, loop_chunk);
+  if (tid == 0) {
+    // Let the master thread handle the chunks alone
+    int chunk = 0;            // No of current chunk
+    int next_lb = loop_lb;    // Lower bound of the next chunk
+    // Upper bound of the last processed chunk.  Starts at loop_ub so that the
+    // final checks never read an indeterminate value when no chunk arrives;
+    // that situation is reported by the "at least one chunk" check.
+    int last_ub = loop_ub;
+    u64 cur;                  // Number of iterations in current chunk
+    // Max allowed iterations for current chunk.
+    // The first chunk can consume all iterations
+    u64 max = (loop_ub - loop_lb) / loop_st + 1;
+    int undersized = 0;
+
+    while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) {
+      ++chunk;
+#ifdef DEBUG
+      printf("chunk=%d, lb=%d, ub=%d\n", chunk, (int)lb, (int)ub);
+#endif
+      // Check if previous chunk (it is not the final chunk) is undersized
+      if (undersized) {
+        printf("Error with chunk %d\n", chunk);
+        err++;
+      }
+      // Check lower and upper bounds
+      if (lb != next_lb) {
+        printf("Error with lb %d, %d, ch %d\n", (int)lb, (int)next_lb, chunk);
+        err++;
+      }
+      if (loop_st > 0) {
+        if (!(ub <= loop_ub)) {
+          printf("Error with ub %d, %d, ch %d\n", (int)ub, (int)loop_ub, chunk);
+          err++;
+        }
+        if (!(lb <= ub)) {
+          printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk);
+          err++;
+        }
+      } else {
+        if (!(ub >= loop_ub)) {
+          printf("Error with ub %d, %d, %d\n", (int)ub, (int)loop_ub, chunk);
+          err++;
+        }
+        if (!(lb >= ub)) {
+          printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk);
+          err++;
+        }
+      }
+      // Stride should not change
+      if (!(st == loop_st)) {
+        printf("Error with st %d, %d, ch %d\n", (int)st, (int)loop_st, chunk);
+        err++;
+      }
+      cur = (ub - lb) / loop_st + 1;
+      // Guided scheduling uses FP computations, so current chunk may
+      // be a bit bigger (+1) than allowed maximum
+      if (!(cur <= max + 1)) {
+        // cur and max are 64-bit; narrow them explicitly to match %d
+        printf("Error with iter %d, %d\n", (int)cur, (int)max);
+        err++;
+      }
+      // Update maximum for the next chunk
+      if (cur < max)
+        max = cur;
+      next_lb = ub + loop_st;
+      last_ub = ub;
+      undersized = (cur < loop_chunk);
+    }
+    // Must have at least one chunk
+    if (!(chunk > 0)) {
+      printf("Error with chunk %d\n", chunk);
+      err++;
+    }
+    // Must have the right last iteration index
+    if (loop_st > 0) {
+      if (!(last_ub <= loop_ub)) {
+        printf("Error with last1 %d, %d, ch %d\n",
+               (int)last_ub, (int)loop_ub, chunk);
+        err++;
+      }
+      if (!(last_ub + loop_st > loop_ub)) {
+        printf("Error with last2 %d, %d, %d, ch %d\n",
+               (int)last_ub, (int)loop_st, (int)loop_ub, chunk);
+        err++;
+      }
+    } else {
+      if (!(last_ub >= loop_ub)) {
+        printf("Error with last1 %d, %d, ch %d\n",
+               (int)last_ub, (int)loop_ub, chunk);
+        err++;
+      }
+      if (!(last_ub + loop_st < loop_ub)) {
+        printf("Error with last2 %d, %d, %d, ch %d\n",
+               (int)last_ub, (int)loop_st, (int)loop_ub, chunk);
+        err++;
+      }
+    }
+    // Let non-master threads go
+    loop_sync = 1;
+  } else {
+    int i;
+    // Workers wait for master thread to finish, then call __kmpc_dispatch_next.
+    // Spin briefly first, then fall back to sleeping between polls.
+    for (i = 0; i < 1000000; ++i) {
+      if (loop_sync != 0) {
+        break;
+      }
+    }
+    while (loop_sync == 0) {
+      delay();
+    }
+    // At this moment we do not have any more chunks -- all the chunks already
+    // processed by the master thread
+    rc = __kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st);
+    if (rc) {
+      printf("Error return value\n");
+      err++;
+    }
+  }
+
+  // First barrier: every worker has observed loop_sync != 0.  Second barrier:
+  // nobody leaves before the flag is reset for the next call.
+  __kmpc_barrier(&loc, gtid);
+  if (tid == 0) {
+    loop_sync = 0;    // Restore original state
+#ifdef DEBUG
+    printf("run_loop<>(): at the end\n");
+#endif
+  }
+  __kmpc_barrier(&loc, gtid);
+  return err;
+} // run_loop
+
+// ---------------------------------------------------------------------------
+// Run the 64-bit guided-simd checks on a team of num_th threads.
+//
+// Every thread walks the same parameter space -- chunk sizes SIMD_LEN,
+// 2*SIMD_LEN and 3*SIMD_LEN, strides 1..3, and a range of bounds scaled by
+// num_th -- and tests each loop both forwards and reversed.  All threads must
+// make identical calls because run_loop_64() synchronizes the team.
+//
+// Returns the total number of failed checks across all threads.
+int run_64(int num_th)
+{
+ int err = 0;
+ // reduction(+:err): each thread accumulates privately and the totals are
+ // combined at the end, instead of all threads racing on one shared counter.
+#pragma omp parallel num_threads(num_th) reduction(+:err)
+ {
+  int chunk;
+  i64 st, lb, ub;
+  for (chunk = SIMD_LEN; chunk <= 3*SIMD_LEN; chunk += SIMD_LEN) {
+    for (st = 1; st <= 3; ++ st) {
+      for (lb = -3 * num_th * st; lb <= 3 * num_th * st; ++ lb) {
+        for (ub = lb; ub < lb + num_th * (chunk+1) * st; ++ ub) {
+          err += run_loop_64(lb, ub,  st, chunk);
+          err += run_loop_64(ub, lb, -st, chunk);
+        } // for ub
+      } // for lb
+    } // for st
+  } // for chunk
+ }
+ return err;
+} // run_all
+
+// Run the 32-bit guided-simd checks on a team of num_th threads.
+//
+// Every thread walks the same parameter space -- chunk sizes SIMD_LEN,
+// 2*SIMD_LEN and 3*SIMD_LEN, strides 1..3, and a range of bounds scaled by
+// num_th -- and tests each loop both forwards and reversed.  All threads must
+// make identical calls because run_loop_32() synchronizes the team.
+//
+// Returns the total number of failed checks across all threads.
+int run_32(int num_th)
+{
+ int err = 0;
+ // reduction(+:err): each thread accumulates privately and the totals are
+ // combined at the end, instead of all threads racing on one shared counter.
+#pragma omp parallel num_threads(num_th) reduction(+:err)
+ {
+  int chunk, st, lb, ub;
+  for (chunk = SIMD_LEN; chunk <= 3*SIMD_LEN; chunk += SIMD_LEN) {
+    for (st = 1; st <= 3; ++ st) {
+      for (lb = -3 * num_th * st; lb <= 3 * num_th * st; ++ lb) {
+        for (ub = lb; ub < lb + num_th * (chunk+1) * st; ++ ub) {
+          err += run_loop_32(lb, ub,  st, chunk);
+          err += run_loop_32(ub, lb, -st, chunk);
+        } // for ub
+      } // for lb
+    } // for st
+  } // for chunk
+ }
+ return err;
+} // run_all
+
+// ---------------------------------------------------------------------------
+// Runs the 32-bit and 64-bit checks with team sizes 1 through 4 and reports
+// the combined result.  Returns 0 on success and 1 if any check failed.
+int main()
+{
+  int n, err = 0;
+  for (n = 1; n <= 4; ++ n) {
+    err += run_32(n);
+    err += run_64(n);
+  } // for n
+  if (err)
+    printf("failed with %d errors\n", err);
+  else
+    printf("passed\n");
+  // Do not return the raw count: POSIX keeps only the low 8 bits of the exit
+  // status, so a count that is a multiple of 256 would look like success.
+  return err ? 1 : 0;
+}
diff --git a/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_api.c b/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_api.c
new file mode 100644
index 0000000..987a5c0
--- /dev/null
+++ b/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_api.c
@@ -0,0 +1,222 @@
+// RUN: %libomp-compile-and-run
+
+// The test checks schedule(simd:runtime)
+// in combination with omp_set_schedule()
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+#if defined(WIN32) || defined(_WIN32)
+#include <windows.h>
+#define delay() Sleep(1);
+#define seten(a,b,c) _putenv_s((a),(b))
+#else
+#include <unistd.h>
+#define delay() usleep(10);
+#define seten(a,b,c) setenv((a),(b),(c))
+#endif
+
+#define SIMD_LEN 4
+int err = 0;
+
+// ---------------------------------------------------------------------------
+// Various definitions copied from OpenMP RTL.
+// Schedule kinds passed as the third argument of __kmpc_dispatch_init_*().
+// The values are copied from the runtime and are meant to match it.
+enum sched {
+  kmp_sch_static_balanced_chunked = 45,
+  kmp_sch_guided_simd = 46,
+  kmp_sch_runtime_simd = 47,
+};
+// Short integer aliases used in the declarations and checks below.
+typedef unsigned u32;
+typedef long long i64;
+typedef unsigned long long u64;
+// Source-location descriptor handed to every __kmpc_* call in this file.
+// NOTE(review): presumably mirrors the runtime's ident_t layout -- confirm.
+typedef struct {
+  int reserved_1;
+  int flags;
+  int reserved_2;
+  int reserved_3;
+  char *psource;
+} id;
+
+// Runtime entry points; given C linkage when the file is built as C++.
+#ifdef __cplusplus
+extern "C" {
+#endif
+  int __kmpc_global_thread_num(id*);
+  void __kmpc_barrier(id*, int gtid);
+  void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int);
+  void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64);
+  int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*);
+  int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+// End of definitions copied from OpenMP RTL.
+// ---------------------------------------------------------------------------
+// Single location descriptor shared by all runtime calls below.
+static id loc = {0, 2, 0, 0, ";file;func;0;0;;"};
+
+// ---------------------------------------------------------------------------
+// Run one loop under schedule(simd:runtime) and validate every chunk the
+// dispatcher hands to the calling thread.
+//
+// Meant to be called by each thread of a parallel region with identical
+// arguments: the function ends with a __kmpc_barrier(), and degenerate loops
+// (zero stride, or bounds contradicting the stride sign) return before it.
+// Failed checks are reported with printf() and counted in the file-scope
+// 'err'.
+// NOTE(review): 'err' is incremented without synchronization, so the printed
+// counts may be inexact when several threads fail at the same time.
+void
+run_loop(
+    int loop_lb,   // Loop lower bound.
+    int loop_ub,   // Loop upper bound.
+    int loop_st,   // Loop stride.
+    int lchunk     // Chunk size of the schedule under test; 0 if none given.
+) {
+  int lb;   // Chunk lower bound.
+  int ub;   // Chunk upper bound.
+  int st;   // Chunk stride.
+  int nthreads = omp_get_num_threads();
+  int tid = omp_get_thread_num();
+  int gtid = __kmpc_global_thread_num(&loc);
+  // Initialized because it is read after the dispatch loop even when this
+  // thread never received a chunk.
+  int last = 0;
+  int ch;   // Expected chunk granularity: lchunk * SIMD_LEN iterations.
+  int no_chunk = 0;
+  if (lchunk == 0) {
+    no_chunk = 1;
+    lchunk = 1;
+  }
+  ch = lchunk * SIMD_LEN;
+#if _DEBUG > 1
+  printf("run_loop gtid %d tid %d (lb=%d, ub=%d, st=%d, ch=%d)\n",
+         gtid, tid, (int)loop_lb, (int)loop_ub, (int)loop_st, lchunk);
+#endif
+  // Don't test degenerate cases that should have been discovered by codegen.
+  // Nothing above may divide by loop_st, otherwise the zero check comes late.
+  if (loop_st == 0)
+    return;
+  if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
+    return;
+  __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_runtime_simd,
+                         loop_lb, loop_ub, loop_st, SIMD_LEN);
+  {
+    // Every thread takes chunks here and validates the ones it receives.
+    int chunk;      // No of current chunk.
+    int last_ub;    // Upper bound of the last processed chunk.
+    u64 cur;        // Number of iterations in current chunk.
+    u64 max;        // Max allowed iterations for current chunk.
+    int undersized = 0;
+    last_ub = loop_ub;
+    chunk = 0;
+    max = (loop_ub - loop_lb) / loop_st + 1;
+    // The first chunk can consume all iterations.
+    while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) {
+      ++ chunk;
+#if _DEBUG
+      printf("th %d: chunk=%d, lb=%d, ub=%d ch %d\n",
+             tid, chunk, (int)lb, (int)ub, (int)(ub-lb+1));
+#endif
+      // Check if previous chunk (it is not the final chunk) is undersized.
+      if (undersized)
+        printf("Error with chunk %d, th %d, err %d\n", chunk, tid, ++err);
+      if (loop_st > 0) {
+        if (!(ub <= loop_ub))
+          printf("Error with ub %d, %d, ch %d, err %d\n",
+                 (int)ub, (int)loop_ub, chunk, ++err);
+        if (!(lb <= ub))
+          printf("Error with bounds %d, %d, %d, err %d\n",
+                 (int)lb, (int)ub, chunk, ++err);
+      } else {
+        if (!(ub >= loop_ub))
+          printf("Error with ub %d, %d, %d, err %d\n",
+                 (int)ub, (int)loop_ub, chunk, ++err);
+        if (!(lb >= ub))
+          printf("Error with bounds %d, %d, %d, err %d\n",
+                 (int)lb, (int)ub, chunk, ++err);
+      } // if
+      // Stride should not change.
+      if (!(st == loop_st))
+        printf("Error with st %d, %d, ch %d, err %d\n",
+               (int)st, (int)loop_st, chunk, ++err);
+      cur = ( ub - lb ) / loop_st + 1;
+      // Guided scheduling uses FP computations, so current chunk may
+      // be a bit bigger (+1) than allowed maximum.
+      // cur and max are 64-bit; narrow them explicitly to match %d.
+      if (!( cur <= max + 1))
+        printf("Error with iter %d, %d, err %d\n", (int)cur, (int)max, ++err);
+      // The last chunk must not exceed ch (when a chunk size was given and
+      // the team has more than one thread); every other chunk must be a
+      // multiple of ch.
+      if (last) {
+        if (!no_chunk && cur > ch && nthreads > 1)
+          printf("Error: too big last chunk %d (%d), tid %d, err %d\n",
+                 (int)cur, ch, tid, ++err);
+      } else {
+        if (cur % ch)
+          printf("Error with chunk %d, %d, ch %d, tid %d, err %d\n",
+                 chunk, (int)cur, ch, tid, ++err);
+      }
+      // Update maximum for the next chunk.
+      if (cur < max)
+        max = cur;
+      last_ub = ub;
+      undersized = (cur < ch);
+#if _DEBUG > 1
+      if (last)
+        printf("under%d cur %d, ch %d, tid %d, ub %d, lb %d, st %d =======\n",
+               undersized,(int)cur,ch,tid,ub,lb,loop_st);
+#endif
+    } // while
+    // Must have the right last iteration index.
+    if (loop_st > 0) {
+      if (!(last_ub <= loop_ub))
+        printf("Error with last1 %d, %d, ch %d, err %d\n",
+               (int)last_ub, (int)loop_ub, chunk, ++err);
+      if (last && !(last_ub + loop_st > loop_ub))
+        printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
+               (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
+    } else {
+      if (!(last_ub >= loop_ub))
+        printf("Error with last1 %d, %d, ch %d, err %d\n",
+               (int)last_ub, (int)loop_ub, chunk, ++err);
+      if (last && !(last_ub + loop_st < loop_ub))
+        printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
+               (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
+    } // if
+  }
+  __kmpc_barrier(&loc, gtid);
+} // run_loop
+
+// Selects a sequence of runtime schedules through omp_set_schedule() and runs
+// the same 27-iteration loop (0..26, stride 1) under each of them.
+// Returns 0 and prints "passed" if no check failed, otherwise returns 1.
+int main(int argc, char *argv[])
+{
+  // One row per scenario, executed in order.
+  static const struct {
+    omp_sched_t kind;  // schedule kind given to omp_set_schedule()
+    int modifier;      // chunk argument given to omp_set_schedule()
+    int chunk;         // chunk value handed to run_loop()
+  } cases[] = {
+    {omp_sched_static,  0, 0},  // static (no chunk)
+    {omp_sched_auto,    0, 0},  // auto (chunk should be ignored)
+    {omp_sched_static,  1, 1},  // static,1
+    {omp_sched_dynamic, 1, 1},  // dynamic,1
+    {omp_sched_guided,  1, 1},  // guided,1
+    {omp_sched_dynamic, 0, 1},  // dynamic,0 - use default chunk size 1
+    {omp_sched_guided,  0, 1},  // guided,0 - use default chunk size 1
+  };
+  int n;
+  for (n = 0; n < (int)(sizeof(cases) / sizeof(cases[0])); ++n) {
+    int chunk = cases[n].chunk;
+    omp_set_schedule(cases[n].kind, cases[n].modifier);
+#pragma omp parallel
+    run_loop(0, 26, 1, chunk);
+  }
+
+  if (err == 0) {
+    printf("passed\n");
+    return 0;
+  }
+  printf("failed, err = %d\n", err);
+  return 1;
+}
diff --git a/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_guided.c b/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_guided.c
new file mode 100644
index 0000000..5dfaf24
--- /dev/null
+++ b/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_guided.c
@@ -0,0 +1,197 @@
+// RUN: %libomp-compile
+// RUN: env OMP_SCHEDULE=guided    %libomp-run
+// RUN: env OMP_SCHEDULE=guided,1  %libomp-run 1
+// RUN: env OMP_SCHEDULE=guided,2  %libomp-run 2
+// RUN: env OMP_SCHEDULE=dynamic   %libomp-run
+// RUN: env OMP_SCHEDULE=dynamic,1 %libomp-run 1
+// RUN: env OMP_SCHEDULE=dynamic,2 %libomp-run 2
+// RUN: env OMP_SCHEDULE=auto      %libomp-run
+
+// The test checks schedule(simd:runtime)
+// in combination with OMP_SCHEDULE=guided[,chunk]
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+#if defined(WIN32) || defined(_WIN32)
+#include <windows.h>
+#define delay() Sleep(1);
+#define seten(a,b,c) _putenv_s((a),(b))
+#else
+#include <unistd.h>
+#define delay() usleep(10);
+#define seten(a,b,c) setenv((a),(b),(c))
+#endif
+
+#define UBOUND 100
+#define SIMD_LEN 4
+int err = 0;
+
+// ---------------------------------------------------------------------------
+// Various definitions copied from OpenMP RTL.
+// Schedule kinds passed as the third argument of __kmpc_dispatch_init_*().
+// The values are copied from the runtime and are meant to match it.
+enum sched {
+  kmp_sch_static_balanced_chunked = 45,
+  kmp_sch_guided_simd = 46,
+  kmp_sch_runtime_simd = 47,
+};
+// Short integer aliases used in the declarations and checks below.
+typedef unsigned u32;
+typedef long long i64;
+typedef unsigned long long u64;
+// Source-location descriptor handed to every __kmpc_* call in this file.
+// NOTE(review): presumably mirrors the runtime's ident_t layout -- confirm.
+typedef struct {
+  int reserved_1;
+  int flags;
+  int reserved_2;
+  int reserved_3;
+  char *psource;
+} id;
+
+// Runtime entry points; given C linkage when the file is built as C++.
+#ifdef __cplusplus
+extern "C" {
+#endif
+  int __kmpc_global_thread_num(id*);
+  void __kmpc_barrier(id*, int gtid);
+  void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int);
+  void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64);
+  int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*);
+  int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+// End of definitions copied from OpenMP RTL.
+// ---------------------------------------------------------------------------
+// Single location descriptor shared by all runtime calls below.
+static id loc = {0, 2, 0, 0, ";file;func;0;0;;"};
+
+// ---------------------------------------------------------------------------
+// Run one loop under schedule(simd:runtime) and validate every chunk the
+// dispatcher hands to the calling thread.
+//
+// Meant to be called by each thread of a parallel region with identical
+// arguments: the function ends with a __kmpc_barrier(), and degenerate loops
+// (zero stride, or bounds contradicting the stride sign) return before it.
+// Failed checks are reported with printf() and counted in the file-scope
+// 'err'.
+// NOTE(review): 'err' is incremented without synchronization, so the printed
+// counts may be inexact when several threads fail at the same time.
+void
+run_loop(
+    int loop_lb,   // Loop lower bound.
+    int loop_ub,   // Loop upper bound.
+    int loop_st,   // Loop stride.
+    int lchunk     // Chunk size of the schedule under test; 0 if none given.
+) {
+  int lb;   // Chunk lower bound.
+  int ub;   // Chunk upper bound.
+  int st;   // Chunk stride.
+  int nthreads = omp_get_num_threads();
+  int tid = omp_get_thread_num();
+  int gtid = __kmpc_global_thread_num(&loc);
+  // Initialized because it is read after the dispatch loop even when this
+  // thread never received a chunk.
+  int last = 0;
+  int ch;   // Expected chunk granularity: lchunk * SIMD_LEN iterations.
+  int no_chunk = 0;
+  if (lchunk == 0) {
+    no_chunk = 1;
+    lchunk = 1;
+  }
+  ch = lchunk * SIMD_LEN;
+#if _DEBUG > 1
+  printf("run_loop gtid %d tid %d (lb=%d, ub=%d, st=%d, ch=%d)\n",
+         gtid, tid, (int)loop_lb, (int)loop_ub, (int)loop_st, lchunk);
+#endif
+  // Don't test degenerate cases that should have been discovered by codegen.
+  // Nothing above may divide by loop_st, otherwise the zero check comes late.
+  if (loop_st == 0)
+    return;
+  if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
+    return;
+  __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_runtime_simd,
+                         loop_lb, loop_ub, loop_st, SIMD_LEN);
+  {
+    // Every thread takes chunks here and validates the ones it receives.
+    int chunk;      // No of current chunk.
+    int last_ub;    // Upper bound of the last processed chunk.
+    u64 cur;        // Number of iterations in current chunk.
+    u64 max;        // Max allowed iterations for current chunk.
+    int undersized = 0;
+    last_ub = loop_ub;
+    chunk = 0;
+    max = (loop_ub - loop_lb) / loop_st + 1;
+    // The first chunk can consume all iterations.
+    while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) {
+      ++ chunk;
+#if _DEBUG
+      printf("th %d: chunk=%d, lb=%d, ub=%d ch %d\n",
+             tid, chunk, (int)lb, (int)ub, (int)(ub-lb+1));
+#endif
+      // Check if previous chunk (it is not the final chunk) is undersized.
+      if (undersized)
+        printf("Error with chunk %d, th %d, err %d\n", chunk, tid, ++err);
+      if (loop_st > 0) {
+        if (!(ub <= loop_ub))
+          printf("Error with ub %d, %d, ch %d, err %d\n",
+                 (int)ub, (int)loop_ub, chunk, ++err);
+        if (!(lb <= ub))
+          printf("Error with bounds %d, %d, %d, err %d\n",
+                 (int)lb, (int)ub, chunk, ++err);
+      } else {
+        if (!(ub >= loop_ub))
+          printf("Error with ub %d, %d, %d, err %d\n",
+                 (int)ub, (int)loop_ub, chunk, ++err);
+        if (!(lb >= ub))
+          printf("Error with bounds %d, %d, %d, err %d\n",
+                 (int)lb, (int)ub, chunk, ++err);
+      } // if
+      // Stride should not change.
+      if (!(st == loop_st))
+        printf("Error with st %d, %d, ch %d, err %d\n",
+               (int)st, (int)loop_st, chunk, ++err);
+      cur = ( ub - lb ) / loop_st + 1;
+      // Guided scheduling uses FP computations, so current chunk may
+      // be a bit bigger (+1) than allowed maximum.
+      // cur and max are 64-bit; narrow them explicitly to match %d.
+      if (!( cur <= max + 1))
+        printf("Error with iter %d, %d, err %d\n", (int)cur, (int)max, ++err);
+      // Every chunk but the last must be a multiple of ch; the last one must
+      // not exceed ch when a chunk size was given and the team has more than
+      // one thread.
+      if (!last && cur % ch)
+        printf("Error with chunk %d, %d, ch %d, tid %d, err %d\n",
+               chunk, (int)cur, ch, tid, ++err);
+      if (last && !no_chunk && cur > ch && nthreads > 1)
+        printf("Error: too big last chunk %d (%d), tid %d, err %d\n",
+               (int)cur, ch, tid, ++err);
+      // Update maximum for the next chunk.
+      if (cur < max)
+        max = cur;
+      last_ub = ub;
+      undersized = (cur < ch);
+#if _DEBUG > 1
+      if (last)
+        printf("under%d cur %d, ch %d, tid %d, ub %d, lb %d, st %d =======\n",
+               undersized,(int)cur,ch,tid,ub,lb,loop_st);
+#endif
+    } // while
+    // Must have the right last iteration index.
+    if (loop_st > 0) {
+      if (!(last_ub <= loop_ub))
+        printf("Error with last1 %d, %d, ch %d, err %d\n",
+               (int)last_ub, (int)loop_ub, chunk, ++err);
+      if (last && !(last_ub + loop_st > loop_ub))
+        printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
+               (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
+    } else {
+      if (!(last_ub >= loop_ub))
+        printf("Error with last1 %d, %d, ch %d, err %d\n",
+               (int)last_ub, (int)loop_ub, chunk, ++err);
+      if (last && !(last_ub + loop_st < loop_ub))
+        printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
+               (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
+    } // if
+  }
+  __kmpc_barrier(&loc, gtid);
+} // run_loop
+
+// Runs the loop 0..UBOUND under whatever schedule OMP_SCHEDULE selects.
+// An optional first argument gives the chunk size to expect; without it the
+// chunk is treated as unspecified (0).
+// Returns 0 and prints "passed" if no check failed, otherwise returns 1.
+int main(int argc, char *argv[])
+{
+  // expect chunk size as a parameter
+  int chunk = (argc > 1) ? atoi(argv[1]) : 0;
+#pragma omp parallel
+  run_loop(0, UBOUND, 1, chunk);
+  if (err == 0) {
+    printf("passed\n");
+    return 0;
+  }
+  printf("failed, err = %d\n", err);
+  return 1;
+}
diff --git a/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_static.c b/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_static.c
new file mode 100644
index 0000000..d76046b
--- /dev/null
+++ b/final/runtime/test/worksharing/for/kmp_sch_simd_runtime_static.c
@@ -0,0 +1,202 @@
+// RUN: %libomp-compile && %libomp-run
+// RUN: %libomp-run 1 && %libomp-run 2
+
+// The test checks schedule(simd:runtime)
+// in combination with OMP_SCHEDULE=static[,chunk]
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <omp.h>
+
+#if defined(WIN32) || defined(_WIN32)
+#include <windows.h>
+#define delay() Sleep(1);
+#define seten(a,b,c) _putenv_s((a),(b))
+#else
+#include <unistd.h>
+#define delay() usleep(10);
+#define seten(a,b,c) setenv((a),(b),(c))
+#endif
+
+#define SIMD_LEN 4
+int err = 0;
+
+// ---------------------------------------------------------------------------
+// Various definitions copied from OpenMP RTL.
+// Schedule kinds passed as the third argument of __kmpc_dispatch_init_*().
+// The values are copied from the runtime and are meant to match it.
+enum sched {
+  kmp_sch_static_balanced_chunked = 45,
+  kmp_sch_guided_simd = 46,
+  kmp_sch_runtime_simd = 47,
+};
+// Short integer aliases used in the declarations and checks below.
+typedef unsigned u32;
+typedef long long i64;
+typedef unsigned long long u64;
+// Source-location descriptor handed to every __kmpc_* call in this file.
+// NOTE(review): presumably mirrors the runtime's ident_t layout -- confirm.
+typedef struct {
+  int reserved_1;
+  int flags;
+  int reserved_2;
+  int reserved_3;
+  char *psource;
+} id;
+
+// Runtime entry points; given C linkage when the file is built as C++.
+#ifdef __cplusplus
+extern "C" {
+#endif
+  int __kmpc_global_thread_num(id*);
+  void __kmpc_barrier(id*, int gtid);
+  void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int);
+  void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64);
+  int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*);
+  int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+// End of definitions copied from OpenMP RTL.
+// ---------------------------------------------------------------------------
+// Single location descriptor shared by all runtime calls below.
+static id loc = {0, 2, 0, 0, ";file;func;0;0;;"};
+
+// ---------------------------------------------------------------------------
+// Run one loop under schedule(simd:runtime) and validate every chunk the
+// dispatcher hands to the calling thread.
+//
+// Meant to be called by each thread of a parallel region with identical
+// arguments: the function ends with a __kmpc_barrier(), and degenerate loops
+// (zero stride, or bounds contradicting the stride sign) return before it.
+// Failed checks are reported with printf() and counted in the file-scope
+// 'err'.
+// NOTE(review): 'err' is incremented without synchronization, so the printed
+// counts may be inexact when several threads fail at the same time.
+void
+run_loop(
+    int loop_lb,   // Loop lower bound.
+    int loop_ub,   // Loop upper bound.
+    int loop_st,   // Loop stride.
+    int lchunk     // Chunk size of the schedule under test; 0 if none given.
+) {
+  int lb;   // Chunk lower bound.
+  int ub;   // Chunk upper bound.
+  int st;   // Chunk stride.
+  int nthreads = omp_get_num_threads();
+  int tid = omp_get_thread_num();
+  int gtid = __kmpc_global_thread_num(&loc);
+  // Initialized because it is read after the dispatch loop even when this
+  // thread never received a chunk.
+  int last = 0;
+  int ch;   // Expected chunk granularity: lchunk * SIMD_LEN iterations.
+  int no_chunk = 0;
+  if (lchunk == 0) {
+    no_chunk = 1;
+    lchunk = 1;
+  }
+  ch = lchunk * SIMD_LEN;
+#if _DEBUG > 1
+  printf("run_loop gtid %d tid %d (lb=%d, ub=%d, st=%d, ch=%d)\n",
+         gtid, tid, (int)loop_lb, (int)loop_ub, (int)loop_st, lchunk);
+#endif
+  // Don't test degenerate cases that should have been discovered by codegen.
+  // Nothing above may divide by loop_st, otherwise the zero check comes late.
+  if (loop_st == 0)
+    return;
+  if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
+    return;
+  __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_runtime_simd,
+                         loop_lb, loop_ub, loop_st, SIMD_LEN);
+  {
+    // Every thread takes chunks here and validates the ones it receives.
+    int chunk;      // No of current chunk.
+    int last_ub;    // Upper bound of the last processed chunk.
+    u64 cur;        // Number of iterations in current chunk.
+    u64 max;        // Max allowed iterations for current chunk.
+    int undersized = 0;
+    last_ub = loop_ub;
+    chunk = 0;
+    max = (loop_ub - loop_lb) / loop_st + 1;
+    // The first chunk can consume all iterations.
+    while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) {
+      ++ chunk;
+#if _DEBUG
+      printf("th %d: chunk=%d, lb=%d, ub=%d ch %d\n",
+             tid, chunk, (int)lb, (int)ub, (int)(ub-lb+1));
+#endif
+      // Check if previous chunk (it is not the final chunk) is undersized.
+      if (undersized)
+        printf("Error with chunk %d, th %d, err %d\n", chunk, tid, ++err);
+      if (loop_st > 0) {
+        if (!(ub <= loop_ub))
+          printf("Error with ub %d, %d, ch %d, err %d\n",
+                 (int)ub, (int)loop_ub, chunk, ++err);
+        if (!(lb <= ub))
+          printf("Error with bounds %d, %d, %d, err %d\n",
+                 (int)lb, (int)ub, chunk, ++err);
+      } else {
+        if (!(ub >= loop_ub))
+          printf("Error with ub %d, %d, %d, err %d\n",
+                 (int)ub, (int)loop_ub, chunk, ++err);
+        if (!(lb >= ub))
+          printf("Error with bounds %d, %d, %d, err %d\n",
+                 (int)lb, (int)ub, chunk, ++err);
+      } // if
+      // Stride should not change.
+      if (!(st == loop_st))
+        printf("Error with st %d, %d, ch %d, err %d\n",
+               (int)st, (int)loop_st, chunk, ++err);
+      cur = ( ub - lb ) / loop_st + 1;
+      // Guided scheduling uses FP computations, so current chunk may
+      // be a bit bigger (+1) than allowed maximum.
+      // cur and max are 64-bit; narrow them explicitly to match %d.
+      if (!( cur <= max + 1))
+        printf("Error with iter %d, %d, err %d\n", (int)cur, (int)max, ++err);
+      // The last chunk must not exceed ch (when a chunk size was given and
+      // the team has more than one thread); every other chunk must be a
+      // multiple of ch.
+      if (last) {
+        if (!no_chunk && cur > ch && nthreads > 1)
+          printf("Error: too big last chunk %d (%d), tid %d, err %d\n",
+                 (int)cur, ch, tid, ++err);
+      } else {
+        if (cur % ch)
+          printf("Error with chunk %d, %d, ch %d, tid %d, err %d\n",
+                 chunk, (int)cur, ch, tid, ++err);
+      }
+      // Update maximum for the next chunk.
+      if (cur < max)
+        max = cur;
+      last_ub = ub;
+      undersized = (cur < ch);
+#if _DEBUG > 1
+      if (last)
+        printf("under%d cur %d, ch %d, tid %d, ub %d, lb %d, st %d =======\n",
+               undersized,(int)cur,ch,tid,ub,lb,loop_st);
+#endif
+    } // while
+    // Must have the right last iteration index.
+    if (loop_st > 0) {
+      if (!(last_ub <= loop_ub))
+        printf("Error with last1 %d, %d, ch %d, err %d\n",
+               (int)last_ub, (int)loop_ub, chunk, ++err);
+      if (last && !(last_ub + loop_st > loop_ub))
+        printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
+               (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
+    } else {
+      if (!(last_ub >= loop_ub))
+        printf("Error with last1 %d, %d, ch %d, err %d\n",
+               (int)last_ub, (int)loop_ub, chunk, ++err);
+      if (last && !(last_ub + loop_st < loop_ub))
+        printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
+               (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
+    } // if
+  }
+  __kmpc_barrier(&loc, gtid);
+} // run_loop
+
+// Sets OMP_SCHEDULE to "static" or, when a chunk size is given as the first
+// argument, to "static,<chunk>", then runs the loop 0..26 under
+// schedule(simd:runtime).
+// Returns 0 and prints "passed" if no check failed, otherwise returns 1.
+int main(int argc, char *argv[])
+{
+  int chunk = 0;
+  if (argc > 1) {
+    // 8 = strlen("static,") + terminating NUL
+    char *buf = malloc(8 + strlen(argv[1]));
+    if (buf == NULL) {
+      printf("failed, cannot allocate OMP_SCHEDULE value\n");
+      return 1;
+    }
+    // expect chunk size as a parameter
+    chunk = atoi(argv[1]);
+    strcpy(buf,"static,");
+    strcat(buf,argv[1]);
+    seten("OMP_SCHEDULE",buf,1);
+    printf("Testing schedule(simd:%s)\n", buf);
+    free(buf);
+  } else {
+    seten("OMP_SCHEDULE","static",1);
+    printf("Testing schedule(simd:static)\n");
+  }
+#pragma omp parallel// num_threads(num_th)
+  run_loop(0, 26, 1, chunk);
+  if (err) {
+    printf("failed, err = %d\n", err);
+    return 1;
+  } else {
+    printf("passed\n");
+    return 0;
+  }
+}
diff --git a/final/runtime/test/worksharing/for/kmp_set_dispatch_buf.c b/final/runtime/test/worksharing/for/kmp_set_dispatch_buf.c
new file mode 100644
index 0000000..a6378fe
--- /dev/null
+++ b/final/runtime/test/worksharing/for/kmp_set_dispatch_buf.c
@@ -0,0 +1,91 @@
+// RUN: %libomp-compile && %libomp-run 7
+// RUN: %libomp-run 0 && %libomp-run -1
+// RUN: %libomp-run 1 && %libomp-run 2 && %libomp-run 5
+// RUN: %libomp-compile -DMY_SCHEDULE=guided && %libomp-run 7
+// RUN: %libomp-run 1 && %libomp-run 2 && %libomp-run 5
+#include <stdio.h>
+#include <omp.h>
+#include <stdlib.h>
+#include <limits.h>
+#include "omp_testsuite.h"
+
+#define INCR 7
+#define MY_MAX 200
+#define MY_MIN -200
+#ifndef MY_SCHEDULE
+# define MY_SCHEDULE dynamic
+#endif
+
+int num_disp_buffers, num_loops;
+int a, b, a_known_value, b_known_value;
+
+int test_kmp_set_disp_num_buffers() // returns 1 on success, 0 on a mismatch
+{
+  int success = 1;
+  a = 0; // global counter incremented by the ascending loops
+  b = 0; // global counter incremented by the descending loops
+  // run many small nowait loops back to back to stress the dispatch buffers
+  #pragma omp parallel
+  {
+    int i,j;
+    for (j = 0; j < num_loops; j++) { // every thread runs all num_loops rounds
+      #pragma omp for schedule(MY_SCHEDULE) nowait
+      for (i = MY_MIN; i < MY_MAX; i+=INCR) {
+        #pragma omp atomic
+        a++;
+      }
+      #pragma omp for schedule(MY_SCHEDULE) nowait
+      for (i = MY_MAX; i >= MY_MIN; i-=INCR) {
+        #pragma omp atomic
+        b++;
+      }
+    }
+  }
+  // detect failure: both counters must equal the precomputed known values
+  if (a != a_known_value || b != b_known_value) {
+    success = 0;
+    printf("a = %d (should be %d), b = %d (should be %d)\n", a, a_known_value,
+           b, b_known_value);
+  }
+  return success;
+}
+
+// Usage: <prog> num_disp_buffers. Exit status is the number of failed runs.
+int main(int argc, char** argv)
+{
+  int pass, idx, rep;
+  int failures = 0;
+
+  if (argc != 2) {
+    fprintf(stderr, "usage: %s num_disp_buffers\n", argv[0]);
+    exit(1);
+  }
+
+  // hand the requested dispatch buffer count to the runtime
+  num_disp_buffers = atoi(argv[1]);
+  kmp_set_disp_num_buffers(num_disp_buffers);
+
+  // a non-positive (bad) buffer count still gets a reasonable loop count
+  num_loops = (num_disp_buffers > 0) ? num_disp_buffers*10 : 10;
+
+  // serially count the iterations the parallel loops are expected to run
+  a_known_value = 0;
+  b_known_value = 0;
+  for (pass = 0; pass < num_loops; pass++) {
+    idx = MY_MIN;
+    while (idx < MY_MAX) {
+      a_known_value++;
+      idx += INCR;
+    }
+    idx = MY_MAX;
+    while (idx >= MY_MIN) {
+      b_known_value++;
+      idx -= INCR;
+    }
+  }
+
+  for (rep = 0; rep < REPETITIONS; rep++)
+    if (test_kmp_set_disp_num_buffers() == 0)
+      failures++;
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/for/omp_doacross.c b/final/runtime/test/worksharing/for/omp_doacross.c
new file mode 100644
index 0000000..3644306
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_doacross.c
@@ -0,0 +1,62 @@
+// RUN: %libomp-compile-and-run
+// XFAIL: gcc-4, gcc-5, clang-3.7, clang-3.8, icc-15, icc-16
+#include <stdio.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+
+#ifndef N
+#define N 750
+#endif
+
+int test_doacross() { // returns 1 if the wavefront produced the expected value
+  int i, j;
+  // Allocate and zero out the N x N matrix; element (i, j) lives at m[i*N + j]
+  int *m = (int *)malloc(sizeof(int) * N * N); // NOTE(review): result unchecked
+  for (i = 0; i < N; ++i) {
+    for (j = 0; j < N; ++j) {
+      m[i * N + j] = 0;
+    }
+  }
+  // Have first row and column be 0, 1, 2, 3, etc.
+  for (i = 0; i < N; ++i)
+    m[i * N] = i;
+  for (j = 0; j < N; ++j)
+    m[j] = j;
+  // Perform wavefront which results in matrix:
+  // 0 1 2 3 4
+  // 1 2 3 4 5
+  // 2 3 4 5 6
+  // 3 4 5 6 7
+  // 4 5 6 7 8
+  #pragma omp parallel shared(m)
+  {
+    int row, col;
+    #pragma omp for ordered(2)
+    for (row = 1; row < N; ++row) {
+      for (col = 1; col < N; ++col) {
+        #pragma omp ordered depend(sink : row - 1, col) depend(sink : row, col - 1)
+        m[row * N + col] = m[(row - 1) * N + col] + m[row * N + (col - 1)] -
+                           m[(row - 1) * N + (col - 1)]; // up + left - diagonal
+        #pragma omp ordered depend(source)
+      }
+    }
+  }
+
+  // Check the bottom right element to see if iteration dependencies were held
+  int retval = (m[(N - 1) * N + N - 1] == 2 * (N - 1)); // cell (r, c) = r + c
+  free(m);
+  return retval;
+}
+
+// Runs test_doacross REPETITIONS times; the exit status is the failure count.
+int main(int argc, char **argv) {
+  int failures = 0;
+  int rep = 0;
+  // with a maximum of fewer than 2 threads, ask for 4
+  if (omp_get_max_threads() < 2)
+    omp_set_num_threads(4);
+  while (rep++ < REPETITIONS)
+    if (test_doacross() == 0)
+      failures += 1;
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/for/omp_for_bigbounds.c b/final/runtime/test/worksharing/for/omp_for_bigbounds.c
new file mode 100644
index 0000000..901d760
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_bigbounds.c
@@ -0,0 +1,70 @@
+// RUN: %libomp-compile -DMY_SCHEDULE=static && %libomp-run
+// RUN: %libomp-compile -DMY_SCHEDULE=dynamic && %libomp-run
+// RUN: %libomp-compile -DMY_SCHEDULE=guided && %libomp-run
+
+// Only works with Intel Compiler since at least version 15.0
+// XFAIL: gcc, clang
+
+/*
+ * Test that large bounds are handled properly and calculations of
+ * loop iterations don't accidentally overflow
+ */
+#include <stdio.h>
+#include <omp.h>
+#include <stdlib.h>
+#include <limits.h>
+#include "omp_testsuite.h"
+
+#define INCR 50000000
+#define MY_MAX 2000000000
+#define MY_MIN -2000000000
+#ifndef MY_SCHEDULE
+# define MY_SCHEDULE static
+#endif
+
+int a, b, a_known_value, b_known_value;
+
+int test_omp_for_bigbounds() /* returns 1 when both counts match, else 0 */
+{
+  a = 0; /* global count of ascending-loop iterations */
+  b = 0; /* global count of descending-loop iterations */
+  #pragma omp parallel
+  {
+    int i;
+    #pragma omp for schedule(MY_SCHEDULE)
+    for (i = INT_MIN; i < MY_MAX; i+=INCR) { /* starts at the smallest int */
+        #pragma omp atomic
+        a++;
+    }
+    #pragma omp for schedule(MY_SCHEDULE)
+    for (i = INT_MAX; i >= MY_MIN; i-=INCR) { /* starts at the largest int */
+        #pragma omp atomic
+        b++;
+    }
+  }
+  printf("a = %d (should be %d), b = %d (should be %d)\n", a, a_known_value, b, b_known_value);
+  return (a == a_known_value && b == b_known_value); /* known values: see main */
+}
+
+/* Computes the expected counts serially, then counts failing repetitions. */
+int main()
+{
+  int idx, rep;
+  int failures = 0;
+
+  /* expected trip count of the ascending loop */
+  a_known_value = 0;
+  for (idx = INT_MIN; idx < MY_MAX; idx += INCR)
+    a_known_value += 1;
+
+  /* expected trip count of the descending loop */
+  b_known_value = 0;
+  for (idx = INT_MAX; idx >= MY_MIN; idx -= INCR)
+    b_known_value += 1;
+
+  for (rep = 0; rep < REPETITIONS; rep++)
+    if (test_omp_for_bigbounds() == 0)
+      failures++;
+
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/for/omp_for_collapse.c b/final/runtime/test/worksharing/for/omp_for_collapse.c
new file mode 100644
index 0000000..a08086d
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_collapse.c
@@ -0,0 +1,51 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+/* Utility function: returns 1 when i equals the value from the previous
+   call or is exactly one larger; a call with i == 1 restarts from 0 */
+static int check_i_islarger (int i)
+{
+  static int previous;
+  int in_order;
+  /* the first outer iteration (i == 1) resets the remembered value */
+  previous = (i == 1) ? 0 : previous;
+  in_order = (previous <= i) && !(i - previous > 1);
+  previous = i;
+  return in_order;
+}
+
+int test_omp_for_collapse() /* returns 1 if i was always observed in order */
+{
+  int is_larger = 1; /* logical AND of every thread's my_islarger */
+  /* collapsed 99 x 99 loop nest whose ordered region feeds i to the checker */
+  #pragma omp parallel
+  {
+    int i,j;
+    int my_islarger = 1; /* this thread's verdict so far */
+    #pragma omp for private(i,j) schedule(static,1) collapse(2) ordered
+    for (i = 1; i < 100; i++) {
+      for (j =1; j <100; j++) {
+        #pragma omp ordered
+        my_islarger = check_i_islarger(i)&&my_islarger; /* checker always runs */
+      }
+    }
+    #pragma omp critical
+    is_larger = is_larger && my_islarger; /* merge under mutual exclusion */
+  }
+  return (is_larger);
+}
+
+/* Repeats the test REPETITIONS times; exit status = number of failures. */
+int main()
+{
+  int failures = 0;
+  int rep = 0;
+
+  while (rep < REPETITIONS) {
+    failures += test_omp_for_collapse() ? 0 : 1;
+    rep++;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/for/omp_for_firstprivate.c b/final/runtime/test/worksharing/for/omp_for_firstprivate.c
new file mode 100644
index 0000000..6c4121c
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_firstprivate.c
@@ -0,0 +1,55 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+int sum1;
+#pragma omp threadprivate(sum1)
+
+int test_omp_for_firstprivate() /* returns 1 when sum equals known_sum */
+{
+  int sum;        /* total of all threads' final sum1 values */
+  int sum0;       /* firstprivate source; each private copy starts at 12345 */
+  int known_sum;
+  int threadsnum; /* team size, written inside the single construct */
+
+  sum = 0;
+  sum0 = 12345;
+  sum1 = 0; /* sum1 is threadprivate; this assignment runs outside the region */
+
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      threadsnum=omp_get_num_threads();
+    }
+    /* sum0 is not reset here: firstprivate must deliver the 12345 set above */
+
+    int i;
+    #pragma omp for firstprivate(sum0)
+    for (i = 1; i <= LOOPCOUNT; i++) {
+      sum0 = sum0 + i;
+      sum1 = sum0; /* keep the running private total in the threadprivate copy */
+    }  /* end of for */
+
+    #pragma omp critical
+    {
+      sum = sum + sum1;
+    }  /* end of critical */
+  }  /* end of parallel */
+  /* NOTE(review): formula assumes every thread ran an iteration - confirm */
+  known_sum = 12345* threadsnum+ (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+  return (known_sum == sum);
+}
+
+/* Repeats the test REPETITIONS times; exit status = number of failures. */
+int main()
+{
+  int failures = 0;
+  int rep = 0;
+
+  while (rep < REPETITIONS) {
+    failures += test_omp_for_firstprivate() ? 0 : 1;
+    rep++;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/for/omp_for_lastprivate.c b/final/runtime/test/worksharing/for/omp_for_lastprivate.c
new file mode 100644
index 0000000..88694b8
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_lastprivate.c
@@ -0,0 +1,52 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+int sum0;
+#pragma omp threadprivate(sum0)
+
+int test_omp_for_lastprivate() /* returns 1 if sum and i0 are both as expected */
+{
+  int sum = 0;   /* total of all threads' partial sums */
+  int known_sum;
+  int i0;        /* lastprivate target; must end up equal to LOOPCOUNT */
+
+  i0 = -1;       /* sentinel that no loop iteration can produce */
+
+  #pragma omp parallel
+  {
+    sum0 = 0;    /* sum0 is threadprivate: each thread clears its own copy */
+    {  /* Begin of orphaned block */
+      int i;
+      #pragma omp for schedule(static,7) lastprivate(i0)
+      for (i = 1; i <= LOOPCOUNT; i++) {
+        sum0 = sum0 + i;
+        i0 = i;
+      }  /* end of for */
+    }  /* end of orphaned block */
+
+    #pragma omp critical
+    {
+      sum = sum + sum0;
+    }  /* end of critical */
+  }  /* end of parallel */
+
+  known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; /* 1 + 2 + ... + LOOPCOUNT */
+  fprintf(stderr, "known_sum = %d , sum = %d\n",known_sum,sum);
+  fprintf(stderr, "LOOPCOUNT = %d , i0 = %d\n",LOOPCOUNT,i0);
+  return ((known_sum == sum) && (i0 == LOOPCOUNT));
+}
+
+/* Repeats the test REPETITIONS times; exit status = number of failures. */
+int main()
+{
+  int failures = 0;
+  int rep = 0;
+
+  while (rep < REPETITIONS) {
+    failures += test_omp_for_lastprivate() ? 0 : 1;
+    rep++;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/for/omp_for_nowait.c b/final/runtime/test/worksharing/for/omp_for_nowait.c
new file mode 100644
index 0000000..95a4775
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_nowait.c
@@ -0,0 +1,77 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+/*
+ * This test will hang if the nowait is not working properly.
+ *
+ * It relies on a thread skipping to the second for construct to
+ * release the threads in the first for construct.
+ *
+ * Also, we use static scheduling to guarantee that one
+ * thread will make it to the second for construct.
+ */
+volatile int release;
+volatile int count;
+
+void wait_for_release_then_increment(int rank) /* rank is only used for logging */
+{
+  fprintf(stderr, "Thread nr %d enters first for construct"
+    " and waits.\n", rank);
+  while (release == 0); /* spin until the volatile release flag becomes nonzero */
+  #pragma omp atomic
+  count++;
+}
+
+void release_and_increment(int rank) /* rank is only used for logging */
+{
+  fprintf(stderr, "Thread nr %d sets release to 1\n", rank);
+  release = 1; /* lets the spinners in wait_for_release_then_increment go on */
+  #pragma omp atomic
+  count++;
+}
+
+int test_omp_for_nowait() /* returns 1 when all 8 increments were counted */
+{
+  release = 0;
+  count = 0;
+
+  #pragma omp parallel num_threads(4)
+  {
+    int rank;
+    int i;
+
+    rank = omp_get_thread_num();
+    /* first loop: iterations 0..2 spin on release, iteration 3 does not */
+    #pragma omp for schedule(static) nowait
+    for (i = 0; i < 4; i++) {
+      if (i < 3)
+        wait_for_release_then_increment(rank);
+      else {
+        fprintf(stderr, "Thread nr %d enters first for and goes "
+          "immediately to the next for construct to release.\n", rank);
+        #pragma omp atomic
+        count++;
+      }
+    }
+    /* second loop: every iteration sets release, which frees the spinners */
+    #pragma omp for schedule(static)
+    for (i = 0; i < 4; i++) {
+        release_and_increment(rank);
+    }
+  }
+  return (count==8); /* 4 increments from each of the two loops */
+}
+
+/* Repeats the test REPETITIONS times; exit status = number of failures. */
+int main()
+{
+  int failures = 0;
+  int rep = 0;
+
+  while (rep < REPETITIONS) {
+    failures += test_omp_for_nowait() ? 0 : 1;
+    rep++;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/for/omp_for_ordered.c b/final/runtime/test/worksharing/for/omp_for_ordered.c
new file mode 100644
index 0000000..18ac7eb
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_ordered.c
@@ -0,0 +1,60 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+static int last_i = 0;
+
+/* Reports whether i is strictly greater than the value passed on the
+   previous call (tracked in the file-level last_i), then records i. */
+static int check_i_islarger (int i)
+{
+  const int grew = (last_i < i) ? 1 : 0;
+
+  last_i = i;
+  return grew;
+}
+
+int test_omp_for_ordered() /* returns 1 if the sum and the ordering are right */
+{
+  int sum;           /* accumulated inside the ordered region */
+  int is_larger = 1; /* logical AND of every thread's my_islarger */
+  int known_sum;
+
+  last_i = 0;        /* reset the checker state used by check_i_islarger */
+  sum = 0;
+
+  #pragma omp parallel
+  {
+    int i;
+    int my_islarger = 1;
+    #pragma omp for schedule(static,1) ordered
+    for (i = 1; i < 100; i++) {
+      #pragma omp ordered
+      {
+        my_islarger = check_i_islarger(i) && my_islarger;
+        sum = sum + i;
+      }
+    }
+    #pragma omp critical
+    {
+      is_larger = is_larger && my_islarger;
+    }
+  }
+
+  known_sum=(99 * 100) / 2; /* 1 + 2 + ... + 99 */
+  return ((known_sum == sum) && is_larger);
+}
+
+/* Repeats the test REPETITIONS times; exit status = number of failures. */
+int main()
+{
+  int failures = 0;
+  int rep = 0;
+
+  while (rep < REPETITIONS) {
+    failures += test_omp_for_ordered() ? 0 : 1;
+    rep++;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/for/omp_for_private.c b/final/runtime/test/worksharing/for/omp_for_private.c
new file mode 100644
index 0000000..1f537b9
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_private.c
@@ -0,0 +1,63 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+/* Utility function to spend some time: sums 1000 square roots and
+   discards the result */
+static void do_some_work()
+{
+  double acc = 0;
+  int k = 0;
+  while (k < 1000)
+    acc += sqrt ((double) k++);
+}
+
+int sum1;
+#pragma omp threadprivate(sum1)
+
+int test_omp_for_private() /* returns 1 when sum equals known_sum */
+{
+  int sum = 0;  /* total of all threads' sum1 values */
+  int sum0;     /* listed in private() on the loop below */
+  int known_sum;
+
+  sum0 = 0;  /* setting the function-local sum0 (outside the region) to 0 */
+
+  #pragma omp parallel
+  {
+    sum1 = 0;  /* setting sum1 in each thread to 0 */
+    {  /* begin of orphaned block */
+      int i;
+      #pragma omp for private(sum0) schedule(static,1)
+      for (i = 1; i <= LOOPCOUNT; i++) {
+        sum0 = sum1; /* reload the running total from the threadprivate copy */
+        #pragma omp flush
+        sum0 = sum0 + i;
+        do_some_work (); /* burn time between the update and the store back */
+        #pragma omp flush
+        sum1 = sum0;
+      }
+    }  /* end of orphaned block */
+
+    #pragma omp critical
+    {
+      sum = sum + sum1;
+    }  /*end of critical*/
+  }  /* end of parallel*/
+  known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; /* 1 + 2 + ... + LOOPCOUNT */
+  return (known_sum == sum);
+}
+
+/* Repeats the test REPETITIONS times; exit status = number of failures. */
+int main()
+{
+  int failures = 0;
+  int rep = 0;
+
+  while (rep < REPETITIONS) {
+    failures += test_omp_for_private() ? 0 : 1;
+    rep++;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/for/omp_for_reduction.c b/final/runtime/test/worksharing/for/omp_for_reduction.c
new file mode 100644
index 0000000..28f0907
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_reduction.c
@@ -0,0 +1,339 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+#define DOUBLE_DIGITS 20  /* dt^DOUBLE_DIGITS */
+#define MAX_FACTOR 10
+#define KNOWN_PRODUCT 3628800  /* 10! */
+
+int test_omp_for_reduction () /* returns 1 if every reduction matched, else 0 */
+{
+  double dt;
+  int sum;
+  int diff;
+  int product = 1;
+  double dsum;
+  double dknown_sum;
+  double ddiff;
+  int logic_and;
+  int logic_or;
+  int bit_and;
+  int bit_or;
+  int exclusiv_bit_or;
+  int *logics;
+  int i;
+  int known_sum;
+  int known_product;
+  double rounding_error = 1.E-9; /* over all rounding error to be
+                    ignored in the double tests */
+  double dpt;
+  int result = 0; /* number of failed sub-tests */
+  int logicsArray[LOOPCOUNT];
+
+  /* Variables for integer tests */
+  sum = 0;
+  product = 1;
+  known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+  /* variables for double tests */
+  dt = 1. / 3.;  /* base of geometric row for + and - test*/
+  dsum = 0.;
+  /* Variables for logic tests */
+  logics = logicsArray;
+  logic_and = 1;
+  logic_or = 0;
+  /* Variables for bit operators tests */
+  bit_and = 1;
+  bit_or = 0;
+  /* Variables for exclusive bit or */
+  exclusiv_bit_or = 0;
+
+  /************************************************************************/
+  /** Tests for integers                         **/
+  /************************************************************************/
+
+  /**** Testing integer addition ****/
+  #pragma omp parallel
+  {
+    int j;
+    #pragma omp for schedule(dynamic,1) reduction(+:sum)
+    for (j = 1; j <= LOOPCOUNT; j++) {
+      sum = sum + j;
+    }
+  }
+  if (known_sum != sum) {
+    result++;
+    fprintf (stderr, "Error in sum with integers: Result was %d"
+      " instead of %d.\n", sum, known_sum);
+  }
+
+  /**** Testing integer subtraction ****/
+  diff = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+  #pragma omp parallel
+  {
+    int j;
+    #pragma omp for schedule(dynamic,1) reduction(-:diff)
+    for (j = 1; j <= LOOPCOUNT; j++) {
+      diff = diff - j;
+    }
+  }
+  if (diff != 0) {
+    result++;
+    fprintf (stderr, "Error in difference with integers: Result was %d"
+      " instead of 0.\n", diff);
+  }
+
+  /**** Testing integer multiplication ****/
+  #pragma omp parallel
+  {
+    int j;
+    #pragma omp for schedule(dynamic,1) reduction(*:product)
+    for (j = 1; j <= MAX_FACTOR; j++) {
+      product *= j;
+    }
+  }
+  known_product = KNOWN_PRODUCT;
+  if(known_product != product) {
+    result++;
+    fprintf (stderr,"Error in Product with integers: Result was %d"
+      " instead of %d\n",product,known_product);
+  }
+
+  /************************************************************************/
+  /** Tests for doubles                          **/
+  /************************************************************************/
+
+  /**** Testing double addition ****/
+  dsum = 0.;
+  dpt = 1.;
+  for (i = 0; i < DOUBLE_DIGITS; ++i) {
+    dpt *= dt;
+  }
+  dknown_sum = (1 - dpt) / (1 - dt); /* closed form of the geometric sum */
+  #pragma omp parallel
+  {
+    int j;
+    #pragma omp for schedule(dynamic,1) reduction(+:dsum)
+    for (j = 0; j < DOUBLE_DIGITS; j++) {
+      dsum += pow (dt, j);
+    }
+  }
+  if (fabs (dsum - dknown_sum) > rounding_error) {
+    result++;
+    fprintf (stderr, "\nError in sum with doubles: Result was %f"
+      " instead of: %f (Difference: %E)\n",
+      dsum, dknown_sum, dsum-dknown_sum);
+  }
+
+  /**** Testing double subtraction ****/
+  ddiff = (1 - dpt) / (1 - dt);
+  #pragma omp parallel
+  {
+    int j;
+    #pragma omp for schedule(dynamic,1) reduction(-:ddiff)
+    for (j = 0; j < DOUBLE_DIGITS; ++j) {
+      ddiff -= pow (dt, j);
+    }
+  }
+  if (fabs (ddiff) > rounding_error) {
+    result++;
+    fprintf (stderr, "Error in Difference with doubles: Result was %E"
+      " instead of 0.0\n", ddiff);
+  }
+
+
+  /************************************************************************/
+  /** Tests for logical values                       **/
+  /************************************************************************/
+
+  /**** Testing logic and ****/
+  for (i = 0; i < LOOPCOUNT; i++) {
+    logics[i] = 1;
+  }
+
+  #pragma omp parallel
+  {
+    int j;
+    #pragma omp for schedule(dynamic,1) reduction(&&:logic_and)
+    for (j = 0; j < LOOPCOUNT; ++j) {
+      logic_and = (logic_and && logics[j]);
+    }
+  }
+  if(!logic_and) {
+    result++;
+    fprintf (stderr, "Error in logic AND part 1\n");
+  }
+
+  logic_and = 1;
+  logics[LOOPCOUNT / 2] = 0;
+
+  #pragma omp parallel
+  {
+    int j;
+    #pragma omp for schedule(dynamic,1) reduction(&&:logic_and)
+    for (j = 0; j < LOOPCOUNT; ++j) {
+      logic_and = logic_and && logics[j];
+    }
+  }
+  if(logic_and) {
+    result++;
+    fprintf (stderr, "Error in logic AND part 2\n");
+  }
+
+  /**** Testing logic or ****/
+  for (i = 0; i < LOOPCOUNT; i++) {
+    logics[i] = 0;
+  }
+
+  #pragma omp parallel
+  {
+    int j;
+    #pragma omp for schedule(dynamic,1) reduction(||:logic_or)
+    for (j = 0; j < LOOPCOUNT; ++j) {
+      logic_or = logic_or || logics[j];
+    }
+  }
+  if (logic_or) {
+    result++;
+    fprintf (stderr, "Error in logic OR part 1\n");
+  }
+
+  logic_or = 0;
+  logics[LOOPCOUNT / 2] = 1;
+
+  #pragma omp parallel
+  {
+    int j;
+    #pragma omp for schedule(dynamic,1) reduction(||:logic_or)
+    for (j = 0; j < LOOPCOUNT; ++j) {
+      logic_or = logic_or || logics[j];
+    }
+  }
+  if(!logic_or) {
+    result++;
+    fprintf (stderr, "Error in logic OR part 2\n");
+  }
+
+  /************************************************************************/
+  /** Tests for bit values                         **/
+  /************************************************************************/
+
+  /**** Testing bit and ****/
+  for (i = 0; i < LOOPCOUNT; ++i) {
+    logics[i] = 1;
+  }
+
+  #pragma omp parallel
+  {
+    int j;
+    #pragma omp for schedule(dynamic,1) reduction(&:bit_and)
+    for (j = 0; j < LOOPCOUNT; ++j) {
+      bit_and = (bit_and & logics[j]);
+    }
+  }
+  if (!bit_and) {
+    result++;
+    fprintf (stderr, "Error in BIT AND part 1\n");
+  }
+
+  bit_and = 1;
+  logics[LOOPCOUNT / 2] = 0;
+
+  #pragma omp parallel
+  {
+    int j;
+    #pragma omp for schedule(dynamic,1) reduction(&:bit_and)
+    for (j = 0; j < LOOPCOUNT; ++j) {
+      bit_and = bit_and & logics[j];
+    }
+  }
+  if (bit_and) {
+    result++;
+    fprintf (stderr, "Error in BIT AND part 2\n");
+  }
+
+  /**** Testing bit or ****/
+  for (i = 0; i < LOOPCOUNT; i++) {
+    logics[i] = 0;
+  }
+
+  #pragma omp parallel
+  {
+    int j;
+    #pragma omp for schedule(dynamic,1) reduction(|:bit_or)
+    for (j = 0; j < LOOPCOUNT; ++j) {
+      bit_or = bit_or | logics[j];
+    }
+  }
+  if (bit_or) {
+    result++;
+    fprintf (stderr, "Error in BIT OR part 1\n");
+  }
+
+  bit_or = 0;
+  logics[LOOPCOUNT / 2] = 1;
+
+  #pragma omp parallel
+  {
+    int j;
+    #pragma omp for schedule(dynamic,1) reduction(|:bit_or)
+    for (j = 0; j < LOOPCOUNT; ++j) {
+      bit_or = bit_or | logics[j];
+    }
+  }
+  if (!bit_or) {
+    result++;
+    fprintf (stderr, "Error in BIT OR part 2\n");
+  }
+
+  /**** Testing exclusive bit or ****/
+  for (i = 0; i < LOOPCOUNT; i++) {
+    logics[i] = 0;
+  }
+
+  #pragma omp parallel
+  {
+    int j;
+    #pragma omp for schedule(dynamic,1) reduction(^:exclusiv_bit_or)
+    for (j = 0; j < LOOPCOUNT; ++j) {
+      exclusiv_bit_or = exclusiv_bit_or ^ logics[j];
+    }
+  }
+  if (exclusiv_bit_or) {
+    result++;
+    fprintf (stderr, "Error in EXCLUSIV BIT OR part 1\n");
+  }
+
+  exclusiv_bit_or = 0;
+  logics[LOOPCOUNT / 2] = 1;
+
+  #pragma omp parallel
+  {
+    int j;
+    #pragma omp for schedule(dynamic,1) reduction(^:exclusiv_bit_or)
+    for (j = 0; j < LOOPCOUNT; ++j) {
+      exclusiv_bit_or = exclusiv_bit_or ^ logics[j];
+    }
+  }
+  if (!exclusiv_bit_or) {
+    result++;
+    fprintf (stderr, "Error in EXCLUSIV BIT OR part 2\n");
+  }
+
+  /* logics aliases the stack array logicsArray, so there is nothing to free */
+  return (result == 0);
+}
+
+/* Repeats the test REPETITIONS times; exit status = number of failures. */
+int main()
+{
+  int failures = 0;
+  int rep = 0;
+
+  while (rep < REPETITIONS) {
+    failures += test_omp_for_reduction() ? 0 : 1;
+    rep++;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/for/omp_for_schedule_auto.c b/final/runtime/test/worksharing/for/omp_for_schedule_auto.c
new file mode 100644
index 0000000..075617c
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_schedule_auto.c
@@ -0,0 +1,69 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+int sum1;
+#pragma omp threadprivate(sum1)
+
+int test_omp_for_auto() // returns 1 when sum equals known_sum
+{
+  int j;
+  int sum;        // total of all threads' final sum1 values
+  int sum0;       // firstprivate source; each private copy starts at 12345
+  int known_sum;
+  int threadsnum; // number of threads that ran at least one iteration
+
+  sum = 0;
+  sum0 = 12345;
+
+  // array which keeps track of which threads participated in the for loop
+  // e.g., given 4 threads, [ 0 | 1 | 1 | 0 ] implies
+  //       threads 0 and 3 did not, threads 1 and 2 did
+  int max_threads = omp_get_max_threads();
+  int* active_threads = (int*)malloc(sizeof(int)*max_threads);
+  for(j = 0; j < max_threads; j++)
+    active_threads[j] = 0;
+
+  #pragma omp parallel
+  {
+    int i;
+    sum1 = 0; // threadprivate: idle threads therefore contribute 0 to sum
+    #pragma omp for firstprivate(sum0) schedule(auto)
+    for (i = 1; i <= LOOPCOUNT; i++) {
+      active_threads[omp_get_thread_num()] = 1;
+      sum0 = sum0 + i;
+      sum1 = sum0;
+    }
+
+    #pragma omp critical
+    {
+      sum = sum + sum1;
+    }
+  }
+
+  // count the threads that participated (sum is stored in threadsnum)
+  threadsnum=0;
+  for(j = 0; j < max_threads; j++) {
+    if(active_threads[j])
+      threadsnum++;
+  }
+  free(active_threads);
+
+  // each participating thread adds 12345 once, plus its share of 1..LOOPCOUNT
+  known_sum = 12345 * threadsnum + (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+  return (known_sum == sum);
+}
+
+// Repeats the test REPETITIONS times; exit status = number of failures.
+int main()
+{
+  int failures = 0;
+  int rep = 0;
+
+  while (rep < REPETITIONS) {
+    failures += test_omp_for_auto() ? 0 : 1;
+    rep++;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/for/omp_for_schedule_dynamic.c b/final/runtime/test/worksharing/for/omp_for_schedule_dynamic.c
new file mode 100644
index 0000000..6d4f59b
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_schedule_dynamic.c
@@ -0,0 +1,89 @@
+// RUN: %libomp-compile-and-run
+/*
+ * Test for dynamic scheduling with chunk size
+ * Method: calculate how many times the iteration space is dispatched
+ *     and judge if each dispatch has the requested chunk size
+ *     unless it is the last one.
+ * It is possible for two adjacent chunks to be assigned to the same thread
+ * Modified by Chunhua Liao
+ */
+#include <stdio.h>
+#include <omp.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+
+#define CFDMAX_SIZE 100
+const int chunk_size = 7;
+
+int test_omp_for_schedule_dynamic() /* returns 1 on success, 0 on failure */
+{
+  int tid;
+  int *tids; /* tids[i] = thread that executed iteration i */
+  int i;
+  int tidsArray[CFDMAX_SIZE];
+  int count = 0; /* number of thread changes between adjacent iterations */
+  int tmp_count = 0; /*dispatch times*/
+  int *tmp;  /*store chunk size for each dispatch*/
+  int result = 0; /* number of dispatches with an unexpected size */
+
+  tids = tidsArray;
+
+  #pragma omp parallel private(tid) shared(tids)
+  {        /* begin of parallel */
+    int tid;
+    tid = omp_get_thread_num ();
+    #pragma omp for schedule(dynamic,chunk_size)
+    for (i = 0; i < CFDMAX_SIZE; i++) {
+      tids[i] = tid;
+    }
+  }
+
+  for (i = 0; i < CFDMAX_SIZE - 1; ++i) {
+    if (tids[i] != tids[i + 1]) {
+      count++;
+    }
+  }
+
+  tmp = (int *) malloc (sizeof (int) * (count + 1));
+  if (tmp == NULL) return 0;  /* out of memory: report this run as failed */
+  tmp[0] = 1;
+
+  for (i = 0; i < CFDMAX_SIZE - 1; ++i) {
+    if (tmp_count > count) {
+      printf ("--------------------\nTestinternal Error: List too small!!!\n--------------------\n");  /* Error handling */
+      break;
+    }
+    if (tids[i] != tids[i + 1]) {
+      tmp_count++;
+      tmp[tmp_count] = 1;
+    } else {
+      tmp[tmp_count]++;
+    }
+  }
+  /* is dynamic statement working? */
+  for (i = 0; i < count; i++) {
+    if ((tmp[i]%chunk_size)!=0) {
+      /* two adjacent chunks may go to the same thread, hence the modulo */
+      result++;
+      fprintf(stderr,"The intermediate dispatch has wrong chunksize.\n");
+    }
+  }
+  if ((tmp[count]%chunk_size)!=(CFDMAX_SIZE%chunk_size)) {
+    result++;
+    fprintf(stderr,"the last dispatch has wrong chunksize.\n");
+  }
+  free (tmp);  /* allocated once per call; release it before returning */
+  return (result==0);
+}
+/* Repeats the test REPETITIONS times; exit status = number of failures. */
+int main()
+{
+  int failures = 0;
+  int rep = 0;
+
+  while (rep < REPETITIONS) {
+    failures += test_omp_for_schedule_dynamic() ? 0 : 1;
+    rep++;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/for/omp_for_schedule_guided.c b/final/runtime/test/worksharing/for/omp_for_schedule_guided.c
new file mode 100644
index 0000000..1ee7449
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_schedule_guided.c
@@ -0,0 +1,217 @@
+// RUN: %libomp-compile-and-run
+
+/* Test for guided scheduling
+ * Ensure threads get chunks in an interleaved fashion first
+ * Then check that the chunk sizes decrease to a stable value
+ * Modified by Chunhua Liao
+ * For example, 100 iteration on 2 threads, chunksize 7
+ * one line for each dispatch, 0/1 means thread id
+ * 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0  24
+ * 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1        18
+ * 0 0 0 0 0 0 0 0 0 0 0 0 0 0            14
+ * 1 1 1 1 1 1 1 1 1 1                10
+ * 0 0 0 0 0 0 0 0                   8
+ * 1 1 1 1 1 1 1                   7
+ * 0 0 0 0 0 0 0                   7
+ * 1 1 1 1 1 1 1                   7
+ * 0 0 0 0 0                     5
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+#define CFSMAX_SIZE 1000
+#define MAX_TIME  0.005
+
+#ifdef SLEEPTIME
+#undef SLEEPTIME
+#define SLEEPTIME 0.0001
+#endif
+
+int test_omp_for_schedule_guided() /* returns 1 on success, 0 on failure */
+{
+  int * tids;        /* tids[j] = thread that executed iteration j */
+  int * chunksizes;  /* size of each observed chunk, in iteration order */
+  int notout;        /* cleared by the first thread to leave the loop */
+  int maxiter;       /* highest iteration index any thread has started */
+  int threads;
+  int i;
+  int result;
+
+  tids = (int *) malloc (sizeof (int) * (CFSMAX_SIZE + 1));
+  maxiter = 0;
+  result = 1;
+  notout = 1;
+
+  /* Testing if enough threads are available for this check. */
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      threads = omp_get_num_threads();
+    }
+  }
+
+  /* ensure there are at least two threads */
+  if (threads < 2) {
+    omp_set_num_threads(2);
+    threads = 2;
+  }
+
+  /* Now the real parallel work:
+   * Each thread will start immediately with the first chunk.
+   */
+  #pragma omp parallel shared(tids,maxiter)
+  {  /* begin of parallel */
+    double count;
+    int tid;
+    int j;
+
+    tid = omp_get_thread_num ();
+
+    #pragma omp for nowait schedule(guided)
+    for(j = 0; j < CFSMAX_SIZE; ++j) {
+      count = 0.;
+      #pragma omp flush(maxiter)
+      if (j > maxiter) {
+        #pragma omp critical
+        {
+          maxiter = j;
+        }
+      }
+      /* stall (at most MAX_TIME) while j is still the highest started index */
+      #pragma omp flush(maxiter,notout)
+      while (notout && (count < MAX_TIME) && (maxiter == j)) {
+        #pragma omp flush(maxiter,notout)
+        my_sleep (SLEEPTIME);
+        count += SLEEPTIME;
+#ifdef VERBOSE
+        printf(".");
+#endif
+      }
+#ifdef VERBOSE
+      if (count > 0.) printf(" waited %lf s\n", count);
+#endif
+      /* record which thread executed iteration j */
+      tids[j] = tid;
+#ifdef VERBOSE
+      printf("%d finished by %d\n",j,tid);
+#endif
+    } /* end of for */
+    notout = 0;
+    #pragma omp flush(maxiter,notout)
+  } /* end of parallel */
+
+  /*******************************************************
+   * evaluation of the values              *
+   *******************************************************/
+  {
+    int determined_chunksize = 1;
+    int last_threadnr = tids[0];
+    int global_chunknr = 0;
+    int openwork = CFSMAX_SIZE;
+    int expected_chunk_size;
+    int* local_chunknr = (int*)malloc(threads * sizeof(int));
+    double c = 1;
+
+    for (i = 0; i < threads; i++)
+      local_chunknr[i] = 0;
+
+    tids[CFSMAX_SIZE] = -1; /* sentinel that closes the last chunk */
+
+    /*
+     * determine the number of global chunks
+     */
+    for(i = 1; i <= CFSMAX_SIZE; ++i) {
+      if (last_threadnr==tids[i]) {
+        determined_chunksize++;
+      } else {
+        global_chunknr++;
+        local_chunknr[last_threadnr]++;
+        last_threadnr = tids[i];
+        determined_chunksize = 1;
+      }
+    }
+    /* now allocate the memory for saving the sizes of the global chunks */
+    chunksizes = (int*)malloc(global_chunknr * sizeof(int));
+
+    /*
+    * Evaluate the sizes of the global chunks
+    */
+    global_chunknr = 0;
+    determined_chunksize = 1;
+    last_threadnr = tids[0];
+    for (i = 1; i <= CFSMAX_SIZE; ++i) {
+      /* If the threadnumber was the same as before increase the
+       * detected chunksize for this chunk otherwise set the detected
+       * chunksize again to one and save the number of the next
+       * thread in last_threadnr.
+       */
+      if (last_threadnr == tids[i]) {
+        determined_chunksize++;
+      } else {
+        chunksizes[global_chunknr] = determined_chunksize;
+        global_chunknr++;
+        local_chunknr[last_threadnr]++;
+        last_threadnr = tids[i];
+        determined_chunksize = 1;
+      }
+    }
+
+#ifdef VERBOSE
+    fprintf(stderr, "found\texpected\tconstant\n");
+#endif
+
+    /* identify the constant c for the exponential
+       decrease of the chunksize */
+    expected_chunk_size = openwork / threads;
+    c = (double) chunksizes[0] / expected_chunk_size;
+
+    for (i = 0; i < global_chunknr; i++) {
+      /* calculate the new expected chunksize */
+      if (expected_chunk_size > 1)
+        expected_chunk_size = c * openwork / threads;
+#ifdef VERBOSE
+      fprintf(stderr, "%8d\t%8d\t%lf\n", chunksizes[i],
+        expected_chunk_size, c * chunksizes[i]/expected_chunk_size);
+#endif
+      /* check if chunksize is inside the rounding errors */
+      if (abs (chunksizes[i] - expected_chunk_size) >= 2) {
+        result = 0;
+#ifndef VERBOSE
+        fprintf(stderr, "Chunksize differed from expected "
+          "value: %d instead of %d\n", chunksizes[i],
+          expected_chunk_size);
+        break; /* stop at the first mismatch; the buffers are freed below */
+#endif
+      } /* end if */
+
+#ifndef VERBOSE
+      if (expected_chunk_size - chunksizes[i] < 0)
+        fprintf(stderr, "Chunksize did not decrease: %d"
+          " instead of %d\n", chunksizes[i],expected_chunk_size);
+#endif
+
+      /* calculating the remaining amount of work */
+      openwork -= chunksizes[i];
+    }
+    free(chunksizes);
+    free(local_chunknr);
+  }
+  free(tids);
+  return result;
+}
+
+/* Repeats the test REPETITIONS times; exit status = number of failures. */
+int main()
+{
+  int failures = 0;
+  int rep = 0;
+
+  while (rep < REPETITIONS) {
+    failures += test_omp_for_schedule_guided() ? 0 : 1;
+    rep++;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/for/omp_for_schedule_runtime.c b/final/runtime/test/worksharing/for/omp_for_schedule_runtime.c
new file mode 100644
index 0000000..b957fc3
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_schedule_runtime.c
@@ -0,0 +1,82 @@
+// RUN: %libomp-compile
+// RUN: env OMP_SCHEDULE=static %libomp-run 1 0
+// RUN: env OMP_SCHEDULE=static,10 %libomp-run 1 10
+// RUN: env OMP_SCHEDULE=dynamic %libomp-run 2 1
+// RUN: env OMP_SCHEDULE=dynamic,11 %libomp-run 2 11
+// RUN: env OMP_SCHEDULE=guided %libomp-run 3 1
+// RUN: env OMP_SCHEDULE=guided,12 %libomp-run 3 12
+// RUN: env OMP_SCHEDULE=auto %libomp-run 4 1
+// RUN: env OMP_SCHEDULE=trapezoidal %libomp-run 101 1
+// RUN: env OMP_SCHEDULE=trapezoidal,13 %libomp-run 101 13
+// RUN: env OMP_SCHEDULE=static_steal %libomp-run 102 1
+// RUN: env OMP_SCHEDULE=static_steal,14 %libomp-run 102 14
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+int sum;
+char* correct_kind_string;
+omp_sched_t correct_kind;
+int correct_chunk_size;
+
+int test_omp_for_runtime()
+{
+  /* Closed form of 1 + 2 + ... + LOOPCOUNT, the value the loop must reach. */
+  const int expected_total = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+  omp_sched_t sched_kind;
+  int sched_chunk;
+  int total = 0;
+  int failed = 0;
+
+  /* Query the schedule the runtime currently reports and compare it with
+   * the expected kind/chunk held in correct_kind and correct_chunk_size. */
+  omp_get_schedule(&sched_kind, &sched_chunk);
+  printf("omp_get_schedule() returns: Schedule = %d, Chunk Size = %d\n",
+         sched_kind, sched_chunk);
+  if (sched_kind != correct_kind) {
+    printf("kind(%d) != correct_kind(%d)\n", sched_kind, correct_kind);
+    failed = 1;
+  }
+  if (sched_chunk != correct_chunk_size) {
+    printf("chunk_size(%d) != correct_chunk_size(%d)\n", sched_chunk,
+           correct_chunk_size);
+    failed = 1;
+  }
+
+  /* Sum 1..LOOPCOUNT under schedule(runtime); critical guards the total. */
+  #pragma omp parallel
+  {
+    int idx;
+    #pragma omp for schedule(runtime)
+    for (idx = 1; idx <= LOOPCOUNT; ++idx) {
+      #pragma omp critical
+      total = total + idx;
+    }
+  }
+  if (total != expected_total) {
+    printf("Known Sum = %d, Calculated Sum = %d\n", expected_total, total);
+    failed = 1;
+  }
+  return failed ? 0 : 1;
+}
+
+int main(int argc, char** argv)
+{
+  /* argv[1]/argv[2]: numeric schedule kind and chunk size that
+   * omp_get_schedule() is expected to report for the current OMP_SCHEDULE. */
+  int i, num_failed = 0;
+  if (argc != 3) {
+    fprintf(stderr, "usage: %s schedule_kind chunk_size\n", argv[0]);
+    fprintf(stderr,
+            "  Run with environment variable OMP_SCHEDULE=kind[,chunk_size]\n");
+    return 1;
+  }
+  correct_kind = (omp_sched_t)atoi(argv[1]);
+  correct_chunk_size = atoi(argv[2]);
+  /* The exit status is the number of failed repetitions (0 == success). */
+  for (i = 0; i < REPETITIONS; i++)
+    if (!test_omp_for_runtime())
+      num_failed++;
+  return num_failed;
+}
diff --git a/final/runtime/test/worksharing/for/omp_for_schedule_static.c b/final/runtime/test/worksharing/for/omp_for_schedule_static.c
new file mode 100644
index 0000000..f46a544
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_schedule_static.c
@@ -0,0 +1,154 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+#define CFSMAX_SIZE 1000
+#define MAX_TIME 0.01
+
+#ifdef SLEEPTIME
+#undef SLEEPTIME
+#define SLEEPTIME 0.0005
+#endif
+
+int test_omp_for_schedule_static()
+{
+  /* Records which thread ran each iteration of a schedule(static,chunk_size)
+   * loop, then checks chunk lengths and the round-robin order of the owners. */
+  int threads;
+  int i,lasttid;
+  int * tids;
+  int notout, maxiter, chunk_size;
+  int counter = 0;
+  int tmp_count=1;
+  int result = 1;
+
+  chunk_size = 7;
+  tids = (int *) malloc (sizeof (int) * (CFSMAX_SIZE + 1));
+  notout = 1;
+  maxiter = 0;
+
+  #pragma omp parallel shared(tids,counter)
+  {  /* begin of parallel*/
+    #pragma omp single
+    {
+      threads = omp_get_num_threads ();
+    }  /* end of single */
+  }  /* end of parallel */
+
+  if (threads < 2) {
+    omp_set_num_threads(2);
+    threads = 2;
+  }
+  fprintf (stderr,"Using an internal count of %d\nUsing a specified"
+    " chunksize of %d\n", CFSMAX_SIZE, chunk_size);
+  tids[CFSMAX_SIZE] = -1;  /* setting endflag */
+  #pragma omp parallel shared(tids)
+  { /* begin of parallel */
+    double count;
+    int tid;
+    int j;
+
+    tid = omp_get_thread_num ();
+
+    #pragma omp for nowait schedule(static,chunk_size)
+    for(j = 0; j < CFSMAX_SIZE; ++j) {
+      count = 0.;
+      #pragma omp flush(maxiter)
+      if (j > maxiter) {
+        #pragma omp critical
+        {
+          maxiter = j;
+        }
+      }
+      /*printf ("thread %d sleeping\n", tid);*/
+      while (notout && (count < MAX_TIME) && (maxiter == j)) {
+        #pragma omp flush(maxiter,notout)
+        my_sleep (SLEEPTIME);
+        count += SLEEPTIME;
+        printf(".");
+      }
+#ifdef VERBOSE
+      if (count > 0.) printf(" waited %lf s\n", count);
+#endif
+      /*printf ("thread %d awake\n", tid);*/
+      tids[j] = tid;
+#ifdef VERBOSE
+      printf("%d finished by %d\n",j,tid);
+#endif
+    } /* end of for */
+    notout = 0;
+    #pragma omp flush(maxiter,notout)
+  } /* end of parallel */
+
+  /**** analysing the data in array tids ****/
+
+  lasttid = tids[0];
+  tmp_count = 0;
+
+  for (i = 0; i < CFSMAX_SIZE + 1; ++i) {
+    /* If the work  was done by the same thread increase tmp_count by one. */
+    if (tids[i] == lasttid) {
+      tmp_count++;
+#ifdef VERBOSE
+      fprintf (stderr, "%d: %d \n", i, tids[i]);
+#endif
+      continue;
+    }
+
+    /* Check if the next thread has the right thread number. When finding
+     * threadnumber -1 the end should be reached.
+     */
+    if (tids[i] == (lasttid + 1) % threads || tids[i] == -1) {
+      /* checking for the right chunk size */
+      if (tmp_count == chunk_size) {
+        tmp_count = 1;
+        lasttid = tids[i];
+#ifdef VERBOSE
+        fprintf (stderr, "OK\n");
+#endif
+      } else {
+        /* If the chunk size was wrong, check if the end was reached */
+        if (tids[i] == -1) {
+          if (i == CFSMAX_SIZE) {
+            fprintf (stderr, "Last thread had chunk size %d\n",
+              tmp_count);
+            break;
+          } else {
+            fprintf (stderr, "ERROR: Last thread (thread with"
+              " number -1) was found before the end.\n");
+            result = 0;
+          }
+        } else {
+          fprintf (stderr, "ERROR: chunk size was %d. (assigned"
+            " was %d)\n", tmp_count, chunk_size);
+          result = 0;
+        }
+      }
+    } else {
+      fprintf(stderr, "ERROR: Found thread with number %d (should be"
+        " inbetween 0 and %d).", tids[i], threads - 1);
+      result = 0;
+    }
+#ifdef VERBOSE
+    fprintf (stderr, "%d: %d \n", i, tids[i]);
+#endif
+  }
+
+  free (tids);  /* allocated on every call; main() calls this repeatedly */
+  return result;
+}
+
+int main()
+{
+  int failures = 0;
+  int rep;
+
+  /* Run the test REPETITIONS times; each failing run adds one to the exit
+   * status, so a return value of 0 means every repetition passed. */
+  for (rep = 0; rep < REPETITIONS; ++rep)
+    if (test_omp_for_schedule_static() == 0)
+      ++failures;
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/for/omp_for_schedule_static_3.c b/final/runtime/test/worksharing/for/omp_for_schedule_static_3.c
new file mode 100644
index 0000000..922f27a
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_for_schedule_static_3.c
@@ -0,0 +1,202 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+#include "omp_my_sleep.h"
+
+#define CFSMAX_SIZE 1000
+#define MAX_TIME 0.01
+
+#ifdef SLEEPTIME
+#undef SLEEPTIME
+#define SLEEPTIME 0.0005
+#endif
+
+#define VERBOSE 0
+
+int test_omp_for_schedule_static_3()
+{ /* checks the static chunk layout, then that two static loops agree */
+  int threads;
+  int i,lasttid;
+  /* tids/tids2 record which thread executed each loop iteration. */
+  int * tids;
+  int * tids2;
+  int notout;
+  int maxiter;
+  int chunk_size;
+
+  int counter = 0;
+  int tmp_count=1;
+  int lastthreadsstarttid = -1;
+  int result = 1;
+  chunk_size = 7;
+
+  tids = (int *) malloc (sizeof (int) * (CFSMAX_SIZE + 1));
+  notout = 1;
+  maxiter = 0;
+
+  #pragma omp parallel shared(tids,counter)
+  {  /* begin of parallel*/
+    #pragma omp single
+    {
+      threads = omp_get_num_threads ();
+    }  /* end of single */
+  }  /* end of parallel */
+
+  /* Ensure that at least two threads are created */
+  if (threads < 2) {
+    omp_set_num_threads(2);
+    threads = 2;
+  }
+  fprintf (stderr,"Using an internal count of %d\nUsing a"
+    " specified chunksize of %d\n", CFSMAX_SIZE, chunk_size);
+  tids[CFSMAX_SIZE] = -1;  /* setting endflag */
+
+  #pragma omp parallel shared(tids)
+  {  /* begin of parallel */
+    double count;
+    int tid;
+    int j;
+
+    tid = omp_get_thread_num ();
+    /* Iterations stall (at most MAX_TIME) while they are the newest started. */
+    #pragma omp for nowait schedule(static,chunk_size)
+    for(j = 0; j < CFSMAX_SIZE; ++j) {
+      count = 0.;
+      #pragma omp flush(maxiter)
+      if (j > maxiter) {
+        #pragma omp critical
+        {
+          maxiter = j;
+        }
+      }
+      /*printf ("thread %d sleeping\n", tid);*/
+      while (notout && (count < MAX_TIME) && (maxiter == j)) {
+        #pragma omp flush(maxiter,notout)
+        my_sleep (SLEEPTIME);
+        count += SLEEPTIME;
+        printf(".");
+      }
+#ifdef VERBOSE
+      if (count > 0.) printf(" waited %lf s\n", count);
+#endif
+      /*printf ("thread %d awake\n", tid);*/
+      tids[j] = tid;
+#ifdef VERBOSE
+      printf("%d finished by %d\n",j,tid);
+#endif
+    } /* end of omp for */
+
+    notout = 0;  /* the first thread to leave the loop releases the waiters */
+    #pragma omp flush(maxiter,notout)
+  } /* end of parallel */
+
+  /**** analysing the data in array tids ****/
+
+  lasttid = tids[0];
+  tmp_count = 0;  /* length of the run of iterations owned by lasttid */
+
+  for (i = 0; i < CFSMAX_SIZE + 1; ++i) {
+    /* If the work  was done by the same thread
+       increase tmp_count by one. */
+    if (tids[i] == lasttid) {
+      tmp_count++;
+#ifdef VERBOSE
+      fprintf (stderr, "%d: %d \n", i, tids[i]);
+#endif
+      continue;
+    }
+
+    /* Check if the next thread has the right thread number.
+     * When finding threadnumber -1 the end should be reached.
+     */
+    if (tids[i] == (lasttid + 1) % threads || tids[i] == -1) {
+      /* checking for the right chunk size */
+      if (tmp_count == chunk_size) {
+        tmp_count = 1;
+        lasttid = tids[i];
+#ifdef VERBOSE
+        fprintf (stderr, "OK\n");
+#endif
+      } else {
+        /* If the chunk size was wrong, check if the end was reached */
+        if (tids[i] == -1) {
+          if (i == CFSMAX_SIZE) {
+            fprintf (stderr, "Last thread had chunk size %d\n",
+              tmp_count);
+            break;
+          } else {
+            fprintf (stderr, "ERROR: Last thread (thread with"
+              " number -1) was found before the end.\n");
+            result = 0;
+          }
+        } else {
+          fprintf (stderr, "ERROR: chunk size was %d. (assigned"
+            " was %d)\n", tmp_count, chunk_size);
+          result = 0;
+        }
+      }
+    } else {
+      fprintf(stderr, "ERROR: Found thread with number %d (should be"
+        " inbetween 0 and %d).", tids[i], threads - 1);
+      result = 0;
+    }
+#ifdef VERBOSE
+    fprintf (stderr, "%d: %d \n", i, tids[i]);
+#endif
+  }
+
+  /* Now we check if several loop regions in one parallel region have the
+   * same logical assignment of chunks to threads. We use the nowait
+   * clause to increase the probability to get an error. */
+
+  /* First we allocate some more memory */
+  free (tids);
+  tids = (int *) malloc (sizeof (int) * LOOPCOUNT);
+  tids2 = (int *) malloc (sizeof (int) * LOOPCOUNT);
+
+  #pragma omp parallel
+  {
+    {
+      int n;
+      #pragma omp for schedule(static) nowait
+      for (n = 0; n < LOOPCOUNT; n++) {
+        if (LOOPCOUNT == n + 1 )
+          my_sleep(SLEEPTIME);  /* delay only the very last iteration */
+
+        tids[n] = omp_get_thread_num();
+      }
+    }
+    {
+      int m;
+      #pragma omp for schedule(static) nowait
+      for (m = 1; m <= LOOPCOUNT; m++) {
+        tids2[m-1] = omp_get_thread_num();
+      }
+    }
+  }
+  /* Iteration i must have been run by the same thread in both loops. */
+  for (i = 0; i < LOOPCOUNT; i++)
+  if (tids[i] != tids2[i]) {
+    fprintf (stderr, "Chunk no. %d was assigned once to thread %d and"
+      " later to thread %d.\n", i, tids[i],tids2[i]);
+    result = 0;
+  }
+
+  free (tids);
+  free (tids2);
+  return result;
+}
+
+int main()
+{
+  int failures = 0;
+  int rep;
+
+  /* Run the test REPETITIONS times; each failing run adds one to the exit
+   * status, so a return value of 0 means every repetition passed. */
+  for (rep = 0; rep < REPETITIONS; ++rep)
+    if (test_omp_for_schedule_static_3() == 0)
+      ++failures;
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/for/omp_monotonic_env.c b/final/runtime/test/worksharing/for/omp_monotonic_env.c
new file mode 100644
index 0000000..c8cfd2a
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_monotonic_env.c
@@ -0,0 +1,86 @@
+// RUN: %libomp-compile
+// RUN: env OMP_SCHEDULE=monotonic:dynamic,50 %libomp-run monotonic dynamic 50
+// RUN: env OMP_SCHEDULE=monotonic:guided,51 %libomp-run monotonic guided 51
+// RUN: env OMP_SCHEDULE=monotonic:static,52 %libomp-run monotonic static 52
+// RUN: env OMP_SCHEDULE=nonmonotonic:dynamic,53 %libomp-run nonmonotonic dynamic 53
+// RUN: env OMP_SCHEDULE=nonmonotonic:guided,54 %libomp-run nonmonotonic guided 54
+
+// The test checks OMP 5.0 monotonic/nonmonotonic OMP_SCHEDULE parsing
+// The nonmonotonic tests see if the parser accepts nonmonotonic, if the
+// parser doesn't then a static schedule is assumed
+
+#include <stdio.h>
+#include <string.h>
+#include <omp.h>
+
+int err = 0;
+
+omp_sched_t sched_without_modifiers(omp_sched_t sched) {
+  return (omp_sched_t)(~(int)omp_sched_monotonic & (int)sched); /* clear flag */
+}
+
+int sched_has_modifiers(omp_sched_t sched, omp_sched_t modifiers) {
+  return (int)modifiers & (int)sched; /* nonzero iff a requested bit is set */
+}
+
+// Count an error and report it unless (sched, chunk) equals
+// (hope_sched, hope_chunk); 'extra' labels the message.
+void check_schedule(const char *extra, const omp_sched_t sched, int chunk,
+                    omp_sched_t hope_sched, int hope_chunk) {
+  if (sched == hope_sched && chunk == hope_chunk)
+    return;
+  ++err;
+  printf("Error: %s: schedule: (%d, %d) is not equal to (%d, %d)\n", extra,
+         (int)hope_sched, hope_chunk, (int)sched, chunk);
+}
+
+omp_sched_t str2omp_sched(const char *str) {
+  if (strcmp(str, "dynamic") == 0)
+    return omp_sched_dynamic;
+  else if (strcmp(str, "static") == 0)
+    return omp_sched_static;
+  else if (strcmp(str, "guided") == 0)
+    return omp_sched_guided;
+  printf("Error: Unknown schedule type: %s\n", str);
+  exit(1); /* an unrecognized schedule name ends the program */
+}
+
+int is_monotonic(const char *str) { return strcmp(str, "monotonic") == 0; }
+
+int main(int argc, char **argv) {
+  /* argv[1..3]: monotonic|nonmonotonic, schedule name, expected chunk size */
+  int monotonic, chunk, ref_chunk;
+  omp_sched_t sched, ref_sched;
+
+  if (argc != 4) {
+    printf("Error: usage: <executable> monotonic|nonmonotonic <schedule> "
+           "<chunk-size>\n");
+    exit(1);
+  }
+  monotonic = is_monotonic(argv[1]);
+  ref_sched = str2omp_sched(argv[2]);
+  ref_chunk = atoi(argv[3]);
+  omp_get_schedule(&sched, &chunk);
+  /* Only the monotonic modifier is verified explicitly; a nonmonotonic
+   * request is checked through the base schedule kind and chunk alone. */
+  if (monotonic && !sched_has_modifiers(sched, omp_sched_monotonic)) {
+    printf("Error: sched (0x%x) does not have monotonic modifier\n",
+           (int)sched);
+    ++err;
+  }
+  sched = sched_without_modifiers(sched);
+  if (sched != ref_sched) {
+    printf("Error: sched (0x%x) is not 0x%x\n", (int)sched, (int)ref_sched);
+    ++err;
+  }
+  if (chunk != ref_chunk) {
+    printf("Error: chunk is not %d\n", ref_chunk);
+    ++err;
+  }
+  if (err > 0) {
+    printf("Failed\n");
+    return 1;
+  }
+  printf("Passed\n");
+  return 0;
+}
diff --git a/final/runtime/test/worksharing/for/omp_monotonic_schedule_set_get.c b/final/runtime/test/worksharing/for/omp_monotonic_schedule_set_get.c
new file mode 100644
index 0000000..94896eb
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_monotonic_schedule_set_get.c
@@ -0,0 +1,134 @@
+// RUN: %libomp-compile-and-run
+
+// The test checks OMP 5.0 monotonic/nonmonotonic scheduling API
+//   1. initial schedule should be (static,0)
+//   2. omp_get_schedule() should return the schedule set by omp_set_schedule()
+//   3. schedules set inside parallel should not impact outer tasks' schedules
+
+#include <stdio.h>
+#ifndef __INTEL_COMPILER
+#define _OMPIMP
+#endif
+
+#define NO_MODIFIERS ((omp_sched_t)0)
+
+#include "omp.h"
+
+int global = 0;
+int err = 0;
+
+omp_sched_t sched_append_modifiers(omp_sched_t sched, omp_sched_t modifiers) {
+  return (omp_sched_t)((int)modifiers | (int)sched); /* OR the flag bits in */
+}
+
+omp_sched_t sched_without_modifiers(omp_sched_t sched) {
+  return (omp_sched_t)(~(int)omp_sched_monotonic & (int)sched); /* clear flag */
+}
+
+int sched_has_modifiers(omp_sched_t sched, omp_sched_t modifiers) {
+  return ((int)sched & (int)modifiers) != 0; // sign-bit flag: '> 0' never held
+}
+
+// Report a mismatch between (sched, chunk) and (hope_sched, hope_chunk);
+// err is bumped atomically because main() also calls this from parallel code.
+void check_schedule(const char *extra, const omp_sched_t sched, int chunk,
+                    omp_sched_t hope_sched, int hope_chunk) {
+  if (sched == hope_sched && chunk == hope_chunk)
+    return;
+#pragma omp atomic
+  err += 1;
+  printf("Error: %s: schedule: (%d, %d) is not equal to (%d, %d)\n", extra,
+         (int)hope_sched, hope_chunk, (int)sched, chunk);
+}
+
+int main() {
+  int i;
+  int chunk;
+  omp_sched_t sched0;
+  // turn off dynamic thread adjustment and turn on nested parallelism
+  omp_set_dynamic(0);
+  omp_set_nested(1);
+
+  // check serial region
+  omp_get_schedule(&sched0, &chunk);
+#ifdef DEBUG
+  printf("initial: (%d, %d)\n", sched0, chunk);
+#endif
+  check_schedule("initial", omp_sched_static, 0, sched0, chunk);
+  // set schedule before the parallel, check it after the parallel
+  omp_set_schedule(
+      sched_append_modifiers(omp_sched_dynamic, omp_sched_monotonic), 3);
+
+#pragma omp parallel num_threads(3) private(i)
+  {
+    omp_sched_t n_outer_set, n_outer_get;
+    int c_outer;
+    int tid = omp_get_thread_num();
+
+    n_outer_set = sched_append_modifiers((omp_sched_t)(tid + 1),
+                                         omp_sched_monotonic); // 1, 2, 3
+
+    // check outer parallel region
+    // master sets (static, unchunked), others - (dynamic, 1), (guided, 2)
+    // set schedule before inner parallel, check it after the parallel
+    omp_set_schedule(n_outer_set, tid);
+
+// Make sure this schedule doesn't crash the runtime
+#pragma omp for
+    for (i = 0; i < 100; ++i) {
+#pragma omp atomic
+      global++;
+    }
+
+#pragma omp parallel num_threads(3) private(i) shared(n_outer_set)
+    {
+      omp_sched_t n_inner_set, n_inner_get;
+      int c_inner_set, c_inner_get;
+      int tid = omp_get_thread_num();
+      // strip the monotonic bit first: (int)n_outer_set * 10 would overflow int
+      n_inner_set = (omp_sched_t)(tid + 1); // 1, 2, 3
+      c_inner_set = (int)sched_without_modifiers(n_outer_set) * 10 +
+                    (int)n_inner_set; // 11, 12, 13, 21, 22, 23, 31, 32, 33
+      n_inner_set = sched_append_modifiers(n_inner_set, omp_sched_monotonic);
+      // schedules set inside parallel should not impact outer schedules
+      omp_set_schedule(n_inner_set, c_inner_set);
+
+// Make sure this schedule doesn't crash the runtime
+#pragma omp for
+      for (i = 0; i < 100; ++i) {
+#pragma omp atomic
+        global++;
+      }
+
+#pragma omp barrier
+      omp_get_schedule(&n_inner_get, &c_inner_get);
+#ifdef DEBUG
+      printf("inner parallel: o_th %d, i_th %d, (%d, %d)\n", n_outer_set - 1,
+             tid, n_inner_get, c_inner_get);
+#endif
+      check_schedule("inner", n_inner_set, c_inner_set, n_inner_get,
+                     c_inner_get);
+    }
+    // the outer thread must still see the schedule it set before the nesting
+    omp_get_schedule(&n_outer_get, &c_outer);
+#ifdef DEBUG
+    printf("outer parallel: thread %d, (%d, %d)\n", tid, n_outer_get, c_outer);
+#endif
+    check_schedule("outer", n_outer_set, tid, n_outer_get, c_outer);
+  }
+  // the serial schedule set before the parallel regions must be unchanged
+  omp_get_schedule(&sched0, &chunk);
+#ifdef DEBUG
+  printf("after parallels: (%d, %d)\n", sched0, chunk);
+#endif
+  check_schedule("after parallels",
+                 sched_append_modifiers(omp_sched_dynamic, omp_sched_monotonic),
+                 3, sched0, chunk);
+
+  if (err > 0) {
+    printf("Failed\n");
+    return 1;
+  }
+  printf("Passed\n");
+  return 0;
+}
diff --git a/final/runtime/test/worksharing/for/omp_parallel_for_firstprivate.c b/final/runtime/test/worksharing/for/omp_parallel_for_firstprivate.c
new file mode 100644
index 0000000..3b3bf7d
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_parallel_for_firstprivate.c
@@ -0,0 +1,35 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int test_omp_parallel_for_firstprivate()
+{
+  /* Each thread's firstprivate copy of 'offset' must start out as 3, so the
+   * reduction has to equal sum(1..LOOPCOUNT) + 3 * LOOPCOUNT. */
+  int offset = 3;
+  int total = 0;
+  int k;
+  int expected;
+
+  #pragma omp parallel for reduction(+:total) private(k) firstprivate(offset)
+  for (k = 1; k <= LOOPCOUNT; ++k)
+    total += k + offset;
+
+  expected = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2 + offset * LOOPCOUNT;
+  if (expected == total)
+    return 1;
+  return 0;
+} /* end of test_omp_parallel_for_firstprivate */
+
+int main()
+{
+  int failures = 0;
+  int rep;
+
+  /* Run the test REPETITIONS times; each failing run adds one to the exit
+   * status, so a return value of 0 means every repetition passed. */
+  for (rep = 0; rep < REPETITIONS; ++rep)
+    if (test_omp_parallel_for_firstprivate() == 0)
+      ++failures;
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/for/omp_parallel_for_if.c b/final/runtime/test/worksharing/for/omp_parallel_for_if.c
new file mode 100644
index 0000000..57fe498
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_parallel_for_if.c
@@ -0,0 +1,42 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+int test_omp_parallel_for_if()
+{
+  /* control is 0, so the if clause evaluates to false and the loop is
+   * expected to run with a team of exactly one thread. The loop updates the
+   * shared sum and num_threads without synchronization, which is only safe
+   * if the region really executes serially. */
+  int known_sum;
+  int num_threads = 0;
+  int sum = 0;
+  int i;
+  int control = 0;
+
+  #pragma omp parallel for private(i) if (control==1)
+  for (i=0; i <= LOOPCOUNT; i++) {
+    num_threads = omp_get_num_threads();
+    sum = sum + i;
+  }
+
+  /* Adding i == 0 changes nothing, so the total is still sum(1..LOOPCOUNT). */
+  known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+  fprintf(stderr, "Number of threads determined by"
+    " omp_get_num_threads: %d\n", num_threads);
+  return (known_sum == sum && num_threads == 1);
+} /* end of test_omp_parallel_for_if */
+
+int main()
+{
+  int failures = 0;
+  int rep;
+
+  /* Run the test REPETITIONS times; each failing run adds one to the exit
+   * status, so a return value of 0 means every repetition passed. */
+  for (rep = 0; rep < REPETITIONS; ++rep)
+    if (test_omp_parallel_for_if() == 0)
+      ++failures;
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/for/omp_parallel_for_lastprivate.c b/final/runtime/test/worksharing/for/omp_parallel_for_lastprivate.c
new file mode 100644
index 0000000..a53cfb2
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_parallel_for_lastprivate.c
@@ -0,0 +1,37 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int test_omp_parallel_for_lastprivate()
+{
+  /* After the loop, the lastprivate variable must hold the value written by
+   * the sequentially last iteration, i.e. LOOPCOUNT. */
+  const int expected = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+  int total = 0;
+  int last_seen = -1;
+  int k;
+
+  #pragma omp parallel for reduction(+:total) \
+    schedule(static,7) private(k) lastprivate(last_seen)
+  for (k = 1; k <= LOOPCOUNT; ++k) {
+    last_seen = k;
+    total += k;
+  }
+
+  if (total != expected)
+    return 0;
+  return last_seen == LOOPCOUNT;
+} /* end of test_omp_parallel_for_lastprivate */
+
+int main()
+{
+  int failures = 0;
+  int rep;
+
+  /* Run the test REPETITIONS times; each failing run adds one to the exit
+   * status, so a return value of 0 means every repetition passed. */
+  for (rep = 0; rep < REPETITIONS; ++rep)
+    if (test_omp_parallel_for_lastprivate() == 0)
+      ++failures;
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/for/omp_parallel_for_ordered.c b/final/runtime/test/worksharing/for/omp_parallel_for_ordered.c
new file mode 100644
index 0000000..5fef460
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_parallel_for_ordered.c
@@ -0,0 +1,64 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+static int last_i = 0;
+
+int i;
+#pragma omp threadprivate(i)
+
+/* Variable ii is used to avoid problems with a threadprivate variable used as a loop
+ * index. See test omp_threadprivate_for.
+ */
+static int ii;
+#pragma omp threadprivate(ii)
+
+/*!
+  Utility function: returns true if the passed argument is larger than
+  the argument of the last call of this function.
+ */
+static int check_i_islarger2(int i)
+{
+  /* Compare against the previous argument, then remember this one. */
+  const int previous = last_i;
+  last_i = i;
+  return previous < i;
+}
+
+int test_omp_parallel_for_ordered()
+{
+  /* The ordered block must see 1..99 in increasing order; check_i_islarger2()
+   * tracks the previous value in last_i, which is reset before each run. */
+  const int expected = (99 * 100) / 2;
+  int in_order = 1;
+  int total = 0;
+  int k;
+
+  last_i = 0;
+  #pragma omp parallel for schedule(static,1) private(k) ordered
+  for (k = 1; k < 100; ++k) {
+    ii = k;
+    #pragma omp ordered
+    {
+      if (!check_i_islarger2 (ii))
+        in_order = 0;
+      total += ii;
+    }
+  }
+  fprintf (stderr," known_sum = %d , sum = %d \n", expected, total);
+  fprintf (stderr," is_larger = %d\n", in_order);
+  return (expected == total) && in_order;
+}
+
+int main()
+{
+  int failures = 0;
+  int rep;
+
+  /* Run the test REPETITIONS times; each failing run adds one to the exit
+   * status, so a return value of 0 means every repetition passed. */
+  for (rep = 0; rep < REPETITIONS; ++rep)
+    if (test_omp_parallel_for_ordered() == 0)
+      ++failures;
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/for/omp_parallel_for_private.c b/final/runtime/test/worksharing/for/omp_parallel_for_private.c
new file mode 100644
index 0000000..1231d36
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_parallel_for_private.c
@@ -0,0 +1,50 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+/*! Utility function to spend some time in a loop */
+static void do_some_work (void)
+{
+  /* Busy work only: the accumulated value is never used. */
+  double acc = 0.0;
+  int n;
+  for (n = 0; n != 1000; ++n)
+    acc += sqrt ((double) n);
+}
+
+int test_omp_parallel_for_private()
+{
+  /* 'scratch' is private: the value stored before do_some_work() must still
+   * be intact afterwards even though other threads write their own copies. */
+  const int expected = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2;
+  int scratch = 0;
+  int total = 0;
+  int k;
+
+  #pragma omp parallel for reduction(+:total) schedule(static,1) \
+    private(k) private(scratch)
+  for (k = 1; k <= LOOPCOUNT; ++k) {
+    scratch = k;
+    #pragma omp flush
+    do_some_work ();
+    #pragma omp flush
+    total += scratch;
+  }
+  if (expected != total)
+    return 0;
+  return 1;
+} /* end of test_omp_parallel_for_private */
+
+int main()
+{
+  int failures = 0;
+  int rep;
+
+  /* Run the test REPETITIONS times; each failing run adds one to the exit
+   * status, so a return value of 0 means every repetition passed. */
+  for (rep = 0; rep < REPETITIONS; ++rep)
+    if (test_omp_parallel_for_private() == 0)
+      ++failures;
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/for/omp_parallel_for_reduction.c b/final/runtime/test/worksharing/for/omp_parallel_for_reduction.c
new file mode 100644
index 0000000..118d730
--- /dev/null
+++ b/final/runtime/test/worksharing/for/omp_parallel_for_reduction.c
@@ -0,0 +1,266 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+#define DOUBLE_DIGITS 20    /* dt^DOUBLE_DIGITS */
+#define MAX_FACTOR 10
+#define KNOWN_PRODUCT 3628800  /* 10! */
+
+int test_omp_parallel_for_reduction()
+{
+  int sum;
+  int known_sum;
+  double dsum;
+  double dknown_sum;
+  double dt=0.5;  /* base of geometric row for + and - test*/
+  double rounding_error= 1.E-9;
+  int diff;
+  double ddiff;
+  int product;
+  int known_product;
+  int logic_and;
+  int logic_or;
+  int bit_and;
+  int bit_or;
+  int exclusiv_bit_or;
+  int logics[LOOPCOUNT];
+  int i;
+  double dpt;
+  int result;
+  /* 'result' counts failed sub-tests; the function returns 1 only if it is 0. */
+  sum =0;
+  dsum=0;
+  dt = 1./3.;  /* replaces the 0.5 initializer above */
+  result = 0;
+  product = 1;
+  logic_and=1;
+  logic_or=0;
+  bit_and=1;
+  bit_or=0;
+  exclusiv_bit_or=0;
+
+  /* Tests for integers: + and - reductions */
+  known_sum = (LOOPCOUNT*(LOOPCOUNT+1))/2;
+  #pragma omp parallel for schedule(dynamic,1) private(i) reduction(+:sum)
+  for (i=1;i<=LOOPCOUNT;i++) {
+    sum=sum+i;
+  }
+  if(known_sum!=sum) {
+    result++;
+    fprintf(stderr,"Error in sum with integers: Result was %d"
+      " instead of %d\n",sum,known_sum);
+  }
+
+  diff = (LOOPCOUNT*(LOOPCOUNT+1))/2;
+  #pragma omp parallel for schedule(dynamic,1) private(i) reduction(-:diff)
+  for (i=1;i<=LOOPCOUNT;++i) {
+    diff=diff-i;
+  }
+  if(diff != 0) {
+    result++;
+    fprintf(stderr,"Error in difference with integers: Result was %d"
+      " instead of 0.\n",diff);
+  }
+
+  /* Tests for doubles: partial sums of the geometric series in dt */
+  dsum=0;
+  dpt=1;
+  for (i=0;i<DOUBLE_DIGITS;++i) {
+    dpt*=dt;
+  }
+  dknown_sum = (1-dpt)/(1-dt);  /* closed form of sum dt^i, i < DOUBLE_DIGITS */
+  #pragma omp parallel for schedule(dynamic,1) private(i) reduction(+:dsum)
+  for (i=0;i<DOUBLE_DIGITS;++i) {
+    dsum += pow(dt,i);
+  }
+  if( fabs(dsum-dknown_sum) > rounding_error ) {
+    result++;
+    fprintf(stderr,"Error in sum with doubles: Result was %f"
+      " instead of %f (Difference: %E)\n",
+      dsum, dknown_sum, dsum-dknown_sum);
+  }
+
+  dpt=1;
+
+  for (i=0;i<DOUBLE_DIGITS;++i) {
+    dpt*=dt;
+  }
+  fprintf(stderr,"\n");
+  ddiff = (1-dpt)/(1-dt);  /* start from the full sum and subtract each term */
+  #pragma omp parallel for schedule(dynamic,1) private(i) reduction(-:ddiff)
+  for (i=0;i<DOUBLE_DIGITS;++i) {
+    ddiff -= pow(dt,i);
+  }
+  if( fabs(ddiff) > rounding_error) {
+    result++;
+    fprintf(stderr,"Error in Difference with doubles: Result was %E"
+      " instead of 0.0\n",ddiff);
+  }
+
+  /* Tests for integers: product reduction, expecting MAX_FACTOR! */
+  #pragma omp parallel for schedule(dynamic,1) private(i) reduction(*:product)
+  for(i=1;i<=MAX_FACTOR;i++) {
+    product *= i;
+  }
+  known_product = KNOWN_PRODUCT;
+  if(known_product != product) {
+    result++;
+    fprintf(stderr,"Error in Product with integers: Result was %d"
+      " instead of %d\n\n",product,known_product);
+  }
+
+  /* Tests for logic AND */
+  for(i=0;i<LOOPCOUNT;i++) {
+    logics[i]=1;
+  }
+
+  #pragma omp parallel for schedule(dynamic,1) private(i) \
+    reduction(&&:logic_and)
+  for(i=0;i<LOOPCOUNT;++i) {
+    logic_and = (logic_and && logics[i]);
+  }
+  if(!logic_and) {
+    result++;
+    fprintf(stderr,"Error in logic AND part 1.\n");
+  }
+
+  logic_and = 1;
+  logics[LOOPCOUNT/2]=0;  /* a single false element must flip the result */
+
+  #pragma omp parallel for schedule(dynamic,1) private(i) \
+    reduction(&&:logic_and)
+  for(i=0;i<LOOPCOUNT;++i) {
+    logic_and = logic_and && logics[i];
+  }
+  if(logic_and) {
+    result++;
+    fprintf(stderr,"Error in logic AND part 2.\n");
+  }
+
+  /* Tests for logic OR */
+  for(i=0;i<LOOPCOUNT;i++) {
+    logics[i]=0;
+  }
+
+  #pragma omp parallel for schedule(dynamic,1) private(i) \
+    reduction(||:logic_or)
+  for(i=0;i<LOOPCOUNT;++i) {
+    logic_or = logic_or || logics[i];
+  }
+  if(logic_or) {
+    result++;
+    fprintf(stderr,"Error in logic OR part 1.\n");
+  }
+  logic_or = 0;
+  logics[LOOPCOUNT/2]=1;  /* a single true element must flip the result */
+
+  #pragma omp parallel for schedule(dynamic,1) private(i) \
+    reduction(||:logic_or)
+  for(i=0;i<LOOPCOUNT;++i) {
+    logic_or = logic_or || logics[i];
+  }
+  if(!logic_or) {
+    result++;
+    fprintf(stderr,"Error in logic OR part 2.\n");
+  }
+
+  /* Tests for bitwise AND */
+  for(i=0;i<LOOPCOUNT;++i) {
+    logics[i]=1;
+  }
+
+  #pragma omp parallel for schedule(dynamic,1) private(i) \
+    reduction(&:bit_and)
+  for(i=0;i<LOOPCOUNT;++i) {
+    bit_and = (bit_and & logics[i]);
+  }
+  if(!bit_and) {
+    result++;
+    fprintf(stderr,"Error in BIT AND part 1.\n");
+  }
+
+  bit_and = 1;
+  logics[LOOPCOUNT/2]=0;
+
+  #pragma omp parallel for schedule(dynamic,1) private(i) \
+    reduction(&:bit_and)
+  for(i=0;i<LOOPCOUNT;++i) {
+    bit_and = bit_and & logics[i];
+  }
+  if(bit_and) {
+    result++;
+    fprintf(stderr,"Error in BIT AND part 2.\n");
+  }
+
+  /* Tests for bitwise OR */
+  for(i=0;i<LOOPCOUNT;i++) {
+    logics[i]=0;
+  }
+
+  #pragma omp parallel for schedule(dynamic,1) private(i) \
+    reduction(|:bit_or)
+  for(i=0;i<LOOPCOUNT;++i) {
+    bit_or = bit_or | logics[i];
+  }
+  if(bit_or) {
+    result++;
+    fprintf(stderr,"Error in BIT OR part 1\n");
+  }
+  bit_or = 0;
+  logics[LOOPCOUNT/2]=1;
+
+  #pragma omp parallel for schedule(dynamic,1) private(i) \
+    reduction(|:bit_or)
+  for(i=0;i<LOOPCOUNT;++i) {
+    bit_or = bit_or | logics[i];
+  }
+  if(!bit_or) {
+    result++;
+    fprintf(stderr,"Error in BIT OR part 2\n");
+  }
+
+  /* Tests for bitwise XOR */
+  for(i=0;i<LOOPCOUNT;i++) {
+    logics[i]=0;
+  }
+
+  #pragma omp parallel for schedule(dynamic,1) private(i) \
+    reduction(^:exclusiv_bit_or)
+  for(i=0;i<LOOPCOUNT;++i) {
+    exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+  }
+  if(exclusiv_bit_or) {
+    result++;
+    fprintf(stderr,"Error in EXCLUSIV BIT OR part 1\n");
+  }
+
+  exclusiv_bit_or = 0;
+  logics[LOOPCOUNT/2]=1;
+
+  #pragma omp parallel for schedule(dynamic,1) private(i) \
+    reduction(^:exclusiv_bit_or)
+  for(i=0;i<LOOPCOUNT;++i) {
+    exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+  }
+  if(!exclusiv_bit_or) {
+    result++;
+    fprintf(stderr,"Error in EXCLUSIV BIT OR part 2\n");
+  }
+
+  /*printf("\nResult:%d\n",result);*/
+  return (result==0);
+}
+
+int main()
+{
+  int failures = 0;
+  int rep;
+
+  /* Run the test REPETITIONS times; each failing run adds one to the exit
+   * status, so a return value of 0 means every repetition passed. */
+  for (rep = 0; rep < REPETITIONS; ++rep)
+    if (test_omp_parallel_for_reduction() == 0)
+      ++failures;
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/sections/omp_parallel_sections_firstprivate.c b/final/runtime/test/worksharing/sections/omp_parallel_sections_firstprivate.c
new file mode 100644
index 0000000..1780fab
--- /dev/null
+++ b/final/runtime/test/worksharing/sections/omp_parallel_sections_firstprivate.c
@@ -0,0 +1,54 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int test_omp_parallel_sections_firstprivate()
+{
+  /* Three sections each add their firstprivate copy of addend to the
+   * shared total under a critical construct. The copies must carry the
+   * value 11 that addend holds before the combined construct. */
+  int total = 7;
+  int addend = 11;
+  const int expected = 11*3+7;
+
+  #pragma omp parallel sections firstprivate(addend)
+  {
+    #pragma omp section
+    {
+      #pragma omp critical
+      {
+        total += addend;
+      }
+    }
+    #pragma omp section
+    {
+      #pragma omp critical
+      {
+        total += addend;
+      }
+    }
+    #pragma omp section
+    {
+      #pragma omp critical
+      {
+        total += addend;
+      }
+    }
+  }
+
+  /* Non-zero (pass) only when the shared total equals 7 + 3*11. */
+  return (expected == total);
+} /* end of test_omp_parallel_sections_firstprivate */
+
+int main()
+{
+  /* Run the test REPETITIONS times; exit status = number of failed runs. */
+  int failures = 0;
+  int run = 0;
+
+  while (run < REPETITIONS) {
+    failures += test_omp_parallel_sections_firstprivate() ? 0 : 1;
+    ++run;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/sections/omp_parallel_sections_lastprivate.c b/final/runtime/test/worksharing/sections/omp_parallel_sections_lastprivate.c
new file mode 100644
index 0000000..9b775ec
--- /dev/null
+++ b/final/runtime/test/worksharing/sections/omp_parallel_sections_lastprivate.c
@@ -0,0 +1,71 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int test_omp_parallel_sections_lastprivate()
+{
+  /* Three sections sum 1..999 into total through private partial sums.
+   * Each section also stores its loop index in last_k; the test expects
+   * the lastprivate copy-out to leave 999 in last_k afterwards. */
+  int total = 0;
+  int partial = 0;
+  int last_k = -1;
+  int k;
+  const int expected = (999*1000)/2;
+
+  #pragma omp parallel sections private(k,partial) lastprivate(last_k)
+  {
+    #pragma omp section
+    {
+      partial=0;
+      for (k=1;k<400;k++) {
+        partial += k;
+        last_k = k;
+      }
+      #pragma omp critical
+      {
+        total += partial;
+      }
+    }
+    #pragma omp section
+    {
+      partial=0;
+      for (k=400;k<700;k++) {
+        partial += k;
+        last_k = k;
+      }
+      #pragma omp critical
+      {
+        total += partial;
+      }
+    }
+    #pragma omp section
+    {
+      partial=0;
+      for (k=700;k<1000;k++) {
+        partial += k;
+        last_k = k;
+      }
+      #pragma omp critical
+      {
+        total += partial;
+      }
+    }
+  }
+
+  /* Both the accumulated total and the copied-out index must match. */
+  return ((total == expected) && (last_k == 999));
+}
+
+int main()
+{
+  /* Run the test REPETITIONS times; exit status = number of failed runs. */
+  int failures = 0;
+  int run = 0;
+
+  while (run < REPETITIONS) {
+    failures += test_omp_parallel_sections_lastprivate() ? 0 : 1;
+    ++run;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/sections/omp_parallel_sections_private.c b/final/runtime/test/worksharing/sections/omp_parallel_sections_private.c
new file mode 100644
index 0000000..7dab295
--- /dev/null
+++ b/final/runtime/test/worksharing/sections/omp_parallel_sections_private.c
@@ -0,0 +1,64 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int test_omp_parallel_sections_private()
+{
+  /* Three sections sum 1..399, 400..699 and 700..999 into their own
+   * private partial, then add it to the shared total (which starts at 7)
+   * under a critical construct. */
+  int total = 7;
+  int partial = 0;
+  int k;
+  const int expected = (999*1000)/2+7;
+
+  #pragma omp parallel sections private(partial, k)
+  {
+    #pragma omp section
+    {
+      partial=0;
+      for (k=1;k<400;k++)
+        partial += k;
+      #pragma omp critical
+      {
+        total += partial;
+      }
+    }
+    #pragma omp section
+    {
+      partial=0;
+      for (k=400;k<700;k++)
+        partial += k;
+      #pragma omp critical
+      {
+        total += partial;
+      }
+    }
+    #pragma omp section
+    {
+      partial=0;
+      for (k=700;k<1000;k++)
+        partial += k;
+      #pragma omp critical
+      {
+        total += partial;
+      }
+    }
+  }
+
+  /* Non-zero (pass) only when the shared total equals 7 + (1+...+999). */
+  return (expected == total);
+} /* end of test_omp_parallel_sections_private */
+
+int main()
+{
+  /* Run the test REPETITIONS times; exit status = number of failed runs. */
+  int failures = 0;
+  int run = 0;
+
+  while (run < REPETITIONS) {
+    failures += test_omp_parallel_sections_private() ? 0 : 1;
+    ++run;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/sections/omp_parallel_sections_reduction.c b/final/runtime/test/worksharing/sections/omp_parallel_sections_reduction.c
new file mode 100644
index 0000000..0d49865
--- /dev/null
+++ b/final/runtime/test/worksharing/sections/omp_parallel_sections_reduction.c
@@ -0,0 +1,508 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+int test_omp_parallel_sections_reduction()
+{
+  int sum;
+  int known_sum;
+  double dpt;
+  double dsum;
+  double dknown_sum;
+  double dt=0.5; /* base of geometric row for + and - test*/
+  double rounding_error= 1.E-5;
+  int diff;
+  double ddiff;
+  int product;
+  int known_product;
+  int logic_and;
+  int bit_and;
+  int logic_or;
+  int bit_or;
+  int exclusiv_bit_or;
+  int logics[1000];
+  int i;
+  int result;
+
+  sum = 7;
+  dsum=0;
+  product =1;
+  dpt = 1;
+  logic_and=1;
+  bit_and=1;
+  logic_or=0;
+  bit_or=0;
+  exclusiv_bit_or=0;
+  result =0;
+  /* Every stage below folds three sections with one reduction operator;
+   * result counts the stages whose outcome differs from the expected value. */
+
+  // Test summation of integers
+  known_sum = (999*1000)/2+7;
+  #pragma omp parallel sections private(i) reduction(+:sum)
+  {
+    #pragma omp section
+    {
+      for (i=1;i<300;i++) {
+        sum=sum+i;
+      }
+    }
+    #pragma omp section
+    {
+      for (i=300;i<700;i++) {
+        sum=sum+i;
+      }
+    }
+    #pragma omp section
+    {
+      for (i=700;i<1000;i++) {
+        sum=sum+i;
+      }
+    }
+  }
+  if(known_sum!=sum) {
+    result++;
+    fprintf(stderr,"Error in sum with integers: Result was %d"
+      " instead of %d.\n",sum, known_sum);
+  }
+
+  // Test differences of integers
+  diff = (999*1000)/2;
+  #pragma omp parallel sections private(i) reduction(-:diff)
+  {
+    #pragma omp section
+    {
+      for (i=1;i<300;i++) {
+        diff=diff-i;
+      }
+    }
+    #pragma omp section
+    {
+      for (i=300;i<700;i++) {
+        diff=diff-i;
+      }
+    }
+    #pragma omp section
+    {
+      for (i=700;i<1000;i++) {
+        diff=diff-i;
+      }
+    }
+  }
+  if(diff != 0) {
+    result++;
+    fprintf(stderr,"Error in Difference with integers: Result was %d"
+      " instead of 0.\n",diff);
+  }
+
+  // Test summation of doubles
+  for (i=0;i<20;++i) {
+    dpt*=dt;
+  }
+  dknown_sum = (1-dpt)/(1-dt);
+  #pragma omp parallel sections private(i) reduction(+:dsum)
+  {
+    #pragma omp section
+    {
+      for (i=0;i<6;++i) {
+        dsum += pow(dt,i);
+      }
+    }
+    #pragma omp section
+    {
+      for (i=6;i<12;++i) {
+        dsum += pow(dt,i);
+      }
+    }
+    #pragma omp section
+    {
+      for (i=12;i<20;++i) {
+        dsum += pow(dt,i);
+      }
+    }
+  }
+  if( fabs(dsum-dknown_sum) > rounding_error ) {
+    result++;
+    fprintf(stderr,"Error in sum with doubles: Result was %f"
+      " instead of %f (Difference: %E)\n",
+      dsum, dknown_sum, dsum-dknown_sum);
+  }
+
+  // Test differences of doubles
+  dpt=1;
+  for (i=0;i<20;++i) {
+    dpt*=dt;
+  }
+  fprintf(stderr,"\n");
+  ddiff = (1-dpt)/(1-dt);
+  #pragma omp parallel sections private(i) reduction(-:ddiff)
+  {
+    #pragma omp section
+    {
+      for (i=0;i<6;++i) {
+        ddiff -= pow(dt,i);
+      }
+    }
+    #pragma omp section
+    {
+      for (i=6;i<12;++i) {
+        ddiff -= pow(dt,i);
+      }
+    }
+    #pragma omp section
+    {
+      for (i=12;i<20;++i) {
+        ddiff -= pow(dt,i);
+      }
+    }
+  }
+  if( fabs(ddiff) > rounding_error) {
+    result++;
+    fprintf(stderr,"Error in Difference with doubles: Result was %E"
+      " instead of 0.0\n",ddiff);
+  }
+
+  // Test product of integers
+  known_product = 3628800;
+  #pragma omp parallel sections private(i) reduction(*:product)
+  {
+    #pragma omp section
+    {
+      for(i=1;i<3;i++) {
+        product *= i;
+      }
+    }
+    #pragma omp section
+    {
+      for(i=3;i<7;i++) {
+        product *= i;
+      }
+    }
+    #pragma omp section
+    {
+      for(i=7;i<11;i++) {
+        product *= i;
+      }
+    }
+  }
+  if(known_product != product) {
+    result++;
+    fprintf(stderr,"Error in Product with integers: Result was %d"
+      " instead of %d\n",product,known_product);
+  }
+
+  // Test logical AND
+  for(i=0;i<1000;i++) {
+    logics[i]=1;
+  }
+
+  #pragma omp parallel sections private(i) reduction(&&:logic_and)
+  {
+    #pragma omp section
+    {
+      for (i=1;i<300;i++) {
+        logic_and = (logic_and && logics[i]);
+      }
+    }
+    #pragma omp section
+    {
+      for (i=300;i<700;i++) {
+        logic_and = (logic_and && logics[i]);
+      }
+    }
+    #pragma omp section
+    {
+      for (i=700;i<1000;i++) {
+        logic_and = (logic_and && logics[i]);
+      }
+    }
+  }
+  if(!logic_and) {
+    result++;
+    fprintf(stderr,"Error in logic AND part 1\n");
+  }
+  logic_and = 1;
+  logics[501] = 0;
+
+  #pragma omp parallel sections private(i) reduction(&&:logic_and)
+  {
+    #pragma omp section
+    {
+      for (i=1;i<300;i++) {
+        logic_and = (logic_and && logics[i]);
+      }
+    }
+    #pragma omp section
+    {
+      for (i=300;i<700;i++) {
+        logic_and = (logic_and && logics[i]);
+      }
+    }
+    #pragma omp section
+    {
+      for (i=700;i<1000;i++) {
+        logic_and = (logic_and && logics[i]);
+      }
+    }
+  }
+  if(logic_and) {
+    result++;
+    fprintf(stderr,"Error in logic AND part 2\n");
+  }
+
+  // Test logical OR
+  for(i=0;i<1000;i++) {
+    logics[i]=0;
+  }
+
+  #pragma omp parallel sections private(i) reduction(||:logic_or)
+  {
+    #pragma omp section
+    {
+      for (i=1;i<300;i++) {
+        logic_or = (logic_or || logics[i]);
+      }
+    }
+    #pragma omp section
+    {
+      for (i=300;i<700;i++) {
+        logic_or = (logic_or || logics[i]);
+      }
+    }
+    #pragma omp section
+    {
+      for (i=700;i<1000;i++) {
+        logic_or = (logic_or || logics[i]);
+      }
+    }
+  }
+  if(logic_or) {
+    result++;
+    fprintf(stderr,"Error in logic OR part 1\n");
+  }
+
+  logic_or = 0;
+  logics[501]=1;
+
+  #pragma omp parallel sections private(i) reduction(||:logic_or)
+  {
+    #pragma omp section
+    {
+      for (i=1;i<300;i++) {
+        logic_or = (logic_or || logics[i]);
+      }
+    }
+    #pragma omp section
+    {
+      for (i=300;i<700;i++) {
+        logic_or = (logic_or || logics[i]);
+      }
+    }
+    #pragma omp section
+    {
+      for (i=700;i<1000;i++) {
+        logic_or = (logic_or || logics[i]);
+      }
+    }
+  }
+  if(!logic_or) {
+    result++;
+    fprintf(stderr,"Error in logic OR part 2\n");
+  }
+
+  // Test bitwise AND
+  for(i=0;i<1000;++i) {
+    logics[i]=1;
+  }
+
+  #pragma omp parallel sections private(i) reduction(&:bit_and)
+  {
+    #pragma omp section
+    {
+      for(i=0;i<300;++i) {
+        bit_and = (bit_and & logics[i]);
+      }
+    }
+    #pragma omp section
+    {
+      for(i=300;i<700;++i) {
+        bit_and = (bit_and & logics[i]);
+      }
+    }
+    #pragma omp section
+    {
+      for(i=700;i<1000;++i) {
+        bit_and = (bit_and & logics[i]);
+      }
+    }
+  }
+  if(!bit_and) {
+    result++;
+    fprintf(stderr,"Error in BIT AND part 1\n");
+  }
+
+  bit_and = 1;
+  logics[501]=0;
+
+  #pragma omp parallel sections private(i) reduction(&:bit_and)
+  {
+    #pragma omp section
+    {
+      for(i=0;i<300;++i) {
+        bit_and = bit_and & logics[i];
+      }
+    }
+    #pragma omp section
+    {
+      for(i=300;i<700;++i) {
+        bit_and = bit_and & logics[i];
+      }
+    }
+    #pragma omp section
+    {
+      for(i=700;i<1000;++i) {
+        bit_and = bit_and & logics[i];
+      }
+    }
+  }
+  if(bit_and) {
+    result++;
+    fprintf(stderr,"Error in BIT AND part 2\n");
+  }
+
+  // Test bitwise OR
+  for(i=0;i<1000;i++) {
+    logics[i]=0;
+  }
+
+  #pragma omp parallel sections private(i) reduction(|:bit_or)
+  {
+    #pragma omp section
+    {
+      for(i=0;i<300;++i) {
+        bit_or = bit_or | logics[i];
+      }
+    }
+    #pragma omp section
+    {
+      for(i=300;i<700;++i) {
+        bit_or = bit_or | logics[i];
+      }
+    }
+    #pragma omp section
+    {
+      for(i=700;i<1000;++i) {
+        bit_or = bit_or | logics[i];
+      }
+    }
+  }
+  if(bit_or) {
+    result++;
+    fprintf(stderr,"Error in BIT OR part 1\n");
+  }
+  bit_or = 0;
+  logics[501]=1;
+
+  #pragma omp parallel sections private(i) reduction(|:bit_or)
+  {
+    #pragma omp section
+    {
+      for(i=0;i<300;++i) {
+        bit_or = bit_or | logics[i];
+      }
+    }
+    #pragma omp section
+    {
+      for(i=300;i<700;++i) {
+        bit_or = bit_or | logics[i];
+      }
+    }
+    #pragma omp section
+    {
+      for(i=700;i<1000;++i) {
+        bit_or = bit_or | logics[i];
+      }
+    }
+  }
+  if(!bit_or) {
+    result++;
+    fprintf(stderr,"Error in BIT OR part 2\n");
+  }
+
+  // Test bitwise XOR
+  for(i=0;i<1000;i++) {
+    logics[i]=0;
+  }
+
+  #pragma omp parallel sections private(i) reduction(^:exclusiv_bit_or)
+  {
+    #pragma omp section
+    {
+      for(i=0;i<300;++i) {
+        exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+      }
+    }
+    #pragma omp section
+    {
+      for(i=300;i<700;++i) {
+        exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+      }
+    }
+    #pragma omp section
+    {
+      for(i=700;i<1000;++i) {
+        exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+      }
+    }
+  }
+  if(exclusiv_bit_or) {
+    result++;
+    fprintf(stderr,"Error in EXCLUSIV BIT OR part 1\n");
+  }
+
+  exclusiv_bit_or = 0;
+  logics[501]=1;
+
+  #pragma omp parallel sections private(i) reduction(^:exclusiv_bit_or)
+  {
+    #pragma omp section
+    {
+      for(i=0;i<300;++i) {
+        exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+      }
+    }
+    #pragma omp section
+    {
+      for(i=300;i<700;++i) {
+        exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+      }
+    }
+    #pragma omp section
+    {
+      for(i=700;i<1000;++i) {
+        exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+      }
+    }
+  }
+  if(!exclusiv_bit_or) {
+    result++;
+    fprintf(stderr,"Error in EXCLUSIV BIT OR part 2\n");
+  }
+
+  /* Succeed (non-zero) only if no stage recorded a failure. */
+  return (result==0);
+}
+
+int main()
+{
+  /* Run the test REPETITIONS times; exit status = number of failed runs. */
+  int failures = 0;
+  int run = 0;
+
+  while (run < REPETITIONS) {
+    failures += test_omp_parallel_sections_reduction() ? 0 : 1;
+    ++run;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/sections/omp_section_firstprivate.c b/final/runtime/test/worksharing/sections/omp_section_firstprivate.c
new file mode 100644
index 0000000..5526475
--- /dev/null
+++ b/final/runtime/test/worksharing/sections/omp_section_firstprivate.c
@@ -0,0 +1,55 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int test_omp_section_firstprivate()
+{
+  /* Same check as the combined-construct variant, but the sections
+   * construct is nested inside an explicit parallel region. */
+  int total = 7;
+  int addend = 11;
+  const int expected = 11 * 3 + 7;
+
+  #pragma omp parallel
+  {
+    #pragma omp sections firstprivate(addend)
+    {
+      #pragma omp section
+      {
+        #pragma omp critical
+        {
+          total += addend;
+        }
+      }
+      #pragma omp section
+      {
+        #pragma omp critical
+        {
+          total += addend;
+        }
+      }
+      #pragma omp section
+      {
+        #pragma omp critical
+        {
+          total += addend;
+        }
+      }
+    }
+  }
+  /* Non-zero (pass) only when the shared total equals 7 + 3*11. */
+  return (expected == total);
+} /* end of test_omp_section_firstprivate */
+
+int main()
+{
+  /* Run the test REPETITIONS times; exit status = number of failed runs. */
+  int failures = 0;
+  int run = 0;
+
+  while (run < REPETITIONS) {
+    failures += test_omp_section_firstprivate() ? 0 : 1;
+    ++run;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/sections/omp_section_lastprivate.c b/final/runtime/test/worksharing/sections/omp_section_lastprivate.c
new file mode 100644
index 0000000..0dbbea9
--- /dev/null
+++ b/final/runtime/test/worksharing/sections/omp_section_lastprivate.c
@@ -0,0 +1,76 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int test_omp_section_lastprivate()
+{
+  /* Three sections inside an explicit parallel region sum 1..999 into
+   * total through private partial sums. Every section also stores its
+   * loop index in last_k; the test expects the lastprivate copy-out to
+   * leave 999 in last_k afterwards. */
+  int last_k = -1;
+  int total = 0;
+  int partial = 0;
+  int k;
+
+  #pragma omp parallel
+  {
+    #pragma omp sections lastprivate(last_k) private(k,partial)
+    {
+      #pragma omp section
+      {
+        partial = 0;
+        for (k = 1; k < 400; k++)
+        {
+          partial += k;
+          last_k = k;
+        }
+        #pragma omp critical
+        {
+          total += partial;
+        }
+      } /* section 1: 1..399 */
+      #pragma omp section
+      {
+        partial = 0;
+        for (k = 400; k < 700; k++)
+        {
+          partial += k;
+          last_k = k;
+        }
+        #pragma omp critical
+        {
+          total += partial;
+        }
+      } /* section 2: 400..699 */
+      #pragma omp section
+      {
+        partial = 0;
+        for (k = 700; k < 1000; k++)
+        {
+          partial += k;
+          last_k = k;
+        }
+        #pragma omp critical
+        {
+          total += partial;
+        }
+      } /* section 3: 700..999 */
+    } /* sections */
+  } /* parallel */
+  /* Expected total is 1+2+...+999. */
+  return ((total == (999 * 1000) / 2) && (last_k == 999));
+}
+
+int main()
+{
+  /* Run the test REPETITIONS times; exit status = number of failed runs. */
+  int failures = 0;
+  int run = 0;
+
+  while (run < REPETITIONS) {
+    failures += test_omp_section_lastprivate() ? 0 : 1;
+    ++run;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/sections/omp_section_private.c b/final/runtime/test/worksharing/sections/omp_section_private.c
new file mode 100644
index 0000000..bf2a30d
--- /dev/null
+++ b/final/runtime/test/worksharing/sections/omp_section_private.c
@@ -0,0 +1,66 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int test_omp_section_private()
+{
+  /* Three sections inside an explicit parallel region sum 1..399,
+   * 400..699 and 700..999 into their own private partial, then add it to
+   * the shared total (which starts at 7) under a critical construct. */
+  int total = 7;
+  int partial = 0;
+  int k;
+  const int expected = (999 * 1000) / 2 + 7;
+
+  #pragma omp parallel
+  {
+    #pragma omp sections private(partial,k)
+    {
+      #pragma omp section
+      {
+        partial = 0;
+        for (k = 1; k < 400; k++)
+          partial += k;
+        #pragma omp critical
+        {
+          total += partial;
+        }
+      }
+      #pragma omp section
+      {
+        partial = 0;
+        for (k = 400; k < 700; k++)
+          partial += k;
+        #pragma omp critical
+        {
+          total += partial;
+        }
+      }
+      #pragma omp section
+      {
+        partial = 0;
+        for (k = 700; k < 1000; k++)
+          partial += k;
+        #pragma omp critical
+        {
+          total += partial;
+        }
+      }
+    } /* sections */
+  } /* parallel */
+  /* Non-zero (pass) only when the shared total equals 7 + (1+...+999). */
+  return (expected == total);
+} /* end of test_omp_section_private */
+
+int main()
+{
+  /* Run the test REPETITIONS times; exit status = number of failed runs. */
+  int failures = 0;
+  int run = 0;
+
+  while (run < REPETITIONS) {
+    failures += test_omp_section_private() ? 0 : 1;
+    ++run;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/sections/omp_sections_nowait.c b/final/runtime/test/worksharing/sections/omp_sections_nowait.c
new file mode 100644
index 0000000..caff254
--- /dev/null
+++ b/final/runtime/test/worksharing/sections/omp_sections_nowait.c
@@ -0,0 +1,104 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+/*
+ * This test will hang if the nowait is not working properly
+ *
+ * It relies on a thread skipping to the second sections construct to
+ * release the threads in the first sections construct
+ *
+ * Also, since scheduling of sections is implementation defined, it is
+ * necessary to have all four sections in the second sections construct
+ * release the threads since we can't guarantee which section a single thread
+ * will execute.
+ */
+volatile int release; /* 0 while first-construct sections must spin; set to 1 by release_and_increment() */
+volatile int count; /* incremented once by every section that runs; the test expects 8 */
+
+void wait_for_release_then_increment(int rank)
+{
+  fprintf(stderr, "Thread nr %d enters first section and waits.\n", rank);
+  /* Spin until a section of the second construct sets release. */
+  while (release == 0) {}
+  #pragma omp atomic
+  count += 1;
+}
+
+void release_and_increment(int rank) /* rank is only used in the log line */
+{
+  fprintf(stderr, "Thread nr %d sets release to 1\n", rank);
+  release = 1; /* lets the spin loop in wait_for_release_then_increment() exit */
+  #pragma omp flush(release)
+  #pragma omp atomic
+  count++; /* record that this section ran */
+}
+
+int test_omp_sections_nowait()
+{
+  release = 0;
+  count = 0;
+
+  #pragma omp parallel num_threads(4)
+  {
+    /* thread id, used only to label the diagnostic output */
+    const int rank = omp_get_thread_num();
+    #pragma omp sections nowait
+    {
+      #pragma omp section
+      {
+        wait_for_release_then_increment(rank);
+      }
+      #pragma omp section
+      {
+        wait_for_release_then_increment(rank);
+      }
+      #pragma omp section
+      {
+        wait_for_release_then_increment(rank);
+      }
+      #pragma omp section
+      {
+        fprintf(stderr, "Thread nr %d enters first sections and "
+          "goes immediately to next sections construct to release.\n", rank);
+        #pragma omp atomic
+        count += 1;
+      }
+    }
+    /* Second sections construct: every section here sets release */
+    #pragma omp sections
+    {
+      #pragma omp section
+      {
+        release_and_increment(rank);
+      }
+      #pragma omp section
+      {
+        release_and_increment(rank);
+      }
+      #pragma omp section
+      {
+        release_and_increment(rank);
+      }
+      #pragma omp section
+      {
+        release_and_increment(rank);
+      }
+    }
+  }
+  // All eight sections (four per construct) must have been executed
+  return (8 == count);
+}
+
+int main()
+{
+  /* Run the test REPETITIONS times; exit status = number of failed runs. */
+  int failures = 0;
+  int run = 0;
+
+  while (run < REPETITIONS) {
+    failures += test_omp_sections_nowait() ? 0 : 1;
+    ++run;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/sections/omp_sections_reduction.c b/final/runtime/test/worksharing/sections/omp_sections_reduction.c
new file mode 100644
index 0000000..1fdb5ec
--- /dev/null
+++ b/final/runtime/test/worksharing/sections/omp_sections_reduction.c
@@ -0,0 +1,543 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+int test_omp_sections_reduction()
+{
+  int sum;
+  int known_sum;
+  double dpt,dsum;
+  double dknown_sum;
+  double dt=0.5; /* base of geometric row for + and - test; overwritten with 1./3. below */
+  double rounding_error= 1.E-9;
+  int diff;
+  double ddiff;
+  int product;
+  int known_product;
+  int logic_and;
+  int bit_and;
+  int logic_or;
+  int bit_or;
+  int exclusiv_bit_or;
+  int logics[1000];
+  int i;
+  int result;
+  /* Each stage below runs one reduction operator over three sections inside
+   * a parallel region; result counts the checks that failed. */
+  sum =7;
+  dpt =1;
+  dsum=0;
+  product =1;
+  logic_and=1;
+  bit_and=1;
+  logic_or=0;
+  bit_or=0;
+  exclusiv_bit_or=0;
+  result = 0;
+  dt = 1./3.;
+
+  known_sum = (999*1000)/2+7; /* Test summation of integers */
+  #pragma omp parallel
+  {
+    #pragma omp sections private(i) reduction(+:sum)
+    {
+      #pragma omp section
+      {
+        for (i=1;i<300;i++) {
+          sum=sum+i;
+        }
+      }
+      #pragma omp section
+      {
+        for (i=300;i<700;i++) {
+          sum=sum+i;
+        }
+      }
+      #pragma omp section
+      {
+        for (i=700;i<1000;i++) {
+          sum=sum+i;
+        }
+      }
+    }
+  }
+  if(known_sum!=sum) {
+    ++result;
+    fprintf(stderr,"Error in sum with integers: Result was %d"
+      " instead of %d\n", sum,known_sum);
+  }
+
+  diff = (999*1000)/2; /* Test differences of integers */
+  #pragma omp parallel
+  {
+    #pragma omp sections private(i) reduction(-:diff)
+    {
+      #pragma omp section
+      {
+        for (i=1;i<300;i++) {
+          diff=diff-i;
+        }
+      }
+      #pragma omp section
+      {
+        for (i=300;i<700;i++) {
+          diff=diff-i;
+        }
+      }
+      #pragma omp section
+      {
+        for (i=700;i<1000;i++) {
+          diff=diff-i;
+        }
+      }
+    }
+  }
+  if(diff != 0) {
+    result++;
+    fprintf(stderr,"Error in Difference with integers: Result was %d"
+      " instead of 0.\n",diff);
+  }
+
+  for (i=0;i<20;++i) { /* Test summation of doubles: dpt becomes dt^20 */
+    dpt*=dt;
+  }
+  dknown_sum = (1-dpt)/(1-dt);
+  #pragma omp parallel
+  {
+    #pragma omp sections private(i) reduction(+:dsum)
+    {
+      #pragma omp section
+      {
+        for (i=0;i<6;++i) {
+          dsum += pow(dt,i);
+        }
+      }
+      #pragma omp section
+      {
+        for (i=6;i<12;++i) {
+          dsum += pow(dt,i);
+        }
+      }
+      #pragma omp section
+      {
+        for (i=12;i<20;++i) {
+          dsum += pow(dt,i);
+        }
+      }
+    }
+  }
+  if( fabs(dsum-dknown_sum) > rounding_error ) {
+    result++;
+    fprintf(stderr,"Error in sum with doubles: Result was %f"
+      " instead of %f (Difference: %E)\n",
+      dsum, dknown_sum, dsum-dknown_sum);
+  }
+
+  dpt=1; /* Test differences of doubles */
+  for (i=0;i<20;++i) {
+    dpt*=dt;
+  }
+  fprintf(stderr,"\n");
+  ddiff = (1-dpt)/(1-dt);
+  #pragma omp parallel
+  {
+    #pragma omp sections private(i) reduction(-:ddiff)
+    {
+      #pragma omp section
+      {
+        for (i=0;i<6;++i) {
+          ddiff -= pow(dt,i);
+        }
+      }
+      #pragma omp section
+      {
+        for (i=6;i<12;++i) {
+          ddiff -= pow(dt,i);
+        }
+      }
+      #pragma omp section
+      {
+        for (i=12;i<20;++i) {
+          ddiff -= pow(dt,i);
+        }
+      }
+    }
+  }
+
+  if(fabs(ddiff) > rounding_error) {
+    result++;
+    fprintf(stderr,"Error in Difference with doubles: Result was %E"
+      " instead of 0.0\n",ddiff);
+  }
+
+  known_product = 3628800; /* Test product of integers: 1*2*...*10 */
+  #pragma omp parallel
+  {
+    #pragma omp sections private(i) reduction(*:product)
+    {
+      #pragma omp section
+      {
+        for(i=1;i<3;i++) {
+          product *= i;
+        }
+      }
+      #pragma omp section
+      {
+        for(i=3;i<7;i++) {
+          product *= i;
+        }
+      }
+      #pragma omp section
+      {
+        for(i=7;i<11;i++) {
+          product *= i;
+        }
+      }
+    }
+  }
+  if(known_product != product) {
+    result++;
+    fprintf(stderr,"Error in Product with integers: Result was %d"
+      " instead of %d\n",product,known_product);
+  }
+
+  for(i=0;i<1000;i++) { /* Test logical AND, part 1: all inputs are 1 */
+    logics[i]=1;
+  }
+
+  #pragma omp parallel
+  {
+    #pragma omp sections private(i) reduction(&&:logic_and)
+    {
+      #pragma omp section
+      {
+        for (i=1;i<300;i++) {
+          logic_and = (logic_and && logics[i]);
+        }
+      }
+      #pragma omp section
+      {
+        for (i=300;i<700;i++) {
+          logic_and = (logic_and && logics[i]);
+        }
+      }
+      #pragma omp section
+      {
+        for (i=700;i<1000;i++) {
+          logic_and = (logic_and && logics[i]);
+        }
+      }
+    }
+  }
+  if(!logic_and) {
+    result++;
+    fprintf(stderr,"Error in logic AND part 1\n");
+  }
+
+  logic_and = 1; /* part 2: a single 0 input must make the result 0 */
+  logics[501] = 0;
+
+  #pragma omp parallel
+  {
+    #pragma omp sections private(i) reduction(&&:logic_and)
+    {
+      #pragma omp section
+      {
+        for (i=1;i<300;i++) {
+          logic_and = (logic_and && logics[i]);
+        }
+      }
+      #pragma omp section
+      {
+        for (i=300;i<700;i++) {
+          logic_and = (logic_and && logics[i]);
+        }
+      }
+      #pragma omp section
+      {
+        for (i=700;i<1000;i++) {
+          logic_and = (logic_and && logics[i]);
+        }
+      }
+    }
+  }
+  if(logic_and) {
+    result++;
+    fprintf(stderr,"Error in logic AND part 2\n");
+  }
+
+  for(i=0;i<1000;i++) { /* Test logical OR, part 1: all inputs are 0 */
+    logics[i]=0;
+  }
+
+  #pragma omp parallel
+  {
+    #pragma omp sections private(i) reduction(||:logic_or)
+    {
+      #pragma omp section
+      {
+        for (i=1;i<300;i++) {
+          logic_or = (logic_or || logics[i]);
+        }
+      }
+      #pragma omp section
+      {
+        for (i=300;i<700;i++) {
+          logic_or = (logic_or || logics[i]);
+        }
+      }
+      #pragma omp section
+      {
+        for (i=700;i<1000;i++) {
+          logic_or = (logic_or || logics[i]);
+        }
+      }
+    }
+  }
+  if(logic_or) {
+    result++;
+    fprintf(stderr,"\nError in logic OR part 1\n");
+  }
+
+  logic_or = 0; /* part 2: a single 1 input must make the result non-zero */
+  logics[501]=1;
+
+  #pragma omp parallel
+  {
+    #pragma omp sections private(i) reduction(||:logic_or)
+    {
+      #pragma omp section
+      {
+        for (i=1;i<300;i++) {
+          logic_or = (logic_or || logics[i]);
+        }
+      }
+      #pragma omp section
+      {
+        for (i=300;i<700;i++) {
+          logic_or = (logic_or || logics[i]);
+        }
+      }
+      #pragma omp section
+      {
+        for (i=700;i<1000;i++) {
+          logic_or = (logic_or || logics[i]);
+        }
+      }
+    }
+  }
+  if(!logic_or) {
+    result++;
+    fprintf(stderr,"Error in logic OR part 2\n");
+  }
+
+  for(i=0;i<1000;++i) { /* Test bitwise AND, part 1: all inputs are 1 */
+    logics[i]=1;
+  }
+
+  #pragma omp parallel
+  {
+    #pragma omp sections private(i) reduction(&:bit_and)
+    {
+      #pragma omp section
+      {
+        for(i=0;i<300;++i) {
+          bit_and = (bit_and & logics[i]);
+        }
+      }
+      #pragma omp section
+      {
+        for(i=300;i<700;++i) {
+          bit_and = (bit_and & logics[i]);
+        }
+      }
+      #pragma omp section
+      {
+        for(i=700;i<1000;++i) {
+          bit_and = (bit_and & logics[i]);
+        }
+      }
+    }
+  }
+  if(!bit_and) {
+    result++;
+    fprintf(stderr,"Error in BIT AND part 1\n");
+  }
+
+  bit_and = 1; /* part 2: a single 0 input must clear the result */
+  logics[501]=0;
+
+  #pragma omp parallel
+  {
+    #pragma omp sections private(i) reduction(&:bit_and)
+    {
+      #pragma omp section
+      {
+        for(i=0;i<300;++i) {
+          bit_and = bit_and & logics[i];
+        }
+      }
+      #pragma omp section
+      {
+        for(i=300;i<700;++i) {
+          bit_and = bit_and & logics[i];
+        }
+      }
+      #pragma omp section
+      {
+        for(i=700;i<1000;++i) {
+          bit_and = bit_and & logics[i];
+        }
+      }
+    }
+  }
+  if(bit_and) {
+    result++;
+    fprintf(stderr,"Error in BIT AND part 2\n");
+  }
+
+  for(i=0;i<1000;i++) { /* Test bitwise OR, part 1: all inputs are 0 */
+    logics[i]=0;
+  }
+
+  #pragma omp parallel
+  {
+    #pragma omp sections private(i) reduction(|:bit_or)
+    {
+      #pragma omp section
+      {
+        for(i=0;i<300;++i) {
+          bit_or = bit_or | logics[i];
+        }
+      }
+      #pragma omp section
+      {
+        for(i=300;i<700;++i) {
+          bit_or = bit_or | logics[i];
+        }
+      }
+      #pragma omp section
+      {
+        for(i=700;i<1000;++i) {
+          bit_or = bit_or | logics[i];
+        }
+      }
+    }
+  }
+  if(bit_or) {
+    result++;
+    fprintf(stderr,"Error in BIT OR part 1\n");
+  }
+  bit_or = 0; /* part 2: a single 1 input must set the result */
+  logics[501]=1;
+
+  #pragma omp parallel
+  {
+    #pragma omp sections private(i) reduction(|:bit_or)
+    {
+      #pragma omp section
+      {
+        for(i=0;i<300;++i) {
+          bit_or = bit_or | logics[i];
+        }
+      }
+      #pragma omp section
+      {
+        for(i=300;i<700;++i) {
+          bit_or = bit_or | logics[i];
+        }
+      }
+      #pragma omp section
+      {
+        for(i=700;i<1000;++i) {
+          bit_or = bit_or | logics[i];
+        }
+      }
+    }
+  }
+  if(!bit_or) {
+    result++;
+    fprintf(stderr,"Error in BIT OR part 2\n");
+  }
+
+  for(i=0;i<1000;i++) { /* Test bitwise XOR, part 1: all inputs are 0 */
+    logics[i]=0;
+  }
+
+  #pragma omp parallel
+  {
+    #pragma omp sections private(i) reduction(^:exclusiv_bit_or)
+    {
+      #pragma omp section
+      {
+        for(i=0;i<300;++i) {
+          exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+        }
+      }
+      #pragma omp section
+      {
+        for(i=300;i<700;++i) {
+          exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+        }
+      }
+      #pragma omp section
+      {
+        for(i=700;i<1000;++i) {
+          exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+        }
+      }
+    }
+  }
+  if(exclusiv_bit_or) {
+    result++;
+    fprintf(stderr,"Error in EXCLUSIV BIT OR part 1\n");
+  }
+
+  exclusiv_bit_or = 0; /* part 2: a single 1 input must leave the result non-zero */
+  logics[501]=1;
+
+  #pragma omp parallel
+  {
+    #pragma omp sections private(i) reduction(^:exclusiv_bit_or)
+    {
+      #pragma omp section
+      {
+        for(i=0;i<300;++i) {
+          exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+        }
+      }
+      #pragma omp section
+      {
+        for(i=300;i<700;++i) {
+          exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+        }
+      }
+      #pragma omp section
+      {
+        for(i=700;i<1000;++i) {
+          exclusiv_bit_or = exclusiv_bit_or ^ logics[i];
+        }
+      }
+    }
+  }
+  if(!exclusiv_bit_or) {
+    result++;
+    fprintf(stderr,"Error in EXCLUSIV BIT OR part 2\n");
+  }
+
+  /* Succeed (non-zero) only if no check above incremented result. */
+  return (result==0);
+}
+int main()
+{
+  /* Run the test REPETITIONS times; exit status = number of failed runs. */
+  int failures = 0;
+  int run = 0;
+
+  while (run < REPETITIONS) {
+    failures += test_omp_sections_reduction() ? 0 : 1;
+    ++run;
+  }
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/single/omp_single.c b/final/runtime/test/worksharing/single/omp_single.c
new file mode 100644
index 0000000..4963579
--- /dev/null
+++ b/final/runtime/test/worksharing/single/omp_single.c
@@ -0,0 +1,44 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int test_omp_single() /* returns 1 when the check passes, 0 otherwise */
+{
+  int nr_threads_in_single; /* raised on entry to the single body, lowered before leaving it */
+  int result;               /* running sum of nr_threads_in_single sampled after the decrement */
+  int nr_iterations;        /* number of times the single body was executed */
+  int i;
+
+  nr_threads_in_single = 0;
+  result = 0;
+  nr_iterations = 0;
+
+  #pragma omp parallel private(i)
+  {
+    for (i = 0; i < LOOPCOUNT; i++) { /* i is private, so every thread runs the whole loop */
+      #pragma omp single
+      {
+        #pragma omp flush
+        nr_threads_in_single++;
+        #pragma omp flush
+        nr_iterations++;
+        nr_threads_in_single--;
+        result = result + nr_threads_in_single; /* adds 0 when the counter is back at 0 */
+      }
+    }
+  }
+  return ((result == 0) && (nr_iterations == LOOPCOUNT)); /* body must have run LOOPCOUNT times in total */
+} /* end of test_omp_single */
+
+int main()
+{
+  /* The exit status is the number of failed repetitions; 0 means every run passed. */
+  int rep, failures = 0;
+
+  for (rep = REPETITIONS; rep > 0; rep--) {
+    if (test_omp_single() == 0) {
+      failures++;
+    }
+  }
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/single/omp_single_copyprivate.c b/final/runtime/test/worksharing/single/omp_single_copyprivate.c
new file mode 100644
index 0000000..2fece5c
--- /dev/null
+++ b/final/runtime/test/worksharing/single/omp_single_copyprivate.c
@@ -0,0 +1,60 @@
+// RUN: %libomp-compile-and-run
+#include "omp_testsuite.h"
+
+#define DEBUG_TEST 0
+
+int j;
+#pragma omp threadprivate(j)
+
+int test_omp_single_copyprivate() /* returns 1 when the check passes, 0 otherwise */
+{
+  int result;        /* accumulates (j - i) over all threads and iterations; 0 is expected */
+  int nr_iterations; /* number of times the single body was executed */
+
+  result = 0;
+  nr_iterations = 0;
+  #pragma omp parallel num_threads(4)
+  {
+    int i;
+    for (i = 0; i < LOOPCOUNT; i++)
+    {
+#if DEBUG_TEST
+         int thread;
+         thread = omp_get_thread_num ();
+#endif
+      #pragma omp single copyprivate(j)
+      {
+        nr_iterations++;
+        j = i; /* j is threadprivate; copyprivate(j) is what should make this value visible in the other threads */
+#if DEBUG_TEST
+        printf ("thread %d assigns, j = %d, i = %d\n", thread, j, i);
+#endif
+      }
+#if DEBUG_TEST
+      #pragma omp barrier
+#endif
+      #pragma omp critical
+      {
+#if DEBUG_TEST
+        printf ("thread = %d, j = %d, i = %d\n", thread, j, i);
+#endif
+        result = result + j - i; /* contributes 0 when this thread's j equals its i */
+      }
+      #pragma omp barrier
+    } /* end of for */
+  } /* end of parallel */
+  return ((result == 0) && (nr_iterations == LOOPCOUNT));
+}
+
+int main()
+{
+  /* The exit status is the number of failed repetitions; 0 means every run passed. */
+  int rep, failures = 0;
+
+  for (rep = REPETITIONS; rep > 0; rep--) {
+    if (test_omp_single_copyprivate() == 0) {
+      failures++;
+    }
+  }
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/single/omp_single_nowait.c b/final/runtime/test/worksharing/single/omp_single_nowait.c
new file mode 100644
index 0000000..22f8930
--- /dev/null
+++ b/final/runtime/test/worksharing/single/omp_single_nowait.c
@@ -0,0 +1,73 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+/*
+ * This test will hang if the nowait clause is not working properly.
+ *
+ * It relies on one thread skipping ahead to the last single construct to
+ * release the threads waiting in the first three single constructs.
+ */
+volatile int release;
+volatile int count;
+
+void wait_for_release_then_increment(int rank)
+{
+  /* Announce the caller, spin until `release` becomes non-zero, then bump `count`. */
+  fprintf(stderr, "Thread nr %d enters first section and waits.\n", rank);
+  while (!release) { /* busy-wait on the volatile flag */ }
+  #pragma omp atomic
+  count += 1;
+}
+
+void release_and_increment(int rank)
+{ /* Sets the flag that wait_for_release_then_increment() spins on, then bumps `count`. */
+  fprintf(stderr, "Thread nr %d sets release to 1\n", rank);
+  release = 1;
+  #pragma omp atomic
+  count += 1;
+}
+
+int test_omp_single_nowait() /* returns 1 when all four single bodies ran, 0 otherwise */
+{
+  release = 0; /* waiters spin until this becomes non-zero */
+  count = 0;   /* incremented once by every single body below */
+
+  #pragma omp parallel num_threads(4)
+  {
+    int rank;
+    rank = omp_get_thread_num ();
+    #pragma omp single nowait
+    {
+      wait_for_release_then_increment(rank); /* spins until `release` is set */
+    }
+    #pragma omp single nowait
+    {
+      wait_for_release_then_increment(rank); /* spins until `release` is set */
+    }
+    #pragma omp single nowait
+    {
+      wait_for_release_then_increment(rank); /* spins until `release` is set */
+    }
+
+    #pragma omp single
+    {
+      release_and_increment(rank); /* sets `release`, letting the three waiters above finish */
+    }
+  }
+  // Check to make sure all four singles were executed
+  return (count==4);
+} /* end of test_omp_single_nowait */
+
+int main()
+{
+  /* The exit status is the number of failed repetitions; 0 means every run passed. */
+  int rep, failures = 0;
+
+  for (rep = REPETITIONS; rep > 0; rep--) {
+    if (test_omp_single_nowait() == 0) {
+      failures++;
+    }
+  }
+  return failures;
+}
diff --git a/final/runtime/test/worksharing/single/omp_single_private.c b/final/runtime/test/worksharing/single/omp_single_private.c
new file mode 100644
index 0000000..a27f8de
--- /dev/null
+++ b/final/runtime/test/worksharing/single/omp_single_private.c
@@ -0,0 +1,57 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+int myit = 0;
+#pragma omp threadprivate(myit)
+int myresult = 0;
+#pragma omp threadprivate(myresult)
+
+int test_omp_single_private() /* returns 1 when the check passes, 0 otherwise */
+{
+  int nr_threads_in_single; /* shared here; the single construct below works on a private copy */
+  int result;               /* sum of the shared nr_threads_in_single as seen by every thread */
+  int nr_iterations;        /* sum of the per-thread myit counters */
+  int i;
+
+  myit = 0;
+  nr_threads_in_single = 0;
+  nr_iterations = 0;
+  result = 0;
+
+  #pragma omp parallel private(i)
+  {
+    myresult = 0; /* myresult and myit are threadprivate, so each thread resets its own copy */
+    myit = 0;
+    for (i = 0; i < LOOPCOUNT; i++) {
+      #pragma omp single private(nr_threads_in_single) nowait
+      {
+        nr_threads_in_single = 0; /* the private copy, not the shared variable */
+        #pragma omp flush
+        nr_threads_in_single++;
+        #pragma omp flush
+        myit++; /* counts the single bodies executed by this thread */
+        myresult = myresult + nr_threads_in_single; /* NOTE(review): myresult is never read in this function */
+      }
+    }
+    #pragma omp critical
+    {
+      result += nr_threads_in_single; /* shared copy again; presumably expected to still be 0 - confirm */
+      nr_iterations += myit;
+    }
+  }
+  return ((result == 0) && (nr_iterations == LOOPCOUNT));
+} /* end of test_omp_single_private */
+
+int main()
+{
+  /* The exit status is the number of failed repetitions; 0 means every run passed. */
+  int rep, failures = 0;
+
+  for (rep = REPETITIONS; rep > 0; rep--) {
+    if (test_omp_single_private() == 0) {
+      failures++;
+    }
+  }
+  return failures;
+}
diff --git a/final/runtime/tools/check-depends.pl b/final/runtime/tools/check-depends.pl
new file mode 100755
index 0000000..168c0cd
--- /dev/null
+++ b/final/runtime/tools/check-depends.pl
@@ -0,0 +1,505 @@
+#!/usr/bin/env perl
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib "$FindBin::Bin/lib";
+
+use tools;
+
+our $VERSION = "0.005";
+my $target_os;
+my $target_arch;
+
+# --------------------------------------------------------------------------------------------------
+# Output a parse error.
+#     $tool -- Name of tool.
+#     @bulk -- Output of the tool.
+#     $n    -- Zero-based index of the line that caused the parse error.
+sub parse_error($\@$) {
+    my ( $tool, $bulk, $n ) = @_;
+    # Indent every line of the tool output; the line with index $n gets a ">>> " marker instead.
+    my @marked =
+        map( ( $_ == $n ? ">>> " : "    " ) . $bulk->[ $_ ], 0 .. $#{ $bulk } );
+    # runtime_error() is provided by the tools module.
+    runtime_error( "Fail to parse $tool output:", @marked, "(eof)" );
+}; # sub parse_error
+
+
+# --------------------------------------------------------------------------------------------------
+# Linux* OS version of get_deps() parses output of ldd:
+#
+# $ ldd libname.so
+#   libc.so.6 => /lib64/libc.so.6 (0x00002b60fedd8000)
+#   libdl.so.2 => /lib64/libdl.so.2 (0x00002b60ff12b000)
+#   libpthread.so.0 => /lib64/libpthread.so.0 (0x00002b60ff32f000)
+#   /lib64/ld-linux-x86-64.so.2 (0x0000003879400000)
+#
+# Note: ldd prints all the dependencies, direct and indirect. (For example, if specified library
+# requires libdl.so, and libdl.so requires /lib/ld-linux.so, ldd prints both libdl.so and
+# /lib/ld-linux.so). If you do not want indirect dependencies, look at readelf tool.
+#
+sub get_deps_ldd($) {
+    # Returns every dependency (direct and indirect) that ldd reports for the given file.
+    my $lib = shift ( @_ );
+    my $tool = "ldd";
+    my @bulk;
+    my @deps;
+
+    execute( [ $tool, $lib ], -stdout => \@bulk );
+    debug( @bulk, "(eof)" );
+    # In the character classes "-" goes last so it is literal ("+-/" was a range that also matched ",").
+    foreach my $i ( 0 .. @bulk - 1 ) {
+        my $line = $bulk[ $i ];
+        if ( $line !~ m{^\s*(?:([_a-z0-9.+/-]*)\s+=>\s+)?([_a-z0-9.+/-]*)\s+\(0x[0-9a-z]*\)$}i ) {
+            parse_error( $tool, @bulk, $i );
+        }; # if
+        my $dep = ( defined( $1 ) ? $1 : $2 );    # Prefer the name on the left of "=>" when present.
+        push( @deps, $dep );
+    }; # foreach $i
+
+    return @deps;
+
+}; # sub get_deps_ldd
+
+
+# --------------------------------------------------------------------------------------------------
+# Another Linux* OS version of get_deps() parses output of readelf:
+#
+# $ readelf -d exports/lin_32e/lib/libomp.so
+#
+# Dynamic segment at offset 0x87008 contains 24 entries:
+#   Tag        Type                         Name/Value
+#  0x0000000000000001 (NEEDED)             Shared library: [libc.so.6]
+#  0x0000000000000001 (NEEDED)             Shared library: [libdl.so.2]
+#  0x0000000000000001 (NEEDED)             Shared library: [libpthread.so.0]
+#  0x000000000000000e (SONAME)             Library soname: [libomp.so]
+#  0x000000000000000d (FINI)               0x51caa
+#  0x0000000000000004 (HASH)               0x158
+#  0x0000000000000005 (STRTAB)             0x9350
+#  ...
+#
+# Note: In contrast to ldd, readelf shows only direct dependencies.
+#
+sub get_deps_readelf($) {
+    # Returns the library names found in the NEEDED entries printed by "readelf -d" for the file.
+    my $file = shift ( @_ );
+    my $tool;
+    my @bulk;
+    my @deps;
+
+    if ( defined( $target_arch ) and $target_arch eq "mic" ) {    # --arch may have been omitted.
+        $tool = "x86_64-k1om-linux-readelf";
+    } else {
+        $tool = "readelf";
+    }
+
+    # Force readelf to print English messages: with e.g. a French locale it prints
+    # "Librairie partagees" instead of "Shared library". LC_ALL is set too because it takes
+    # precedence over LANG, so setting LANG alone has no effect when LC_ALL is already set.
+    $ENV{ LANG } = $ENV{ LC_ALL } = "C";
+
+    execute( [ $tool, "-d", $file ], -stdout => \@bulk );
+    debug( @bulk, "(eof)" );
+
+    my $i = 0;
+    # Parse header.
+    ( $i < @bulk and $bulk[ $i ] =~ m{^\s*$} )
+        or parse_error( $tool, @bulk, $i );
+    ++ $i;
+    if ( $i == @bulk - 1 and $bulk[ $i ] =~ m{^There is no dynamic section in this file\.\s*$} ) {
+        # This is not dynamic executable => no dependencies.
+        return @deps;
+    }; # if
+    ( $i < @bulk and $bulk[ $i ] =~ m{^Dynamic (?:segment|section) at offset 0x[0-9a-f]+ contains \d+ entries:\s*$} )
+        or parse_error( $tool, @bulk, $i );
+    ++ $i;
+    ( $i < @bulk and $bulk[ $i ] =~ m{^\s*Tag\s+Type\s+Name/Value\s*$} )
+        or parse_error( $tool, @bulk, $i );
+    ++ $i;
+    # Parse body.
+    while ( $i < @bulk ) {
+        my $line = $bulk[ $i ];
+        if ( $line !~ m{^\s*0x[0-9a-f]+\s+\(([_A-Z0-9]+)\)\s+(.*)\s*$}i ) {
+            parse_error( $tool, @bulk, $i );
+        }; # if
+        my ( $type, $value ) = ( $1, $2 );
+        if ( $type eq "NEEDED" ) {    # Entries of any other type are parsed but ignored.
+            if ( $value !~ m{\AShared library: \[(.*)\]\z} ) {
+                parse_error( $tool, @bulk, $i );
+            }; # if
+            my $dep = $1;
+            push( @deps, $dep );
+        }; # if
+        ++ $i;
+    }; # while
+
+    return @deps;
+
+}; # sub get_deps_readelf
+
+
+# --------------------------------------------------------------------------------------------------
+# OS X* version of get_deps() parses output of otool:
+#
+# $ otool -L libname.dylib
+# exports/mac_32/lib.thin/libomp.dylib:
+#        libomp.dylib (compatibility version 5.0.0, current version 5.0.0)
+#        /usr/lib/libSystem.B.dylib (compatibility version 1.0.0, current version 88.1.3)
+#
+sub get_deps_otool($) {
+    # Returns the libraries listed by "otool -L"; for a .dylib the line naming the library itself is skipped.
+    my $file = shift ( @_ );
+    my $name = get_file( $file );
+    my $tool = "otool";
+    my @bulk;
+    my @deps;
+
+    if ( defined( $target_arch ) and $target_arch eq "32e" ) {    # --arch may have been omitted.
+        # On older (Tiger) systems otool does not recognize 64-bit binaries, so try to locate
+        # otool64.
+        my $path = which( "otool64" );
+        if ( defined ( $path ) ) {
+            $tool = "otool64";
+        }; # if
+    }; # if
+
+    execute( [ $tool, "-L", $file ], -stdout => \@bulk );
+    debug( @bulk, "(eof)" );
+
+    my $i = 0;
+    # Parse the first one or two lines separately.
+    ( $i < @bulk and $bulk[ $i ] =~ m{^\Q$file\E:$} )
+        or parse_error( $tool, @bulk, $i );
+    ++ $i;
+    if ( $name =~ m{\.dylib\z} ) {
+        # Added "@rpath/" enables dynamic load of the library designated at link time.
+        $name = '@rpath/' . $name;
+        # In case of dynamic library otool print the library itself as a dependent library.
+        ( $i < @bulk and $bulk[ $i ] =~ m{^\s+\Q$name\E\s+\(compatibility version.*\)$} )
+            or parse_error( $tool, @bulk, $i );
+        ++ $i;
+    }; # if
+
+    # Then parse the rest.
+    while ( $i < @bulk ) {
+        my $line = $bulk[ $i ];
+        if ( $line !~ m/^\s*(.*)\s+\(compatibility version\s.*\)$/ ) {
+            parse_error( $tool, @bulk, $i );
+        }; # if
+        my ( $dep ) = ( $1 );    # Everything before the "(compatibility version ...)" suffix.
+        push( @deps, $dep );
+        ++ $i;
+    }; # while
+
+    return @deps;
+
+}; # sub get_deps_otool
+
+
+# --------------------------------------------------------------------------------------------------
+# Windows* OS version of get_deps() parses output of link:
+#
+# > link -dump -dependents libname.dll
+# Microsoft (R) COFF/PE Dumper Version 8.00.40310.39
+# Copyright (C) Microsoft Corporation.  All rights reserved.
+# Dump of file S:\Projects.OMP\users\omalyshe\omp\libomp\exports\win_64\lib\libompmd.dll
+# File Type: DLL
+#   Image has the following dependencies:
+#     KERNEL32.dll
+#   Summary
+#         C000 .data
+#         6000 .pdata
+#        18000 .rdata
+#        ...
+#
+# > link -dump -directives libname.lib
+# Microsoft (R) COFF/PE Dumper Version 8.00.40310.39
+# Copyright (C) Microsoft Corporation.  All rights reserved.
+# Dump of file S:\Projects.OMP\users\omalyshe\omp\libomp\exports\win_32e\lib\libimp5mt.lib
+# File Type: LIBRARY
+#   Linker Directives
+#   -----------------
+#   -defaultlib:"uuid.lib"
+#   -defaultlib:"uuid.lib"
+#   .....
+#   Summary
+#       3250 .bss
+#       3FBC .data
+#         34 .data1
+#       ....
+sub get_deps_link($) {
+    # Returns dependencies of a .lib, .dll or .exe file, parsed from the output of "link /dump".
+    my ( $lib ) = @_;
+    my $tool = "link";
+    my @bulk;
+    my @deps;
+
+    my $ext = lc( get_ext( $lib ) );
+    if ( $ext !~ m{\A\.(?:lib|dll|exe)\z}i ) {
+        runtime_error( "Incorrect file is specified: `$lib'; only `lib', `dll' or `exe' file expected" );
+    }; # if
+
+    execute(
+        [ $tool, "/dump", ( $ext eq ".lib" ? "/directives" : "/dependents" ), $lib ],
+        -stdout => \@bulk
+    );
+
+    debug( @bulk, "(eof)" );
+
+    my $i = 0;    # Index of the output line being parsed; the eight checks below consume the header.
+    ( $i < @bulk and $bulk[ $i ] =~ m{^Microsoft \(R\) COFF\/PE Dumper Version.*$} ) or parse_error( $tool, @bulk, $i ); ++ $i;
+    ( $i < @bulk and $bulk[ $i ] =~ m{^Copyright \(C\) Microsoft Corporation\..*$} ) or parse_error( $tool, @bulk, $i ); ++ $i;
+    ( $i < @bulk and $bulk[ $i ] =~ m{^\s*$}                                       ) or parse_error( $tool, @bulk, $i ); ++ $i;
+    ( $i < @bulk and $bulk[ $i ] =~ m{^\s*$}                                       ) or parse_error( $tool, @bulk, $i ); ++ $i;
+    ( $i < @bulk and $bulk[ $i ] =~ m{^Dump of file\s\Q$lib\E$}                    ) or parse_error( $tool, @bulk, $i ); ++ $i;
+    ( $i < @bulk and $bulk[ $i ] =~ m{^\s*$}                                       ) or parse_error( $tool, @bulk, $i ); ++ $i;
+    ( $i < @bulk and $bulk[ $i ] =~ m{^File Type:\s(.*)$}                          ) or parse_error( $tool, @bulk, $i ); ++ $i;
+    ( $i < @bulk and $bulk[ $i ] =~ m{^\s*$}                                       ) or parse_error( $tool, @bulk, $i ); ++ $i;
+
+    if ( $ext eq ".lib" ) {
+
+        my %deps;    # Used as a set, so a library named by several directives is reported once.
+        while ( $i < @bulk ) {
+            my $line = $bulk[ $i ];
+            if ( 0 ) {    # Dummy first branch, so that every real case below is an elsif.
+            } elsif ( $line =~ m{^\s*[-/]defaultlib\:(.*)\s*$}i ) {
+                my $dep = $1;
+                # Normalize library name:
+                $dep = lc( $1 );              # Convert to lower case.
+                $dep =~ s{\A"(.*)"\z}{$1};    # Drop surrounding quotes (if any).
+                $dep =~ s{\.lib\z}{};         # Drop .lib suffix (if any).
+                $deps{ $dep } = 1;
+            } elsif ( $line =~ m{^\s*Linker Directives\s*$} ) {    # Section title: skipped.
+            } elsif ( $line =~ m{^\s*-+\s*$} ) {                   # Dashed underline: skipped.
+            } elsif ( $line =~ m{^\s*/alternatename\:.*$} ) {      # Recognized directive: skipped.
+            } elsif ( $line =~ m{^\s*$} ) {                        # Empty line: skipped.
+            } elsif ( $line =~ m{^\s*/FAILIFMISMATCH\:.*$} ) {
+                # This directive is produced only by _MSC_VER=1600
+            } elsif ( $line =~ m{^\s*Summary\s*$} ) {
+                last;    # The "Summary" section ends the list of directives.
+            } else {
+                parse_error( $tool, @bulk, $i );
+            }; # if
+            ++ $i;
+        } # while
+        @deps = keys( %deps );
+
+    } else {
+
+        ( $i < @bulk and $bulk[ $i ] =~ m{\s*Image has the following dependencies\:$} )
+            or parse_error( $tool, @bulk, $i );
+        ++ $i;
+        while ( $i < @bulk ) {
+            my $line = $bulk[ $i ];
+            if ( 0 ) {    # Dummy first branch, so that every real case below is an elsif.
+            } elsif ( $line =~ m{^\s*$} ) {
+                # Ignore empty lines.
+            } elsif ( $line =~ m{^\s*(.*\.dll)$}i ) {
+                my $dep = lc( $1 );    # DLL names are reported in lower case.
+                push( @deps, $dep );
+            } elsif ( $line =~ m{^\s*Summary$} ) {
+                last;    # The "Summary" section ends the list of dependencies.
+            } else {
+                parse_error( $tool, @bulk, $i );
+            }; # if
+            ++ $i;
+        }; # while
+
+    }; # if
+
+    return @deps;
+
+}; # sub get_deps_link
+
+
+# --------------------------------------------------------------------------------------------------
+# Main.
+# --------------------------------------------------------------------------------------------------
+
+# Parse command line.
+my $expected;
+my $bare;
+Getopt::Long::Configure( "permute" );
+get_options(
+    "os=s"       => \$target_os,
+    "arch=s"     => \$target_arch,
+    "bare"       => \$bare,
+    "expected=s" => \$expected,
+);
+# --os is optional: when it is omitted, derive the target OS from the host OS name in $^O.
+defined( $target_os )
+    or $target_os = { linux => "lin", darwin => "mac", MSWin32 => "win", cygwin => "win" }->{ $^O } || $^O;
+my @expected;
+if ( defined( $expected ) and $expected ne "none" ) {
+    @expected = sort( split( ",", $expected ) );
+    if ( $target_os eq "win" ) {    # Windows* OS dependencies are compared in lower case.
+        @expected = map( lc( $_ ), @expected );
+    }; # if
+}; # if
+if ( @ARGV < 1 ) {
+    cmdline_error( "Specify a library name to check for dependencies" );
+}; # if
+if ( @ARGV > 1 ) {
+    cmdline_error( "Too many arguments" );
+}; # if
+my $lib = shift( @ARGV );
+if ( not -e $lib ){
+    runtime_error( "Specified file does not exist: \"$lib\"" );
+}; # if
+
+# Select appropriate get_deps implementation.
+if ( $target_os eq "lin" ) {
+    *get_deps = \*get_deps_readelf;
+} elsif ( $target_os eq "mac" ) {
+    *get_deps = \*get_deps_otool;
+} elsif ( $target_os eq "win" ) {
+    *get_deps = \*get_deps_link;
+} else {
+    runtime_error( "OS \"$target_os\" not supported" );
+}; # if
+
+# Do the work.
+my @deps = sort( get_deps( $lib ) );
+if ( $bare ) {
+    print( map( "$_\n", @deps ) );
+} else {
+    info( "Dependencies:", @deps ? map( "    $_", @deps ) : "(none)" );
+}; # if
+if ( defined( $expected ) ) {
+    my %deps = map( ( $_ => 1 ), @deps );
+    foreach my $dep ( @expected ) {
+        delete( $deps{ $dep } );    # Whatever is left afterwards was not on the expected list.
+    }; # foreach
+    my @unexpected = sort( keys( %deps ) );
+    if ( @unexpected ) {
+        runtime_error( "Unexpected dependencies:", map( "    $_", @unexpected ) );
+    }; # if
+}; # if
+
+exit( 0 );
+
+__END__
+
+=pod
+
+=head1 NAME
+
+B<check-depends.pl> -- Check dependencies for a specified library.
+
+=head1 SYNOPSIS
+
+B<check-depends.pl> I<OPTIONS>... I<library>
+
+=head1 DESCRIPTION
+
+C<check-depends.pl> finds direct dependencies for a specified library. The list of actual dependencies
+is sorted alphabetically and printed. If a list of expected dependencies is specified, the script
+checks that the library has only allowed dependencies. If there are unexpected dependencies, the script
+issues an error message and exits with a non-zero code.
+
+Linux* OS and OS X*: The script finds dependencies only for dynamic libraries. Windows* OS: The script
+finds dependencies for either static or dynamic libraries.
+
+The script uses external tools. On Linux* OS, it runs F<readelf>, on OS X* -- F<otool> (or F<otool64>),
+on Windows* OS -- F<link>.
+
+On Windows* OS, dependencies are printed in lower case, and the case of expected dependencies is ignored.
+
+=head1 OPTIONS
+
+=over
+
+=item B<--bare>
+
+Do not use fancy formatting; produce plain, bare output: just a list of libraries,
+a library per line.
+
+=item B<--expected=>I<list>
+
+I<list> is comma-separated list of expected dependencies (or C<none>).
+If C<--expected> option specified, C<check-depends.pl> checks the specified library
+has only expected dependencies.
+
+=item B<--os=>I<str>
+
+Specify target OS (tool to use) manually.
+Useful for cross-build, when host OS is not the same as target OS.
+I<str> should be either C<lin>, C<mac>, or C<win>.
+
+=back
+
+=head2 Standard Options
+
+=over
+
+=item B<--help>
+
+Print short help message and exit.
+
+=item B<--doc>
+
+=item B<--manual>
+
+Print full documentation and exit.
+
+=item B<--quiet>
+
+Do not output informational messages.
+
+=item B<--version>
+
+Print version and exit.
+
+=back
+
+=head1 ARGUMENTS
+
+=over
+
+=item I<library>
+
+A name of library to find or check dependencies.
+
+=back
+
+=head1 EXAMPLES
+
+Just print library dependencies (Windows* OS):
+
+    > check-depends.pl exports/win_32/lib/libompmd.dll
+    check-depends.pl: (i) Dependencies:
+    check-depends.pl: (i)     kernel32.dll
+
+Print library dependencies, use bare output (Linux* OS):
+
+    $ check-depends.pl --bare exports/lin_32e/lib/libomp_db.so
+    libc.so.6
+    libdl.so.2
+    libpthread.so.0
+
+Check the library does not have any dependencies (OS X*):
+
+    $ check-depends.pl --expected=none exports/mac_32/lib/libomp.dylib
+    check-depends.pl: (i) Dependencies:
+    check-depends.pl: (i)     /usr/lib/libSystem.B.dylib
+    check-depends.pl: (x) Unexpected dependencies:
+    check-depends.pl: (x)     /usr/lib/libSystem.B.dylib
+    $ echo $?
+    2
+
+=cut
+
+# end of file #
+
diff --git a/final/runtime/tools/check-execstack.pl b/final/runtime/tools/check-execstack.pl
new file mode 100755
index 0000000..34f77e1
--- /dev/null
+++ b/final/runtime/tools/check-execstack.pl
@@ -0,0 +1,145 @@
+#!/usr/bin/perl
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib "$FindBin::Bin/lib";
+
+use tools;
+
+our $VERSION = "0.002";
+my $target_arch;
+
+sub execstack($) {
+    # Checks one file: raises a runtime error if the GNU_STACK/STACK line printed by
+    # "readelf -l -W" is missing, duplicated, unparsable, or carries the "E" (execute) flag.
+    my ( $file ) = @_;
+    my @output;
+    my @stack;
+    # $target_arch is undefined when --arch is not given; only "mic" needs the cross readelf.
+    my $tool = ( defined( $target_arch ) and $target_arch eq "mic" )
+        ? "x86_64-k1om-linux-readelf"
+        : "readelf";
+    execute( [ $tool, "-l", "-W", $file ], -stdout => \@output );
+    @stack = grep( $_ =~ m{\A\s*(?:GNU_)?STACK\s+}, @output );
+    if ( not @stack ) {
+        # Interpret missed "STACK" line as error.
+        runtime_error( "$file: No stack segment found; looks like stack would be executable." );
+    }; # if
+    if ( @stack > 1 ) {
+        runtime_error( "$file: More than one stack segment found.", "readelf output:", @output, "(eof)" );
+    }; # if
+    # Typical stack lines are:
+    # Linux* OS IA-32 architecture:
+    #    GNU_STACK      0x000000 0x00000000 0x00000000 0x00000 0x00000 RWE 0x4
+    # Linux* OS Intel(R) 64:
+    #    GNU_STACK      0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RWE 0x8
+    if ( $stack[ 0 ] !~ m{\A\s*(?:GNU_)?STACK(?:\s+0x[0-9a-f]+){5}\s+([R ][W ][E ])\s+0x[0-9a-f]+\s*\z} ) {
+        runtime_error( "$file: Cannot parse stack segment line:", ">>> $stack[ 0 ]" );
+    }; # if
+    my $attrs = $1;    # The three-character flags column, e.g. "RW " or "RWE".
+    if ( $attrs =~ m{E} ) {
+        runtime_error( "$file: Stack is executable" );
+    }; # if
+}; # sub execstack
+
+get_options(
+    "arch=s" => \$target_arch,    # Read by execstack() to choose which readelf binary to run.
+);
+
+foreach my $file ( @ARGV ) {    # Every remaining command-line argument is a file to check.
+    execstack( $file );
+}; # foreach $file
+
+exit( 0 );    # Presumably reached only if no check above ended the script via runtime_error().
+
+__END__
+
+=pod
+
+=head1 NAME
+
+B<check-execstack.pl> -- Check whether stack is executable, issue an error if so.
+
+=head1 SYNOPSIS
+
+B<check-execstack.pl> I<option>... I<file>...
+
+=head1 DESCRIPTION
+
+The script checks whether the stack of the specified executable file is executable, and issues an
+error if it is. If the stack is not executable, the script exits silently with zero exit code.
+
+The script runs C<readelf> utility to get information about specified executable file. So, the
+script fails if C<readelf> is not available. Effectively it means the script works only on Linux* OS
+(and, probably, Intel(R) Many Integrated Core Architecture).
+
+=head1 OPTIONS
+
+=over
+
+=item Standard Options
+
+=over
+
+=item B<--doc>
+
+=item B<--manual>
+
+Print full help message and exit.
+
+=item B<--help>
+
+Print short help message and exit.
+
+=item B<--usage>
+
+Print very short usage message and exit.
+
+=item B<--verbose>
+
+Do print informational messages.
+
+=item B<--version>
+
+Print program version and exit.
+
+=item B<--quiet>
+
+Work quiet, do not print informational messages.
+
+=back
+
+=back
+
+=head1 ARGUMENTS
+
+=over
+
+=item I<file>
+
+A name of executable or shared object to check. Multiple files may be specified.
+
+=back
+
+=head1 EXAMPLES
+
+Check libomp.so library:
+
+    $ check-execstack.pl libomp.so
+
+=cut
+
+# end of file #
+
diff --git a/final/runtime/tools/check-instruction-set.pl b/final/runtime/tools/check-instruction-set.pl
new file mode 100755
index 0000000..455210c
--- /dev/null
+++ b/final/runtime/tools/check-instruction-set.pl
@@ -0,0 +1,320 @@
+#!/usr/bin/perl
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib "$FindBin::Bin/lib";
+
+use tools;
+
+our $VERSION = "0.004";
+my $target_os;
+my $target_arch;
+my $target_mic_arch;
+
+my $hex = qr{[0-9a-f]}i;    # hex digit.
+
+# mic-specific details.
+
+sub bad_mic_fmt($) {
+    # Returns true when the object format name $fmt is NOT the one expected for the selected
+    # --mic-arch: "knf" expects elf64-l1om, "knc" expects elf64-k1om (the final "m" is optional
+    # in both patterns). With any other --mic-arch value every format is reported as bad.
+    my $fmt = shift( @_ );
+    if ( "$target_mic_arch" eq "knf" ) {
+        return $fmt !~ m{\Aelf64-l1om?\z};
+    }; # if
+    if ( "$target_mic_arch" eq "knc" ) {
+        return $fmt !~ m{\Aelf64-k1om?\z};
+    }; # if
+    return 1;
+}; # sub bad_mic_fmt
+
+# Undesired instructions for mic: all x87 and some other.
+# AC: Since compiler 2010-06-30 x87 instructions are supported, removed the check of x87.
+my $mic_bad_re;
+sub bad_mic_instr($$) {
+    # Flags a mnemonic that matches $mic_bad_re. Unless --mic-arch is "knc", an operand
+    # string that mentions an xmm register is flagged as well.
+    my ( $instr, $args ) = @_;
+    my $is_knc = ( "$target_mic_arch" eq "knc" );
+    # The result is only used as a boolean by check_file().
+    return 1 if ( defined( $instr ) and $instr =~ $mic_bad_re );
+    return ( not $is_knc and defined( $args ) and $args =~ m{xmm}i );
+}; # sub bad_mic_instr
+
+# lin_32-specific details.
+
+sub bad_ia32_fmt($) {
+    # True unless the object format name passed in is exactly "elf32-i386".
+    return $_[ 0 ] !~ m{\Aelf32-i386\z};
+}; # sub bad_ia32_fmt
+
+my @sse2 =
+    qw{
+        movapd movupd movhpd movlpd movmskpd movsd
+        addpd addsd subpd subsd mulpd mulsd divpd divsd sqrtpd sqrtsd maxpd maxsd minpd minsd
+        andpd andnpd orpd xorpd
+        cmppd cmpsd comisd ucomisd
+        shufpd unpckhpd unpcklpd
+        cvtpd2pi cvttpd2pi cvtpi2pd cvtpd2dq cvttpd2dq cvtdq2pd cvtps2pd cvtpd2ps cvtss2sd cvtsd2ss
+        cvtsd2si cvttsd2si cvtsi2sd cvtdq2ps cvtps2dq cvttps2dq movdqa movdqu movq2dq movdq2q
+        pmuludq paddq psubq pshuflw pshufhw pshufd pslldq psrldq punpckhqdq punpcklqdq clflush
+        lfence mfence maskmovdqu movntpd movntdq movnti
+    };
+my @sse3 =
+    qw{
+        fisttp lddqu addsubps addsubpd haddps hsubps haddpd hsubpd movshdup movsldup movddup monitor
+        mwait
+    };
+my @ssse3 =
+    qw{
+        phaddw phaddsw phaddd phsubw phsubsw phsubd pabsb pabsw pabsd pmaddubsw pmulhrsw pshufb
+        psignb psignw psignd palignr
+    };
+my @sse4 =
+    (
+        # SSE4.1
+        qw{
+            pmulld pmuldq dppd dpps movntdqa blendpd blendps blendvpd blendvps pblendvb pblendw pminuw
+            pminud pminsb pminsd pmaxuw pmaxud pmaxsb pmaxsd roundps roundpd roundss roundsd extractps
+            insertps pinsrb pinsrd pinsrq pextrb pextrw pextrd pextrq pmovsxbw pmovzxbw pmovsxbd
+            pmovzxbd pmovsxwd pmovzxwd pmovsxbq pmovzxbq pmovsxwq pmovzxwq pmovsxdq pmovzxdq mpsadbw
+            phminposuw ptest pcmpeqq packusdw
+        },
+        # SSE4.2
+        qw{
+            pcmpestri pcmpestrm pcmpistri pcmpistrm pcmpgtq crc32 popcnt
+        }
+    );
+
+# Undesired instructions for IA-32 architecture: Pentium 4 (SSE2) and newer.
+# TODO: It would be much more reliable to list *allowed* instructions rather than list undesired
+# instructions. In such a case the list will be stable and not require update when SSE5 is released.
+my @ia32_bad_list = ( @sse2, @sse3, @ssse3, @sse4 );
+
+my $ia32_bad_re = qr{@{[ "^(?:" . join( "|", @ia32_bad_list ) . ")" ]}}i;
+
+sub bad_ia32_instr($$) {
+    my $instr = shift( @_ );    # The second argument (operands) is accepted but not examined.
+    return ( defined( $instr ) && $instr =~ $ia32_bad_re );
+}; # sub bad_ia32_instr
+
+sub check_file($;$$) {
+    # Disassembles $file with objdump and counts instructions rejected by bad_instr(); returns the count.
+    my ( $file, $show_instructions, $max_instructions ) = @_;
+    my @bulk;
+    # Scanning stops once $max_instructions bad instructions have been found (100 by default).
+    if ( not defined( $max_instructions ) ) {
+        $max_instructions = 100;
+    }; # if
+    # Only the mic target needs the k1om cross objdump; any other target uses the native objdump.
+    my $tool = ( defined( $target_arch ) and $target_arch eq "mic" ) ? "x86_64-k1om-linux-objdump" : "objdump";
+    execute( [ $tool, "-d", $file ], -stdout => \@bulk );
+    my $n = 0;
+    my $errors = 0;
+    my $current_func  = "";    # Name of current function.
+    my $reported_func = "";    # name of last reported function.
+    foreach my $line ( @bulk ) {
+        ++ $n;
+        if ( 0 ) {
+        } elsif ( $line =~ m{^\s*$} ) {
+            # Empty line.
+            # Ignore.
+        } elsif ( $line =~ m{^In archive (.*?):\s*$} ) {
+            # In archive libomp.a:
+        } elsif ( $line =~ m{^(?:.*?):\s*file format (.*?)\s*$} ) {
+            # libomp.so:     file format elf64-x86-64-freebsd
+            # kmp_ftn_cdecl.o:     file format elf64-x86-64
+            my $fmt = $1;
+            if ( bad_fmt( $fmt ) ) {
+                runtime_error( "Invalid file format: $fmt." );
+            }; # if
+        } elsif ( $line =~ m{^Disassembly of section (.*?):\s*$} ) {
+            # Disassembly of section .plt:
+        } elsif ( $line =~ m{^$hex+ <([^>]+)>:\s*$} ) {
+            # 0000000000017e98 <__kmp_str_format@plt-0x10>:
+            $current_func = $1;
+        } elsif ( $line =~ m{^\s*\.{3}\s*$} ) {
+        } elsif ( $line =~ m{^\s*($hex+):\s+($hex$hex(?: $hex$hex)*)\s+(?:lock\s+|rex[.a-z]*\s+)?([^ ]+)(?:\s+([^#]+?))?\s*(?:#|$)} ) {
+            #   17e98:       ff 35 fa 7d 26 00       pushq  0x267dfa(%rip)        # 27fc98 <_GLOBAL_OFFSET_TABLE>
+            my ( $addr, $dump, $instr, $args ) = ( $1, $2, $3, $4 );
+            # Check this is not a bad instruction and xmm registers are not used.
+            if ( bad_instr( $instr, $args ) ) {
+                if ( $errors == 0 ) {
+                    warning( "Invalid instructions found in `$file':" );
+                }; # if
+                if ( $current_func ne $reported_func ) {
+                    warning( "    $current_func" );
+                    $reported_func = $current_func;
+                }; # if
+                ++ $errors;
+                if ( $show_instructions ) {
+                    warning( "        $line" );
+                }; # if
+                if ( $errors >= $max_instructions ) {
+                    info( "$errors invalid instructions found; scanning stopped." );
+                    last;
+                }; # if
+            }; # if
+        } else {
+            runtime_error( "Error parsing objdump output line $n:\n>>>> $line\n" );
+        }; # if
+    }; # foreach $line
+
+    return $errors;
+
+}; # sub check_file
+
+# --------------------------------------------------------------------------------------------------
+
+# Parse command line.
+my $max_instructions;
+my $show_instructions;
+get_options(
+    "os=s"               => \$target_os,
+    "arch=s"             => \$target_arch,
+    "mic-arch=s"         => \$target_mic_arch,
+    "max-instructions=i" => \$max_instructions,
+    "show-instructions!" => \$show_instructions,
+);
+my $target_platform = $target_os . "_" . $target_arch;    # E.g. "lin_mic" or "lin_32"; matched below.
+if ( "$target_os" eq "lin" and "$target_mic_arch" eq "knf" ) {    # This list differs only by lacking "cmov".
+    $mic_bad_re = qr{^(?:pause|[slm]fence|scatter|gather|cmpxchg16b|clevict[12])}i;
+} else {
+    $mic_bad_re = qr{^(?:pause|[slm]fence|scatter|gather|cmov|cmpxchg16b|clevict[12])}i;
+};
+if ( 0 ) {    # Dummy first branch, so that every real case below is an elsif.
+} elsif ( $target_platform eq "lin_mic" ) {
+    *bad_instr = \*bad_mic_instr;    # check_file() calls bad_instr() and bad_fmt() through these aliases.
+    *bad_fmt   = \*bad_mic_fmt;
+} elsif ( $target_platform eq "lin_32" ) {
+    *bad_instr = \*bad_ia32_instr;
+    *bad_fmt   = \*bad_ia32_fmt;
+} else {
+    runtime_error( "Only works on lin_32 and lin_mic platforms." );
+}; # if
+
+# Do the work.
+my $rc = 0;
+if ( not @ARGV ) {
+    info( "No arguments specified -- nothing to do." );
+} else {
+    foreach my $arg ( @ARGV ) {
+        my $errs = check_file( $arg, $show_instructions, $max_instructions );
+        if ( $errs > 0 ) {
+            $rc = 3;    # Exit status 3: at least one file contained invalid instructions.
+        }; # if
+    }; # foreach $arg
+}; # if
+
+exit( $rc );
+
+__END__
+
+=pod
+
+=head1 NAME
+
+B<check-instruction-set.pl> -- Make sure binary file does not contain undesired instructions.
+
+=head1 SYNOPSIS
+
+B<check-instruction-set.pl> I<option>... I<file>...
+
+=head1 OPTIONS
+
+=over
+
+=item B<--arch=>I<arch>
+
+Specify target architecture.
+
+=item B<--max-instructions=>I<number>
+
+Stop scanning if I<number> invalid instructions found. 100 by default.
+
+=item B<--os=>I<os>
+
+Specify target OS.
+
+=item B<-->[B<no->]B<show-instructions>
+
+Show invalid instructions found in the file. By default, instructions are not shown.
+
+=item Standard Options
+
+=over
+
+=item B<--doc>
+
+=item B<--manual>
+
+Print full help message and exit.
+
+=item B<--help>
+
+Print short help message and exit.
+
+=item B<--usage>
+
+Print very short usage message and exit.
+
+=item B<--verbose>
+
+Do print informational messages.
+
+=item B<--version>
+
+Print program version and exit.
+
+=item B<--quiet>
+
+Work quiet, do not print informational messages.
+
+=back
+
+=back
+
+=head1 ARGUMENTS
+
+=over
+
+=item I<file>
+
+File (object file or library, either static or dynamic) to check.
+
+=back
+
+=head1 DESCRIPTION
+
+The script runs F<objdump> utility to get disassembler listing and checks the file does not contain
+unwanted instructions.
+
+Currently the script works only for:
+
+=over
+
+=item C<lin_mic>
+
+Intel(R) Many Integrated Core Architecture target OS. Undesired instructions are: all x87 instructions and some others.
+
+=item C<lin_32>
+
+Undesired instructions are instructions not valid for Pentium 3 processor (SSE2 and newer).
+
+=back
+
+=cut
+
diff --git a/final/runtime/tools/generate-def.pl b/final/runtime/tools/generate-def.pl
new file mode 100755
index 0000000..754243c
--- /dev/null
+++ b/final/runtime/tools/generate-def.pl
@@ -0,0 +1,323 @@
+#!/usr/bin/env perl
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# Some pragmas.
+use strict;          # Restrict unsafe constructs.
+use warnings;        # Enable all warnings.
+
+use FindBin;
+use lib "$FindBin::Bin/lib";
+
+use tools;
+
+our $VERSION = "0.004";
+
+#
+# Subroutines.
+#
+
+# parse_input -- Read export definitions from a file, honoring conditional directives.
+#
+# Arguments:
+#     $input -- name of the input file; it is read with read_file().
+#     %defs  -- names defined on the command line (passed by reference due to the prototype).
+#               A name counts as defined when its value is true.
+#
+# Returns a hash mapping entry name to { ordinal => number | "DATA" | undef, obsolete => bool }.
+# Entries inside a conditional part whose condition is false are ignored. Every problem found in
+# the input is reported through runtime_error() (presumably it does not return -- TODO confirm).
+sub parse_input($\%) {
+
+    my ( $input, $defs ) = @_;
+    my @bulk = read_file( $input );
+    my %entries;     # Entry name => description.
+    my %ordinals;    # Ordinal => name of the entry which uses it.
+    my @dirs;        # Stack of currently open %ifdef/%ifndef directives.
+    my $value = 1;   # True if the current line belongs to an active conditional part.
+
+    my $error =
+        sub {
+            my ( $msg, $l, $line ) = @_;
+            # The last line of a file may lack a newline; normalize to keep the layout stable.
+            ( my $text = $line ) =~ s{\n\z}{};
+            runtime_error(
+                "Error parsing file \"$input\" line $l:\n" .
+                "    $text\n" .
+                # Use an empty string, not an empty list: the latter is undef in scalar context
+                # and triggers "uninitialized value" warning.
+                ( $msg ? $msg . "\n" : "" )
+            );
+        }; # sub
+
+    # Note: every pattern below accepts a comment, a newline, or the end of string as a terminator,
+    # so the last line of a file is parsed properly even if it is not newline-terminated.
+    my $n = 0;    # Line number.
+    foreach my $line ( @bulk ) {
+        ++ $n;
+        if ( 0 ) {
+        } elsif ( $line =~ m{^\s*(?:#|\n|\z)} ) {
+            # Empty line or comment. Skip it.
+        } elsif ( $line =~ m{^\s*%} ) {
+            # A directive.
+            if ( 0  ) {
+            } elsif ( $line =~ m{^\s*%\s*if(n)?def\s+([A-Za-z0-9_]+)\s*(?:#|\n|\z)} ) {
+                my ( $negation, $name ) = ( $1, $2 );
+                # Remember the state of the enclosing part to restore it at matching %endif.
+                my $dir = { n => $n, line => $line, name => $name, value => $value };
+                push( @dirs, $dir );
+                # Nested part is active only if enclosing part is active as well.
+                $value = ( $value and ( $negation xor $defs->{ $name } ) );
+            } elsif ( $line =~ m{^\s*%\s*endif\s*(?:#|\n|\z)} ) {
+                if ( not @dirs ) {
+                    $error->( "Orphan %endif directive.", $n, $line );
+                }; # if
+                my $dir = pop( @dirs );
+                $value = $dir->{ value };
+            } else {
+                $error->( "Bad directive.", $n, $line );
+            }; # if
+        } elsif ( $line =~ m{^\s*(-)?\s*([A-Za-z0-9_]+)(?:\s+(\d+|DATA))?\s*(?:#|\n|\z)} ) {
+            # Export definition; leading minus sign marks an obsolete symbol.
+            my ( $obsolete, $entry, $ordinal ) = ( $1, $2, $3 );
+            if ( $value ) {
+                if ( exists( $entries{ $entry } ) ) {
+                    $error->( "Entry \"$entry\" has already been specified.", $n, $line );
+                }; # if
+                $entries{ $entry } = { ordinal => $ordinal, obsolete => defined( $obsolete ) };
+                if ( defined( $ordinal ) and $ordinal ne "DATA" ) {
+                    if ( $ordinal >= 1000 and $entry =~ m{\A[ok]mp_} ) {
+                        $error->( "Ordinal of user-callable entry must be < 1000", $n, $line );
+                    }; # if
+                    if ( $ordinal >= 1000 and $ordinal < 2000 ) {
+                        $error->( "Ordinals between 1000 and 1999 are reserved.", $n, $line );
+                    }; # if
+                    if ( exists( $ordinals{ $ordinal } ) ) {
+                        $error->( "Ordinal $ordinal has already been used.", $n, $line );
+                    }; # if
+                    $ordinals{ $ordinal } = $entry;
+                }; # if
+            }; # if
+        } else {
+            $error->( "", $n, $line );
+        }; # if
+    }; # foreach
+
+    if ( @dirs ) {
+        # Report the innermost directive which has no matching %endif.
+        my $dir = pop( @dirs );
+        $error->( "Unterminated %if directive.", $dir->{ n }, $dir->{ line } );
+    }; # if
+
+    return %entries;
+
+}; # sub parse_input
+
+# process -- Add upper case aliases for user-callable entries.
+#
+# For every non-obsolete entry whose name starts with "omp_" or "kmp_" (except omp_alloc and
+# omp_free, which are C/C++ only functions) one more entry is stored under the upper-cased name,
+# with the ordinal increased by 1000. Entries exported as DATA get no alias. Such an entry
+# without an ordinal is reported through runtime_error().
+#
+# The hash (passed by reference due to the prototype) is updated in place and also returned.
+sub process(\%) {
+
+    my ( $table ) = @_;
+
+    # keys() yields a snapshot of the names, so adding aliases inside the loop is safe.
+    foreach my $entry ( keys( %$table ) ) {
+        next if $table->{ $entry }->{ obsolete };
+        next if $entry !~ m{\A[ok]mp_};
+        next if $entry eq "omp_alloc" or $entry eq "omp_free";
+        my $number = $table->{ $entry }->{ ordinal };
+        if ( not defined( $number ) ) {
+            runtime_error(
+                "Bad entry \"$entry\": ordinal number is not specified."
+            );
+        }; # if
+        if ( $number ne "DATA" ) {
+            $table->{ uc( $entry ) } = { ordinal => 1000 + $number };
+        }; # if
+    }; # foreach $entry
+
+    return %$table;
+
+}; # sub process
+
+# generate_output -- Write the EXPORTS section of a def file.
+#
+# Arguments:
+#     %entries -- entry name => { ordinal => ..., obsolete => ... } (passed by reference due to
+#                 the prototype).
+#     $output  -- name of the output file; if undefined, the text is printed to stdout.
+#
+# Entries are listed in sorted order, obsolete entries are omitted. An entry is followed by
+# "DATA", by "@ordinal", or by nothing if it has no ordinal.
+sub generate_output(\%$) {
+
+    my ( $entries, $output ) = @_;
+    my @lines = ( "EXPORTS\n" );
+
+    foreach my $name ( sort( keys( %$entries ) ) ) {
+        next if $entries->{ $name }->{ obsolete };
+        my $ordinal = $entries->{ $name }->{ ordinal };
+        my $suffix  = "";
+        if ( defined( $ordinal ) ) {
+            $suffix = ( $ordinal eq "DATA" ? "DATA" : "\@" . $ordinal );
+        }; # if
+        push( @lines, sprintf( "    %-40s ", $name ) . $suffix . "\n" );
+    }; # foreach $name
+
+    my $bulk = join( "", @lines );
+    if ( not defined( $output ) ) {
+        print( $bulk );
+    } else {
+        write_file( $output, \$bulk );
+    }; # if
+
+}; # sub generate_output
+
+#
+# Parse command line.
+#
+
+# Name of the input file (the only positional argument).
+my $input;
+# Name of the output file; if it is not specified, the result goes to stdout.
+my $output;
+# Names defined on the command line: name => value.
+my %defs;
+
+get_options(
+    "output=s"    => \$output,
+    "D|define=s"  =>
+        sub {
+            # Option value is either "name=value" or just "name" (the same as "name=1").
+            my ( $opt_name, $opt_value ) = @_;
+            if ( $opt_value =~ m{\A(.*?)=(.*)\z} ) {
+                $defs{ $1 } = $2;
+            } else {
+                $defs{ $opt_value } = 1;
+            }; # if
+        },
+);
+
+# Exactly one input file is expected.
+if ( @ARGV != 1 ) {
+    cmdline_error( @ARGV == 0 ? "Not enough arguments." : "Too many arguments." );
+}; # if
+$input = shift( @ARGV );
+
+#
+# Work.
+#
+
+# Parse the input, add upper case aliases, and write the result.
+my %data = parse_input( $input, %defs );
+%data = process( %data );
+generate_output( %data, $output );
+exit( 0 );
+
+__END__
+
+#
+# Embedded documentation.
+#
+
+=pod
+
+=head1 NAME
+
+B<generate-def.pl> -- Generate def file for OpenMP RTL.
+
+=head1 SYNOPSIS
+
+B<generate-def.pl> I<OPTION>... I<file>
+
+=head1 OPTIONS
+
+=over
+
+=item B<--define=>I<name>[=I<value>]
+
+=item B<-D> I<name>[=I<value>]
+
+Define specified name. If I<value> is omitted, I<name> is defined to 1. If I<value> is 0 or empty,
+name is B<not> defined.
+
+=item B<--output=>I<file>
+
+=item B<-o> I<file>
+
+Specify output file name. If option is not present, result is printed to stdout.
+
+=item B<--doc>
+
+=item B<--manual>
+
+Print full help message and exit.
+
+=item B<--help>
+
+Print short help message and exit.
+
+=item B<--usage>
+
+Print very short usage message and exit.
+
+=item B<--verbose>
+
+Do print informational messages.
+
+=item B<--version>
+
+Print version and exit.
+
+=item B<--quiet>
+
+Work quiet, do not print informational messages.
+
+=back
+
+=head1 ARGUMENTS
+
+=over
+
+=item I<file>
+
+A name of input file.
+
+=back
+
+=head1 DESCRIPTION
+
+The script reads the input file, processes conditional directives, checks the content for
+consistency, and generates an output file suitable for the linker.
+
+=head2 Input File Format
+
+=over
+
+=item Comments
+
+    # It's a comment.
+
+Comments start with C<#> symbol and continue to the end of line.
+
+=item Conditional Directives
+
+    %ifdef name
+    %ifndef name
+    %endif
+
+A part of file surrounded by C<%ifdef I<name>> and C<%endif> directives is a conditional part -- it
+has effect only if I<name> is defined on the command line by B<--define> option. C<%ifndef> is a
+negated version of C<%ifdef> -- conditional part has an effect only if I<name> is B<not> defined.
+
+Conditional parts may be nested.
+
+=item Export Definitions
+
+    symbol
+    symbol ordinal
+    symbol DATA
+
+Symbols starting with C<omp_> or C<kmp_> must have ordinal specified. They are subjects for special
+processing: each symbol generates two output lines: original one and upper case version. The ordinal
+number of the second is original ordinal increased by 1000.
+
+=item Obsolete Symbols
+
+    - symbol
+    - symbol ordinal
+    - symbol DATA
+
+Obsolete symbols look like export definitions prefixed with minus sign. Obsolete symbols do not
+affect the output, but obsolete symbols and their ordinals cannot be (re)used in export definitions.
+
+=back
+
+=head1 EXAMPLES
+
+    $ generate-def.pl -D stub -D USE_TCHECK=0 -o libguide.def dllexport
+
+=cut
+
+# end of file #
+
diff --git a/final/runtime/tools/lib/Build.pm b/final/runtime/tools/lib/Build.pm
new file mode 100644
index 0000000..a24cf57
--- /dev/null
+++ b/final/runtime/tools/lib/Build.pm
@@ -0,0 +1,263 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+package Build;
+
+use strict;
+use warnings;
+
+use Cwd qw{};
+
+use LibOMP;
+use tools;
+use Uname;
+use Platform ":vars";
+
+# Name of the host, as reported by Uname. Not referenced elsewhere in this module.
+my $host = Uname::host_name();
+# Directory layout, taken from the environment (LibOMP module, loaded above, sets these variables).
+my $root = $ENV{ LIBOMP_WORK    };    # Root of the source tree.
+my $tmp  = $ENV{ LIBOMP_TMP     };    # Build directories and progress logs are created here.
+my $out  = $ENV{ LIBOMP_EXPORTS };    # Not referenced elsewhere in this module.
+
+# Completed jobs, in order of completion; make() appends to the list, summary() reports it.
+my @jobs;
+# Time the module was loaded; summary() uses it to calculate elapsed time.
+our $start = time();
+
+# --------------------------------------------------------------------------------------------------
+# Helper functions.
+# --------------------------------------------------------------------------------------------------
+
+# tstr -- Time string. Formats a timestamp (seconds since the epoch; current time if omitted)
+# as "yyyy-mm-dd hh:mm:ss UTC".
+sub tstr(;$) {
+    my ( $time ) = @_;
+    $time = time() if not defined( $time );
+    # gmtime() returns ( sec, min, hour, day, month, year, ... ); month is zero-based and year is
+    # an offset from 1900.
+    my @utc = gmtime( $time );
+    return
+        sprintf(
+            "%04d-%02d-%02d %02d:%02d:%02d UTC",
+            $utc[ 5 ] + 1900, $utc[ 4 ] + 1, @utc[ 3, 2, 1, 0 ]
+        );
+}; # sub tstr
+
+# dstr -- Duration string. Converts a number of seconds to string "hh:mm:ss". Fractions of
+# a second are dropped; hours are not wrapped at 24.
+sub dstr($) {
+    my ( $sec ) = @_;
+    my $hours = int( $sec / 3600 );
+    $sec -= $hours * 3600;
+    my $minutes = int( $sec / 60 );
+    $sec -= $minutes * 60;
+    return sprintf( "%02d:%02d:%02d", $hours, $minutes, int( $sec ) );
+}; # sub dstr
+
+# rstr -- Result string. Zero exit code means success, any other value means failure.
+sub rstr($) {
+    my ( $rc ) = @_;
+    if ( $rc == 0 ) {
+        return "+++ Success +++";
+    }; # if
+    return "--- Failure ---";
+}; # sub rstr
+
+# shorter -- Return the shorter of absolute and relative (to optional $base) forms of the path.
+# Empty relative path is replaced with "."; if the lengths are equal, absolute form is returned.
+# If target OS is "win", backslashes are replaced with slashes.
+sub shorter($;$) {
+    my ( $path, $base ) = @_;
+    my $absolute = abs_path( $path );
+    my $relative = rel_path( $path, $base );
+    $relative = "." if $relative eq "";
+    my $result = $absolute;
+    if ( length( $relative ) < length( $absolute ) ) {
+        $result = $relative;
+    }; # if
+    $result =~ s{\\}{/}g if $target_os eq "win";
+    return $result;
+}; # sub shorter
+
+# tee -- Run an action with stdout and stderr redirected to a log file.
+#
+# Arguments:
+#     $action -- code reference to call (with no arguments) while the streams are redirected.
+#     $file   -- name of the log file; output is appended to it.
+#
+# If $tools::verbose is true, the output goes through external "tee" command, so it is shown on
+# the original stdout and appended to the file; otherwise it goes only to the file. Original
+# stdout and stderr are restored after the action returns.
+#
+# NOTE(review): if $action dies, the streams are not restored -- there is no eval around the
+# call. Symbol::gensym() is used while this module does not load Symbol itself; presumably one
+# of the modules loaded above provides it -- TODO confirm.
+sub tee($$) {
+
+    my ( $action, $file ) = @_;
+    my $pid = 0;    # Pid of the child "tee" process, or 0 if there is no child.
+
+    my $save_stdout = Symbol::gensym();
+    my $save_stderr = Symbol::gensym();
+
+    # --- redirect stdout ---
+    STDOUT->flush();
+    # Save stdout in $save_stdout.
+    open( $save_stdout, ">&" . STDOUT->fileno() )
+        or die( "Cannot dup filehandle: $!; stopped" );
+    # Redirect stdout to tee or to file.
+    if ( $tools::verbose ) {
+        $pid = open( STDOUT, "| tee -a \"$file\"" )
+            or die "Cannot open pipe to \"tee\": $!; stopped";
+    } else {
+        open( STDOUT, ">>$file" )
+            or die "Cannot open file \"$file\" for writing: $!; stopped";
+    }; # if
+
+    # --- redirect stderr ---
+    STDERR->flush();
+    # Save stderr in $save_stderr.
+    open( $save_stderr, ">&" . STDERR->fileno() )
+        or die( "Cannot dup filehandle: $!; stopped" );
+    # Redirect stderr to stdout.
+    open( STDERR, ">&" . STDOUT->fileno() )
+        or die( "Cannot dup filehandle: $!; stopped" );
+
+    # Perform actions.
+    $action->();
+
+    # --- restore stderr ---
+    STDERR->flush();
+    # Restore stderr from $save_stderr.
+    open( STDERR, ">&" . $save_stderr->fileno() )
+        or die( "Cannot dup filehandle: $!; stopped" );
+    # Close $save_stderr.
+    $save_stderr->close() or die ( "Cannot close filehandle: $!; stopped" );
+
+    # --- restore stdout ---
+    STDOUT->flush();
+    # Restore stdout from $save_stdout.
+    open( STDOUT, ">&" . $save_stdout->fileno() )
+        or die( "Cannot dup filehandle: $!; stopped" );
+    # Close $save_stdout.
+    $save_stdout->close() or die ( "Cannot close filehandle: $!; stopped" );
+
+    # Wait for the child tee process, otherwise output of make and build.pl interleaves.
+    if ( $pid != 0 ) {
+        waitpid( $pid, 0 );
+    }; # if
+
+}; # sub tee
+
+# log_it -- Report a titled message and append it to the progress log.
+#
+# The message is built from $format and @args by sprintf(). The progress log is file
+# "<target platform>-<host name>.log" in the temporary directory. If both the title and the
+# message are not empty, line "title : message" is passed to info() and appended to the log with
+# a timestamp; otherwise only an empty line is appended to the log.
+sub log_it($$@) {
+    my ( $title, $format, @args ) = @_;
+    my $message  = sprintf( $format, @args );
+    my $progress = cat_file( $tmp, sprintf( "%s-%s.log", $target_platform, Uname::host_name() ) );
+    my $record   = "\n";
+    if ( $title ne "" and $message ne "" ) {
+        my $line = sprintf( "%-15s : %s\n", $title, $message );
+        info( $line );
+        $record = tstr() . ": " . $line;
+    }; # if
+    write_file( $progress, $record, -append => 1 );
+}; # sub log_it
+
+# progress -- Report build progress. It is a thin wrapper, all the arguments are passed to
+# log_it() as is.
+sub progress($$@) {
+    my ( $title, $format, @rest ) = @_;
+    return log_it( $title, $format, @rest );
+}; # sub progress
+
+# summary -- Report results of all the completed jobs: one line per job, then totals, elapsed
+# time (counted from $start), and overall result. Returns number of failed jobs, so zero means
+# overall success.
+sub summary() {
+    my $finish  = time();
+    my $total   = scalar( @jobs );
+    my $success = 0;
+    foreach my $job ( @jobs ) {
+        progress( rstr( $job->{ rc } ), "%s", $job->{ build_dir } );
+        $success += 1 if $job->{ rc } == 0;
+    }; # foreach $job
+    my $failure = $total - $success;
+    progress( "Successes",      "%3d of %3d", $success, $total );
+    progress( "Failures",       "%3d of %3d", $failure, $total );
+    progress( "Time elapsed",   "  %s", dstr( $finish - $start ) );
+    progress( "Overall result", "%s", rstr( $failure ) );
+    return $failure;
+}; # sub summary
+
+# --------------------------------------------------------------------------------------------------
+# Worker functions.
+# --------------------------------------------------------------------------------------------------
+
+# init -- Prepare for building: create the temporary directory (taken from LIBOMP_TMP
+# environment variable when the module is loaded).
+sub init() {
+    make_dir( $tmp );
+}; # sub init
+
+# clean -- Clean directories by running tools/clean-dir.pl (found in LIBOMP_WORK) with the
+# current Perl interpreter. Output of the script is suppressed unless verbose mode is on.
+# Returns exit status of the script.
+sub clean(@) {
+    my ( @dirs ) = @_;
+    # Mimic makefile -- print a command.
+    my @patterns = map( shorter( $_ ) . "/*", @dirs );
+    print( "rm -f -r " . join( " ", @patterns ) . "\n" );
+    # Extra options for execute(): discard output of the script in quiet mode.
+    my @quiet = ( $tools::verbose ? () : ( -stdout => undef, -stderr => "" ) );
+    my $exit =
+        execute(
+            [ $^X, cat_file( $ENV{ LIBOMP_WORK }, "tools", "clean-dir.pl" ), @dirs ],
+            -ignore_status => 1,
+            @quiet,
+        );
+    return $exit;
+}; # sub clean
+
+# make -- Run make for one job in its build directory and log the output.
+#
+# Arguments:
+#     $job    -- hash reference; keys build_dir (relative to the temporary directory), makefile,
+#                and make_args (array reference) are read, key rc is set to the exit status.
+#     $clean  -- if true, the build directory is cleaned after successful make.
+#     $marker -- value passed to make as "marker=..." variable.
+#
+# Output of make (with a header and a footer) is appended to "build.log" in the build directory.
+# The job is remembered in @jobs for summary(). Returns exit status of make, or of the cleaning
+# step if cleaning was performed.
+sub make($$$) {
+    # Change dir to build one and run make.
+    my ( $job, $clean, $marker ) = @_;
+    my $dir      = $job->{ build_dir };
+    my $makefile = $job->{ makefile };
+    my $args     = $job->{ make_args };
+    my $cwd      = Cwd::cwd();
+    my $width    = -10;    # Negative width: titles in the report are left-justified.
+
+    my $exit;
+    $dir = cat_dir( $tmp, $dir );
+    make_dir( $dir );
+    change_dir( $dir );
+
+    # The closure below is executed by tee() with stdout and stderr redirected to the log.
+    my $actions =
+        sub {
+            my $start = time();    # Start of this job; it hides package variable $start.
+            $makefile = shorter( $makefile );
+            print( "-" x 79, "\n" );
+            printf( "%${width}s: %s\n", "Started",   tstr( $start ) );
+            printf( "%${width}s: %s\n", "Root dir",  $root );
+            printf( "%${width}s: %s\n", "Build dir", shorter( $dir, $root ) );
+            printf( "%${width}s: %s\n", "Makefile",  $makefile );
+            print( "-" x 79, "\n" );
+            {
+                # Use shorter LIBOMP_WORK to have shorter command lines.
+                # Note: Some tools may not work if current dir is changed.
+                local $ENV{ LIBOMP_WORK } = shorter( $ENV{ LIBOMP_WORK } );
+                $exit =
+                    execute(
+                        [
+                            "make",
+                            "-r",
+                            "-f", $makefile,
+                            "arch=" . $target_arch,
+                            "marker=$marker",
+                            @$args
+                        ],
+                        -ignore_status => 1
+                    );
+                # Cleaning is performed only after successful build; its status becomes the
+                # status of the job.
+                if ( $clean and $exit == 0 ) {
+                    $exit = clean( $dir );
+                }; # if
+            }
+            my $finish = time();
+            print( "-" x 79, "\n" );
+            printf( "%${width}s: %s\n", "Finished", tstr( $finish ) );
+            printf( "%${width}s: %s\n", "Elapsed", dstr( $finish - $start ) );
+            printf( "%${width}s: %s\n", "Result", rstr( $exit ) );
+            print( "-" x 79, "\n" );
+            print( "\n" );
+        }; # sub
+    # Current directory is the build directory, so the log is created there.
+    tee( $actions, "build.log" );
+
+    change_dir( $cwd );
+
+    # Save completed job to be able print summary later.
+    $job->{ rc } = $exit;
+    push( @jobs, $job );
+
+    return $exit;
+
+}; # sub make
+
+1;
diff --git a/final/runtime/tools/lib/LibOMP.pm b/final/runtime/tools/lib/LibOMP.pm
new file mode 100644
index 0000000..cff7e4a
--- /dev/null
+++ b/final/runtime/tools/lib/LibOMP.pm
@@ -0,0 +1,84 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+package LibOMP;
+
+use strict;
+use warnings;
+
+use tools;
+
+# empty -- Return true if the named environment variable does not exist, is not defined, or is
+# an empty string.
+sub empty($) {
+    my ( $var ) = @_;
+    my $value = $ENV{ $var };
+    return ( ( not defined( $value ) ) or ( $value eq "" ) );
+}; # sub empty
+
+# Resolved locations of the source tree, the exports directory, and the temporary directory.
+my ( $base, $out, $tmp );
+
+# If LIBOMP_WORK is not set, the tree root is guessed as the parent of the directory containing
+# the running script. $FindBin::Bin is not used intentionally because it gives real path, while
+# absolute (but not real) one is wanted: real path does not contain symlinks, absolute path may
+# contain them.
+$base =
+    empty( "LIBOMP_WORK" )
+        ? get_dir( get_dir( abs_path( $0 ) ) )
+        : abs_path( $ENV{ LIBOMP_WORK } );
+
+# Exports and temporary directories are located in the tree root by default.
+$out =
+    empty( "LIBOMP_EXPORTS" )
+        ? cat_dir( $base, "exports" )
+        : abs_path( $ENV{ LIBOMP_EXPORTS } );
+
+$tmp =
+    empty( "LIBOMP_TMP" )
+        ? cat_dir( $base, "tmp" )
+        : abs_path( $ENV{ LIBOMP_TMP } );
+
+# Publish resolved values, so the rest of the script and child processes see absolute paths.
+$ENV{ LIBOMP_WORK    } = $base;
+$ENV{ LIBOMP_EXPORTS } = $out;
+$ENV{ LIBOMP_TMP     } = $tmp;
+
+return 1;
+
+__END__
+
+=pod
+
+=head1 NAME
+
+B<LibOMP.pm> -- Set up C<LIBOMP_WORK>, C<LIBOMP_EXPORTS>, and C<LIBOMP_TMP> environment variables.
+
+=head1 SYNOPSIS
+
+    use FindBin;
+    use lib "$FindBin::Bin/lib";
+    use LibOMP;
+
+    $ENV{ LIBOMP_WORK    }
+    $ENV{ LIBOMP_TMP     }
+    $ENV{ LIBOMP_EXPORTS }
+
+=head1 DESCRIPTION
+
+The module checks C<LIBOMP_WORK>, C<LIBOMP_EXPORTS>, and C<LIBOMP_TMP> environment variables.
+If a variable is set, the module makes sure it is absolute. If a variable does not exist or is
+empty, the module sets it to a default value.
+
+Default value for C<LIBOMP_EXPORTS> is C<$LIBOMP_WORK/exports>, for C<LIBOMP_TMP> --
+C<$LIBOMP_WORK/tmp>.
+
+Value for C<LIBOMP_WORK> is guessed. The module assumes the script (which uses the module) is
+located in C<tools/> directory of libomp directory tree, and uses path of the script to calculate
+C<LIBOMP_WORK>.
+
+=cut
+
+# end of file #
+
diff --git a/final/runtime/tools/lib/Platform.pm b/final/runtime/tools/lib/Platform.pm
new file mode 100644
index 0000000..b0e10a1
--- /dev/null
+++ b/final/runtime/tools/lib/Platform.pm
@@ -0,0 +1,483 @@
+#
+# This is not a runnable script, it is a Perl module, a collection of variables, subroutines, etc.
+# to be used in Perl scripts.
+#
+# To get help about exported variables and subroutines, execute the following command:
+#
+#     perldoc Platform.pm
+#
+# or see POD (Plain Old Documentation) imbedded to the source...
+#
+#
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+package Platform;
+
+use strict;
+use warnings;
+
+use base "Exporter";
+
+use Uname;
+
+# Names of the exported variables (they are tied to read-only accessors below).
+my @vars;
+
+# The list is filled at compile time because "use vars @vars" below is executed at compile time
+# as well.
+BEGIN {
+    @vars = qw{ $host_arch $host_os $host_platform $target_arch $target_mic_arch $target_os $target_platform };
+}
+
+our $VERSION     = "0.014";
+# Nothing is exported by default; functions and variables are exported on request, either by
+# name or by tag -- ":all" (everything) or ":vars" (variables only).
+our @EXPORT      = qw{};
+our @EXPORT_OK   = ( qw{ canon_arch canon_os canon_mic_arch legal_arch arch_opt }, @vars );
+our %EXPORT_TAGS = ( all => [ @EXPORT_OK ], vars => \@vars );
+
+# Canonize architecture name. Returns canonical name of the architecture, or undef if the
+# argument is not defined or not recognized.
+sub canon_arch($) {
+    my ( $arch ) = @_;
+    if ( not defined( $arch ) ) {
+        return $arch;
+    }; # if
+    # Patterns are tried in order, the first match wins. More specific patterns (e. g. ppc64le,
+    # mips64) must precede more generic ones (ppc64, mips).
+    my @rules = (
+        [ qr{\A\s*(?:32|IA-?32|IA-?32 architecture|i[3456]86|x86)\s*\z}i                           => "32"      ],
+        [ qr{\A\s*(?:48|(?:ia)?32e|Intel\s*64|Intel\(R\)\s*64|x86[_-]64|x64|AMD64)\s*\z}i => "32e"     ],
+        [ qr{\Aarm(?:v7\D*)?\z}                                                                    => "arm"     ],
+        [ qr{\Appc64le}                                                                            => "ppc64le" ],
+        [ qr{\Appc64}                                                                              => "ppc64"   ],
+        [ qr{\Aaarch64}                                                                            => "aarch64" ],
+        [ qr{\Amic}                                                                                => "mic"     ],
+        [ qr{\Amips64}                                                                             => "mips64"  ],
+        [ qr{\Amips}                                                                               => "mips"    ],
+    );
+    foreach my $rule ( @rules ) {
+        my ( $pattern, $canonical ) = @$rule;
+        if ( $arch =~ $pattern ) {
+            return $canonical;
+        }; # if
+    }; # foreach $rule
+    return undef;
+}; # sub canon_arch
+
+# Canonize Intel(R) Many Integrated Core Architecture name. A name starting with "knf", "knc",
+# or "knl" is reduced to that prefix; undef is returned if the argument is not defined or not
+# recognized.
+sub canon_mic_arch($) {
+    my ( $mic_arch ) = @_;
+    if ( not defined( $mic_arch ) ) {
+        return $mic_arch;
+    }; # if
+    foreach my $rule ( [ qr{\Aknf} => "knf" ], [ qr{\Aknc} => "knc" ], [ qr{\Aknl} => "knl" ] ) {
+        my ( $pattern, $canonical ) = @$rule;
+        if ( $mic_arch =~ $pattern ) {
+            return $canonical;
+        }; # if
+    }; # foreach $rule
+    return undef;
+}; # sub canon_mic_arch
+
+{  # Return legal approved architecture name.
+
+    # Canonical architecture name => legal approved name.
+    my %legal = (
+        "32"      => "IA-32 architecture",
+        "32e"     => "Intel(R) 64",
+        "arm"     => "ARM",
+        "aarch64" => "AArch64",
+        "mic"     => "Intel(R) Many Integrated Core Architecture",
+        "mips"    => "MIPS",
+        "mips64"  => "MIPS64",
+    );
+
+    # The argument is canonized first. Returns undef if the architecture is not recognized or
+    # has no legal approved name in the table above.
+    sub legal_arch($) {
+        my ( $arch ) = @_;
+        my $canonical = canon_arch( $arch );
+        return ( defined( $canonical ) ? $legal{ $canonical } : $canonical );
+    }; # sub legal_arch
+}
+
+{  # Return architecture name suitable for Intel compiler setup scripts.
+
+    # Canonical architecture name => argument for compiler setup scripts.
+    my %option = (
+        "32"      => "ia32",
+        "32e"     => "intel64",
+        "64"      => "ia64",
+        "arm"     => "arm",
+        "aarch64" => "aarch",
+        "mic"     => "intel64",
+        "mips"    => "mips",
+        "mips64"  => "MIPS64",
+    );
+
+    # The argument is canonized first. Returns undef if the architecture is not recognized or
+    # is missed in the table above.
+    sub arch_opt($) {
+        my ( $arch ) = @_;
+        my $canonical = canon_arch( $arch );
+        return ( defined( $canonical ) ? $option{ $canonical } : $canonical );
+    }; # sub arch_opt
+}
+
+# Canonize OS name. Returns "lin", "mac", or "win"; or undef if the argument is not defined or
+# not recognized. Matching is case-insensitive, surrounding whitespace is ignored.
+sub canon_os($) {
+    my ( $os ) = @_;
+    if ( not defined( $os ) ) {
+        return $os;
+    }; # if
+    my @rules = (
+        [ qr{\A\s*(?:Linux|lin|l)\s*\z}i                                         => "lin" ],
+        [ qr{\A\s*(?:Mac(?:\s*OS(?:\s*X)?)?|mac|m|Darwin)\s*\z}i                 => "mac" ],
+        [ qr{\A\s*(?:Win(?:dows)?(?:(?:_|\s*)?(?:NT|XP|95|98|2003))?|w)\s*\z}i   => "win" ],
+    );
+    foreach my $rule ( @rules ) {
+        my ( $pattern, $canonical ) = @$rule;
+        if ( $os =~ $pattern ) {
+            return $canonical;
+        }; # if
+    }; # foreach $rule
+    return undef;
+}; # sub canon_os
+
+# Private state of the module. Exported variables ($host_os, $target_arch, etc) are tied to
+# accessors returning these values, so they are read-only for users of the module.
+my ( $_host_os, $_host_arch, $_target_os, $_target_arch, $_target_mic_arch, $_default_mic_arch);
+
+# Set the default mic-arch value.
+$_default_mic_arch = "knc";
+
+# Set target architecture. The argument is canonized; if it is recognized, it is stored as the
+# target architecture and published in LIBOMP_ARCH environment variable. Returns canonical name,
+# or undef (and changes nothing) if the argument is not recognized.
+sub set_target_arch($) {
+    my $canonical = canon_arch( $_[ 0 ] );
+    if ( not defined( $canonical ) ) {
+        return $canonical;
+    }; # if
+    $_target_arch       = $canonical;
+    $ENV{ LIBOMP_ARCH } = $canonical;
+    return $canonical;
+}; # sub set_target_arch
+
+# Set target Intel(R) Many Integrated Core Architecture. The argument is canonized; if it is
+# recognized, it is stored and published in LIBOMP_MIC_ARCH environment variable. Returns
+# canonical name, or undef (and changes nothing) if the argument is not recognized.
+sub set_target_mic_arch($) {
+    my $canonical = canon_mic_arch( $_[ 0 ] );
+    if ( not defined( $canonical ) ) {
+        return $canonical;
+    }; # if
+    $_target_mic_arch       = $canonical;
+    $ENV{ LIBOMP_MIC_ARCH } = $canonical;
+    return $canonical;
+}; # sub set_target_mic_arch
+
+# Set target OS. The argument is canonized; if it is recognized, it is stored as the target OS
+# and published in LIBOMP_OS environment variable. Returns canonical name, or undef (and changes
+# nothing) if the argument is not recognized.
+sub set_target_os($) {
+    my $canonical = canon_os( $_[ 0 ] );
+    if ( not defined( $canonical ) ) {
+        return $canonical;
+    }; # if
+    $_target_os       = $canonical;
+    $ENV{ LIBOMP_OS } = $canonical;
+    return $canonical;
+}; # sub set_target_os
+
+# target_options -- Return option specifications letting a script select the target.
+#
+# Returned list consists of "option specification => handler" pairs for target OS, target
+# architecture, and target Intel(R) Many Integrated Core Architecture options. Every handler
+# passes option value to the corresponding set_target_*() function and dies if the value is
+# not recognized.
+#
+# Short aliases used to be misspelled ("targert-arch", "targert-mic-arch"). Properly spelled
+# aliases are declared now; misspelled ones are kept for backward compatibility.
+sub target_options() {
+    my @options = (
+        "target-os|os=s" =>
+            sub {
+                set_target_os( $_[ 1 ] ) or
+                    die "Bad value of --target-os option: \"$_[ 1 ]\"\n";
+            },
+        "target-architecture|target-arch|targert-arch|architecture|arch=s" =>
+           sub {
+               set_target_arch( $_[ 1 ] ) or
+                   die "Bad value of --target-architecture option: \"$_[ 1 ]\"\n";
+           },
+        "target-mic-architecture|target-mic-arch|targert-mic-arch|mic-architecture|mic-arch=s" =>
+           sub {
+               set_target_mic_arch( $_[ 1 ] ) or
+                   die "Bad value of --target-mic-architecture option: \"$_[ 1 ]\"\n";
+           },
+    );
+    return @options;
+}; # sub target_options
+
+# Detect host arch.
+{
+    # Hardware platform (as reported by Uname) => canonical architecture name.
+    my %known = (
+        "i386"    => "32",
+        "ia64"    => "64",
+        "x86_64"  => "32e",
+        "arm"     => "arm",
+        "ppc64le" => "ppc64le",
+        "ppc64"   => "ppc64",
+        "aarch64" => "aarch64",
+        "mips64"  => "mips64",
+        "mips"    => "mips",
+    );
+    my $hardware_platform = Uname::hardware_platform();
+    if ( not exists( $known{ $hardware_platform } ) ) {
+        die "Unsupported host hardware platform: \"$hardware_platform\"; stopped";
+    }; # if
+    $_host_arch = $known{ $hardware_platform };
+}
+
+# Detect host OS.
+{
+    # Operating system (as reported by Uname) => canonical OS name.
+    my %known = (
+        "GNU/Linux"  => "lin",
+        "FreeBSD"    => "lin",    # Host OS resembles Linux.
+        "NetBSD"     => "lin",    # Host OS resembles Linux.
+        "Darwin"     => "mac",
+        "MS Windows" => "win",
+    );
+    my $operating_system = Uname::operating_system();
+    if ( not exists( $known{ $operating_system } ) ) {
+        die "Unsupported host operating system: \"$operating_system\"; stopped";
+    }; # if
+    $_host_os = $known{ $operating_system };
+}
+
+# Detect target arch.
+if ( not defined( $ENV{ LIBOMP_ARCH } ) ) {
+    # LIBOMP_ARCH is not set -- target architecture is the host one.
+    $_target_arch = $_host_arch;
+} else {
+    # LIBOMP_ARCH is set -- it must name a known architecture.
+    $_target_arch = canon_arch( $ENV{ LIBOMP_ARCH } );
+    die "Unknown architecture specified in LIBOMP_ARCH environment variable: \"$ENV{ LIBOMP_ARCH }\""
+        if not defined( $_target_arch );
+}; # if
+# Publish canonical name.
+$ENV{ LIBOMP_ARCH } = $_target_arch;
+
+# Detect target Intel(R) Many Integrated Core Architecture.
+if ( not defined( $ENV{ LIBOMP_MIC_ARCH } ) ) {
+    # LIBOMP_MIC_ARCH is not set -- use default Intel(R) Many Integrated Core Architecture.
+    $_target_mic_arch = $_default_mic_arch;
+} else {
+    # LIBOMP_MIC_ARCH is set -- it must name a known architecture.
+    $_target_mic_arch = canon_mic_arch( $ENV{ LIBOMP_MIC_ARCH } );
+    die "Unknown architecture specified in LIBOMP_MIC_ARCH environment variable: \"$ENV{ LIBOMP_MIC_ARCH }\""
+        if not defined( $_target_mic_arch );
+}; # if
+# Publish canonical name.
+$ENV{ LIBOMP_MIC_ARCH } = $_target_mic_arch;
+
+# Detect target OS.
+if ( not defined( $ENV{ LIBOMP_OS } ) ) {
+    # LIBOMP_OS is not set -- target OS is the host one.
+    $_target_os = $_host_os;
+} else {
+    # LIBOMP_OS is set -- it must name a known OS.
+    $_target_os = canon_os( $ENV{ LIBOMP_OS } );
+    die "Unknown OS specified in LIBOMP_OS environment variable: \"$ENV{ LIBOMP_OS }\""
+        if not defined( $_target_os );
+}; # if
+# Publish canonical name.
+$ENV{ LIBOMP_OS } = $_target_os;
+
+# Declare exported variables as package variables; @vars is filled in BEGIN block above.
+use vars @vars;
+
+# Tie every exported variable to a class (defined below) whose FETCH returns the current value
+# of the corresponding private variable and whose STORE (inherited from Platform::base) croaks.
+tie( $host_arch,       "Platform::host_arch" );
+tie( $host_os,         "Platform::host_os" );
+tie( $host_platform,   "Platform::host_platform" );
+tie( $target_arch,     "Platform::target_arch" );
+tie( $target_mic_arch, "Platform::target_mic_arch" );
+tie( $target_os,       "Platform::target_os" );
+tie( $target_platform, "Platform::target_platform" );
+
+# Base class for tied read-only variables: any attempt to assign a value is an error reported
+# on behalf of the caller. Derived classes provide FETCH.
+{ package Platform::base;
+
+    use Carp;
+
+    use Tie::Scalar;
+    use base "Tie::StdScalar";
+
+    sub STORE {
+        my ( $self ) = @_;
+        # Class name of the tied object matches the name of the variable.
+        croak( "Modifying \$" . ref( $self ) . " is not allowed; stopped" );
+    }; # sub STORE
+
+} # package Platform::base
+
+# Classes for tied variables. Every class inherits croaking STORE from Platform::base and
+# defines FETCH returning the current value of the corresponding private variable, so changes
+# made by set_target_*() functions are visible through the exported variables.
+
+# $host_arch -- canonical name of host architecture.
+{ package Platform::host_arch;
+    use base "Platform::base";
+    sub FETCH {
+        return $_host_arch;
+    }; # sub FETCH
+} # package Platform::host_arch
+
+# $host_os -- canonical name of host OS.
+{ package Platform::host_os;
+    use base "Platform::base";
+    sub FETCH {
+        return $_host_os;
+    }; # sub FETCH
+} # package Platform::host_os
+
+# $host_platform -- host OS and host architecture joined with underscore.
+{ package Platform::host_platform;
+    use base "Platform::base";
+    sub FETCH {
+        return "${_host_os}_${_host_arch}";
+    }; # sub FETCH
+} # package Platform::host_platform
+
+# $target_arch -- canonical name of target architecture.
+{ package Platform::target_arch;
+    use base "Platform::base";
+    sub FETCH {
+        return $_target_arch;
+    }; # sub FETCH
+} # package Platform::target_arch
+
+# $target_mic_arch -- canonical name of target Intel(R) Many Integrated Core Architecture.
+{ package Platform::target_mic_arch;
+    use base "Platform::base";
+    sub FETCH {
+        return $_target_mic_arch;
+    }; # sub FETCH
+} # package Platform::target_mic_arch
+
+# $target_os -- canonical name of target OS.
+{ package Platform::target_os;
+    use base "Platform::base";
+    sub FETCH {
+        return $_target_os;
+    }; # sub FETCH
+} # package Platform::target_os
+
+# $target_platform -- target OS and target architecture joined with underscore. If target
+# architecture is "mic", the name of Intel(R) Many Integrated Core Architecture is used instead
+# of architecture name.
+{ package Platform::target_platform;
+    use base "Platform::base";
+    sub FETCH {
+        return (
+            $_target_arch eq "mic"
+                ? "${_target_os}_${_target_mic_arch}"
+                : "${_target_os}_${_target_arch}"
+        );
+    }; # sub FETCH
+} # package Platform::target_platform
+
+
+return 1;
+
+__END__
+
+=pod
+
+=head1 NAME
+
+B<Platform.pm> -- Few subroutines to get OS, architecture and platform name in form suitable for
+naming files, directories, macros, etc.
+
+=head1 SYNOPSIS
+
+    use Platform ":all";
+    use tools;
+
+    my $arch   = canon_arch( "x86_64" );       # Returns "32e".
+    my $legal  = legal_arch( "x86_64" );       # Returns "Intel(R) 64".
+    my $option = arch_opt( "x86_64" );         # Returns "intel64".
+    my $os     = canon_os( "Windows NT" );     # Returns "win".
+
+    print( $host_arch, $host_os, $host_platform );
+    print( $target_arch, $target_os, $target_platform );
+
+    tools::get_options(
+        Platform::target_options(),
+        ...
+    );
+
+
+=head1 DESCRIPTION
+
+Environment variable LIBOMP_OS specifies target OS to report. If LIBOMP_OS is not defined,
+the script assumes host OS is target OS.
+
+Environment variable LIBOMP_ARCH specifies target architecture to report. If LIBOMP_ARCH is not defined,
+the script assumes host architecture is target one.
+
+=head2 Functions.
+
+=over
+
+=item B<canon_arch( $arch )>
+
+Input string is an architecture name to canonize. The function recognizes many variants, for example:
+C<32e>, C<Intel64>, C<Intel(R) 64>, etc. Returned string is a canonized architecture name,
+one of: C<32>, C<32e>, C<arm>, C<ppc64le>, C<ppc64>, C<aarch64>, C<mic>, C<mips64>, C<mips>, or C<undef> if input string is not recognized.
+
+=item B<legal_arch( $arch )>
+
+Input string is architecture name. The function recognizes the same variants as C<canon_arch()> does.
+Returned string is a name approved by Intel Legal, for example: C<IA-32 architecture>, C<Intel(R) 64>,
+or C<undef> if input string is not recognized.
+
+=item B<arch_opt( $arch )>
+
+Input string is architecture name. The function recognizes the same variants as C<canon_arch()> does.
+Returned string is an architecture name suitable for passing to compiler setup scripts
+(e. g. C<iccvars.sh>), for example: C<ia32>, C<intel64>, or C<undef> if input string is not
+recognized.
+
+=item B<canon_os( $os )>
+
+Input string is OS name to canonize. The function recognizes many variants, for example: C<mac>, C<OS X>, etc. Returned string is a canonized OS name, one of: C<lin>,
+C<mac>, C<win>, or C<undef> if input string is not recognized.
+
+=item B<target_options()>
+
+Returns array suitable for passing to C<tools::get_options()> to let a script recognize
+C<--target-architecture=I<str>> and C<--target-os=I<str>> options. Typical usage is:
+
+    use tools;
+    use Platform;
+
+    my ( $os, $arch, $platform );    # Global variables, not initialized.
+
+    ...
+
+    get_options(
+        Platform::target_options(),  # Let script recognize --target-os and --target-arch options.
+        ...
+    );
+    # Initialize variables after parsing command line.
+    ( $os, $arch, $platform ) = ( $Platform::target_os, $Platform::target_arch, $Platform::target_platform );
+
+=back
+
+=head2 Variables
+
+=over
+
+=item B<$host_arch>
+
+Canonized name of host architecture.
+
+=item B<$host_os>
+
+Canonized name of host OS.
+
+=item B<$host_platform>
+
+Host platform name (concatenated canonized OS name, underscore, and canonized architecture name).
+
+=item B<$target_arch>
+
+Canonized name of target architecture.
+
+=item B<$target_os>
+
+Canonized name of target OS.
+
+=item B<$target_platform>
+
+Target platform name (concatenated canonized OS name, underscore, and canonized architecture name).
+
+=back
+
+=cut
+
+# end of file #
diff --git a/final/runtime/tools/lib/Uname.pm b/final/runtime/tools/lib/Uname.pm
new file mode 100644
index 0000000..4a5c332
--- /dev/null
+++ b/final/runtime/tools/lib/Uname.pm
@@ -0,0 +1,638 @@
+#
+# This is not a runnable script, it is a Perl module, a collection of variables, subroutines, etc.
+# To get help about exported variables and subroutines, execute the following command:
+#
+#     perldoc Uname.pm
+#
+# or see POD (Plain Old Documentation) embedded to the source...
+#
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+package Uname;
+
+use strict;
+use warnings;
+use warnings::register;
+use Exporter;
+
+use POSIX;
+use File::Glob ":glob";
+use Net::Domain qw{};
+
+# Following code does not work with Perl 5.6 on Linux* OS and Windows* OS:
+#
+#     use if $^O eq "darwin", tools => qw{};
+#
+# The workaround for Perl 5.6:
+#
+BEGIN {
+    # Helpers from tools.pm (which(), execute(), read_file(), ...) are called by the OS X* and
+    # Linux* OS code paths below, so the module is loaded and imported on those systems.
+    if ( grep( $^O eq $_, qw{ darwin linux } ) ) {
+        require tools;
+        tools->import();
+    }; # if
+    require Win32 if $^O eq "MSWin32";
+}; # BEGIN
+
+my $mswin = qr{\A(?:MSWin32|Windows_NT)\z};    # Values of $^O handled by the Windows* OS branches.
+
+my @posix = qw{ kernel_name fqdn kernel_release kernel_version machine };
+    # Properties filled from the list returned by POSIX::uname(), in the order of that list.
+my @linux =
+    qw{ processor hardware_platform operating_system };
+    # Properties reported by uname in Linux* OS; here they are computed by this module.
+my @base = ( @posix, @linux );
+    # Base properties.
+my @aux =
+    (
+        qw{ host_name domain_name },
+        map( "operating_system_$_", qw{ name release codename description } )
+    );
+    # Auxiliary properties: host_name, domain_name, and four operating_system_* properties.
+my @all = ( @base, @aux );
+    # All the properties; an accessor function is generated for each of them.
+my @meta = qw{ base_names all_names value };
+    # Meta functions.
+
+our $VERSION     = "0.07";
+our @ISA         = qw{ Exporter };
+our @EXPORT      = qw{};                # Nothing is exported by default.
+our @EXPORT_OK   = ( @all, @meta );     # Every property and meta function may be imported.
+our %EXPORT_TAGS =
+    (
+        base => [ @base ],
+        all  => [ @all  ],
+        meta => [ @meta ],
+    );
+
+my %values;
+    # Hash of values. Some values are strings, some may be references to code which should be
+    # evaluated to get real value. This trick is implemented because call to Net::Domain::hostfqdn()
+    # is relatively slow.
+
+# Get values from POSIX::uname().
+@values{ @posix } = POSIX::uname();
+
+# On some systems POSIX::uname() returns "short" node name (without domain name). To be consistent
+# on all systems, we will get node name from alternative source.
+if ( $^O =~ m/cygwin/i ) {
+    # Function from Net::Domain module works well, but on Cygwin it prints to
+    # stderr "domainname: not found". So we will use environment variables for now.
+    $values{ fqdn } = lc( $ENV{ COMPUTERNAME } . "." . $ENV{ USERDNSDOMAIN } );
+} else {
+    # On systems other than Cygwin, let us use Net::Domain::hostfqdn(), but do it only if node
+    # name is really requested. Remember the node name reported by POSIX::uname() first: the
+    # assignment below replaces it in %values with a code reference.
+    my $node_name = $values{ fqdn };
+    $values{ fqdn } =
+        sub {
+            my $fqdn = Net::Domain::hostfqdn(); # "fqdn" stands for "fully qualified domain name".
+            # On some systems POSIX::uname() and Net::Domain::hostfqdn() report different names.
+            # Let us issue a warning if they are significantly different. Names are insignificantly
+            # different if POSIX::uname() matches the beginning of Net::Domain::hostfqdn() and the
+            # match ends at the end of the name or at a dot ("host1" must not match "host10.x.y").
+            my $length = length( $node_name );
+            if (
+                $node_name eq substr( $fqdn, 0, $length )
+                &&
+                ( $length == length( $fqdn ) || substr( $fqdn, $length, 1 ) eq "." )
+            ) {
+                # Ok.
+            } else {
+                warnings::warnif(
+                    "POSIX::uname() and Net::Domain::hostfqdn() reported different names: " .
+                        "\"$node_name\" and \"$fqdn\" respectively\n"
+                );
+            }; # if
+            return $fqdn;
+        }; # sub
+}; # if
+
+# NOTE(review): presumably this handles a 32-bit Perl running on 64-bit Windows* OS, where the
+# real architecture is published in PROCESSOR_ARCHITEW6432 -- confirm.
+if (
+    $^O =~ $mswin
+    and $values{ machine } =~ m{\A(?:x86|[56]86)\z}
+    and exists( $ENV{ PROCESSOR_ARCHITECTURE } )
+    and $ENV{ PROCESSOR_ARCHITECTURE } eq "x86"
+    and exists( $ENV{ PROCESSOR_ARCHITEW6432 } )
+    and $ENV{ PROCESSOR_ARCHITEW6432 } eq "AMD64"
+) {
+    # Machine reported as 32-bit x86 is overridden when PROCESSOR_ARCHITEW6432 says "AMD64".
+    $values{ machine } = "x86_64";
+}; # if
+
+# Some values are not returned by POSIX::uname(), let us compute them.
+
+# processor.
+# POSIX::uname() does not return it, so the machine name is used as the processor name.
+$values{ processor } = $values{ machine };
+
+# hardware_platform.
+# The value is chosen by the OS Perl is running on ($^O) and by the machine name reported by
+# POSIX::uname(). An unsupported OS or machine name is a fatal error.
+{
+    my $machine     = $values{ machine };
+    # Common message for a machine name which is not known for the current OS.
+    my $unsupported = "Unsupported machine (\"$machine\") returned by POSIX::uname(); stopped";
+    if ( grep( $^O eq $_, qw{ linux freebsd netbsd } ) ) {
+        # Linux* OS, FreeBSD, and NetBSD share one table of [ pattern, platform ] pairs.
+        # The pairs are tried in order, the first matching pattern wins.
+        my @known =
+            (
+                [ qr{\Ai[3456]86\z}      => "i386"    ],
+                [ qr{\A(x86_64|amd64)\z} => "x86_64"  ],
+                [ qr{\Aarmv7\D*\z}       => "arm"     ],
+                [ qr{\Appc64le\z}        => "ppc64le" ],
+                [ qr{\Appc64\z}          => "ppc64"   ],
+                [ qr{\Aaarch64\z}        => "aarch64" ],
+                [ qr{\Amips64\z}         => "mips64"  ],
+                [ qr{\Amips\z}           => "mips"    ],
+            );
+        # In list context grep returns all the matching pairs, only the first one is kept.
+        my ( $found ) = grep( $machine =~ $_->[ 0 ], @known );
+        defined( $found ) or die $unsupported;
+        $values{ hardware_platform } = $found->[ 1 ];
+    } elsif ( $^O eq "darwin" ) {
+        # OS X*: "x86" and "i386" machines need an extra check, see below.
+        if ( $machine eq "x86" or $machine eq "i386" ) {
+            # Deferred value: sysctl is executed only if hardware platform is really requested.
+            $values{ hardware_platform } =
+                sub {
+                    # Some OSes on Intel(R) 64 still reports "i386" machine. Verify it by using
+                    # the value returned by 'sysctl -n hw.optional.x86_64'. On Intel(R) 64-bit systems the
+                    # value == 1; on 32-bit systems the 'hw.optional.x86_64' property either does not exist
+                    # or the value == 0. The path variable does not contain a path to sysctl when
+                    # started by crontab.
+                    my $sysctl = ( which( "sysctl" ) or "/usr/sbin/sysctl" );
+                    my $answer;
+                    debug( "Executing $sysctl..." );
+                    execute( [ $sysctl, "-n", "hw.optional.x86_64" ], -stdout => \$answer, -stderr => undef );
+                    chomp( $answer );
+                    # Empty output or "0" selects "i386", "1" selects "x86_64", anything else is fatal.
+                    return "i386"   if "$answer" eq "" or "$answer" eq "0";
+                    return "x86_64" if "$answer" eq "1";
+                    die "Unsupported value (\"$answer\") returned by \"$sysctl -n hw.optional.x86_64\"; stopped";
+                }; # sub
+        } elsif ( $machine eq "x86_64" ) {
+            # Some OS X* versions report "x86_64".
+            $values{ hardware_platform } = "x86_64";
+        } else {
+            die $unsupported;
+        }; # if
+    } elsif ( $^O =~ $mswin ) {
+        # Perl built for Windows* OS.
+        if ( $machine =~ m{\A(?:x86|[56]86)\z} ) {
+            $values{ hardware_platform } = "i386";
+        } elsif ( $machine eq "x86_64" or $machine eq "amd64" ) {
+            # ActivePerl for IA-32 architecture returns "x86_64", while ActivePerl for Intel(R) 64 returns "amd64".
+            $values{ hardware_platform } = "x86_64";
+        } else {
+            die $unsupported;
+        }; # if
+    } elsif ( $^O eq "cygwin" ) {
+        # Perl built for Cygwin.
+        if ( $machine =~ m{\Ai[3456]86\z} ) {
+            $values{ hardware_platform } = "i386";
+        } elsif ( $machine eq "x86_64" ) {
+            $values{ hardware_platform } = "x86_64";
+        } else {
+            die $unsupported;
+        }; # if
+    } else {
+        # Any other OS is not supported by this module.
+        die "Unsupported OS (\"$^O\"); stopped";
+    }; # if
+}; # hardware_platform
+
+# operating_system.
+if ( 0 ) {
+} elsif ( $values{ kernel_name } eq "Linux" ) {
+    $values{ operating_system } = "GNU/Linux";
+    my $release;    # Name of chosen "*-release" file.
+    my $bulk;       # Content of release file.
+    # On Ubuntu, lsb-release is quite informative, e. g.:
+    #     DISTRIB_ID=Ubuntu
+    #     DISTRIB_RELEASE=9.04
+    #     DISTRIB_CODENAME=jaunty
+    #     DISTRIB_DESCRIPTION="Ubuntu 9.04"
+    # Try lsb-release first. But on some older systems lsb-release is not informative.
+    # It may contain just one line:
+    #     LSB_VERSION="1.3"
+    $release = "/etc/lsb-release";
+    if ( -e $release ) {
+        $bulk = read_file( $release );
+    } else {
+        $bulk = "";
+    }; # if
+    if ( $bulk =~ m{^DISTRIB_}m ) {
+        # Ok, this lsb-release is informative (DISTRIB_* may follow other lines, hence /m).
+        $bulk =~ m{^DISTRIB_ID\s*=\s*(.*?)\s*$}m
+            or runtime_error( "$release: There is no DISTRIB_ID:", $bulk, "(eof)" );
+        $values{ operating_system_name } = $1;
+        $bulk =~ m{^DISTRIB_RELEASE\s*=\s*(.*?)\s*$}m
+            or runtime_error( "$release: There is no DISTRIB_RELEASE:", $bulk, "(eof)" );
+        $values{ operating_system_release } = $1;
+        $bulk =~ m{^DISTRIB_CODENAME\s*=\s*(.*?)\s*$}m
+            or runtime_error( "$release: There is no DISTRIB_CODENAME:", $bulk, "(eof)" );
+        $values{ operating_system_codename } = $1;
+        $bulk =~ m{^DISTRIB_DESCRIPTION\s*="?\s*(.*?)"?\s*$}m
+            or runtime_error( "$release: There is no DISTRIB_DESCRIPTION:", $bulk, "(eof)" );
+        $values{ operating_system_description } = $1;
+    } else {
+        # Oops. lsb-release is missing or not informative. Try other *-release files.
+        $release = "/etc/system-release";
+        if ( not -e $release ) {    # Use "/etc/system-release" if such file exists.
+            # Otherwise try other "/etc/*-release" files, but ignore "/etc/lsb-release".
+            my @releases = grep( $_ ne "/etc/lsb-release", bsd_glob( "/etc/*-release" ) );
+            # On some Fedora systems there are two files: fedora-release and redhat-release
+            # with identical content. If fedora-release present, ignore redhat-release.
+            if ( grep( $_ eq "/etc/fedora-release", @releases ) ) {
+                @releases = grep( $_ ne "/etc/redhat-release", @releases );
+            }; # if
+            if ( @releases == 1 ) {
+                $release = $releases[ 0 ];
+            } else {
+                if ( @releases == 0 ) {
+                    # No *-release files found, try debian_version.
+                    $release = "/etc/debian_version";
+                    if ( not -e $release ) {
+                        $release = undef;
+                        warning( "No release files found in \"/etc/\" directory." );
+                    }; # if
+                } else {
+                    $release = undef;
+                    warning( "More than one release files found in \"/etc/\" directory:", @releases );
+                }; # if
+            }; # if
+        }; # if
+        if ( defined( $release ) ) {
+            $bulk = read_file( $release );
+            if ( $release =~ m{system|redhat|fedora} ) {
+                # Red Hat or Fedora. Parse the first line of file.
+                # Typical values of *-release (one of):
+                #     Red Hat Enterprise Linux* OS Server release 5.2 (Tikanga)
+                #     Red Hat Enterprise Linux* OS AS release 3 (Taroon Update 4)
+                #     Fedora release 10 (Cambridge)
+                $bulk =~ m{\A(.*)$}m
+                    or runtime_error( "$release: Cannot find the first line:", $bulk, "(eof)" );
+                my $first_line = $1;
+                $values{ operating_system_description } = $first_line;
+                $first_line =~ m{\A(.*?)\s+release\s+(.*?)(?:\s+\((.*?)(?:\s+Update\s+(.*?))?\))?\s*$}
+                    or runtime_error( "$release:1: Cannot parse line:", $first_line );
+                $values{ operating_system_name    }  = $1;
+                $values{ operating_system_release }  = $2 . ( defined( $4 ) ? ".$4" : "" );
+                $values{ operating_system_codename } = $3;
+            } elsif ( $release =~ m{SuSE} ) {
+                # Typical SuSE-release:
+                #     SUSE Linux* OS Enterprise Server 10 (x86_64)
+                #     VERSION = 10
+                #     PATCHLEVEL = 2
+                $bulk =~ m{\A(.*)$}m
+                    or runtime_error( "$release: Cannot find the first line:", $bulk, "(eof)" );
+                my $first_line = $1;
+                $values{ operating_system_description } = $first_line;
+                $first_line =~ m{^(.*?)\s*(\d+)\s*\(.*?\)\s*$}
+                    or runtime_error( "$release:1: Cannot parse line:", $first_line );
+                $values{ operating_system_name } = $1;
+                $bulk =~ m{^VERSION\s*=\s*(.*)\s*$}m
+                    or runtime_error( "$release: There is no VERSION:", $bulk, "(eof)" );
+                $values{ operating_system_release } = $1;
+                if ( $bulk =~ m{^PATCHLEVEL\s*=\s*(.*)\s*$}m ) {
+                    $values{ operating_system_release } .= ".$1";
+                }; # if
+            } elsif ( $release =~ m{debian_version} ) {
+                # Debian. The file debian_version contains just version number, nothing more:
+                #     4.0
+                my $name = "Debian";
+                $bulk =~ m{\A(.*)$}m
+                    or runtime_error( "$release: Cannot find the first line:", $bulk, "(eof)" );
+                my $version = $1;
+                $values{ operating_system_name        } = $name;
+                $values{ operating_system_release     } = $version;
+                $values{ operating_system_codename    } = "unknown";
+                $values{ operating_system_description } = sprintf( "%s %s", $name, $version );
+            }; # if
+        }; # if
+    }; # if
+    if ( not defined( $values{ operating_system_name } ) ) {
+        $values{ operating_system_name } = "GNU/Linux";
+    }; # if
+} elsif ( $values{ kernel_name } eq "Darwin" ) {
+    my %codenames = (
+        "10.4" => "Tiger",
+        "10.5" => "Leopard",
+        "10.6" => "Snow Leopard",
+    );
+   my $darwin;
+   my $get_os_info =
+       sub {
+           my ( $name ) = @_;
+           if ( not defined $darwin ) {
+               $darwin->{ operating_system } = "Darwin";
+               # sw_vers prints OS X* version to stdout:
+               #     ProductName:       OS X*
+               #     ProductVersion:    10.4.11
+               #     BuildVersion:      8S2167
+               # It does not print codename, so we code OS X* codenames here.
+               my $sw_vers = which( "sw_vers" ) || "/usr/bin/sw_vers";
+               my $output;
+               debug( "Executing $sw_vers..." );
+               execute( [ $sw_vers ], -stdout => \$output, -stderr => undef );
+               $output =~ m{^ProductName:\s*(.*)\s*$}m
+                   or runtime_error( "There is no ProductName in sw_vers output:", $output, "(eof)" );
+               my $product_name = $1;    # Do not shadow $name, the requested property name.
+               $output =~ m{^ProductVersion:\s*(.*)\s*$}m
+                   or runtime_error( "There is no ProductVersion in sw_vers output:", $output, "(eof)" );
+               my $release = $1;
+               # Sometimes release reported as "10.4.11" (3 components), sometimes as "10.6".
+               # Handle both variants. Only "major.minor" part is used to look up the codename.
+               $release =~ m{^(\d+\.\d+)(?:\.\d+)?(?=\s|$)}
+                   or runtime_error( "Cannot parse OS X* version: $release" );
+               my $version = $1;
+               my $codename = ( $codenames{ $version } or "unknown" );
+               $darwin->{ operating_system_name        } = $product_name;
+               $darwin->{ operating_system_release     } = $release;
+               $darwin->{ operating_system_codename    } = $codename;
+               $darwin->{ operating_system_description } = sprintf( "%s %s (%s)", $product_name, $release, $codename );
+           }; # if
+           return $darwin->{ $name };
+       }; # sub
+    $values{ operating_system             } = sub { $get_os_info->( "operating_system"             ); };
+    $values{ operating_system_name        } = sub { $get_os_info->( "operating_system_name"        ); };
+    $values{ operating_system_release     } = sub { $get_os_info->( "operating_system_release"     ); };
+    $values{ operating_system_codename    } = sub { $get_os_info->( "operating_system_codename"    ); };
+    $values{ operating_system_description } = sub { $get_os_info->( "operating_system_description" ); };
+} elsif ( $values{ kernel_name } =~ m{\AWindows[ _]NT\z} ) {
+    $values{ operating_system } = "MS Windows";
+    # my @os_name = Win32::GetOSName();
+    # $values{ operating_system_release } = $os_name[ 0 ];
+    # $values{ operating_system_update  } = $os_name[ 1 ];
+} elsif ( $values{ kernel_name } =~ m{\ACYGWIN_NT-} ) {
+    $values{ operating_system } = "MS Windows";
+} elsif ( $values{ kernel_name } =~ m{\AFreeBSD} ) {
+    $values{ operating_system } = "FreeBSD";
+} elsif ( $values{ kernel_name } =~ m{\ANetBSD} ) {
+    $values{ operating_system } = "NetBSD";
+} else {
+    die "Unsupported kernel_name (\"$values{ kernel_name }\") returned by POSIX::uname(); stopped";
+}; # if
+
+# host_name and domain_name
+# Both values are deferred: they are computed from the FQDN on the first request. Host name is
+# the part before the first dot, domain name is everything after that dot.
+$values{ host_name } =
+    sub {
+        my $full_name = value( "fqdn" );
+        $full_name =~ m{\A([^.]*)(?:\.(.*))?\z};
+        my $host = $1;
+        die "Unexpected error: undefined or empty host name; stopped"
+            unless defined( $host ) and $host ne "";
+        return $host;
+    };
+$values{ domain_name } =
+    sub {
+        my $full_name = value( "fqdn" );
+        $full_name =~ m{\A([^.]*)(?:\.(.*))?\z};
+        my $domain = $2;
+        die "Unexpected error: undefined or empty domain name; stopped"
+            unless defined( $domain ) and $domain ne "";
+        return $domain;
+    };
+
+# Any property which was not determined above is reported as "unknown".
+foreach my $property ( grep( !defined( $values{ $_ } ), @all ) ) {
+    $values{ $property } = "unknown";
+}; # foreach $property
+
+# Define one function per property: calling it returns value( <property name> ).
+# The glob is assigned through a symbolic reference, so strict "refs" is relaxed in the loop.
+foreach my $property ( @all ) {
+    no strict "refs";
+    my $getter = sub { return value( $property ); };
+    *{ $property } = $getter;
+}; # foreach $property
+
+# Returns names of base properties: ones filled from POSIX::uname() plus the Linux-style ones.
+sub base_names {
+    return @base;
+}; # sub base_names
+
+# Returns names of all the properties: base properties followed by auxiliary ones.
+sub all_names {
+    return @all;
+}; # sub all_names
+
+# Returns the value of the property specified by name. A value stored as a code reference is
+# a deferred computation: it is called once, and its result replaces the reference in %values.
+sub value($) {
+    my ( $name ) = @_;
+    my $current = $values{ $name };
+    return $current if not ref( $current );
+    # Deferred value: evaluate it and remember the result for the next calls.
+    return $values{ $name } = $current->();
+}; # sub value
+
+return 1;
+
+__END__
+
+=pod
+
+=head1 NAME
+
+B<Uname.pm> -- A few subroutines to get system information usually provided by
+C</bin/uname> and C<POSIX::uname()>.
+
+=head1 SYNOPSIS
+
+    use Uname;
+
+    # Base property functions.
+    $kernel_name       = Uname::kernel_name();
+    $fqdn              = Uname::fqdn();
+    $kernel_release    = Uname::kernel_release();
+    $kernel_version    = Uname::kernel_version();
+    $machine           = Uname::machine();
+    $processor         = Uname::processor();
+    $hardware_platform = Uname::hardware_platform();
+    $operating_system  = Uname::operating_system();
+
+    # Auxiliary property functions.
+    $host_name         = Uname::host_name();
+    $domain_name       = Uname::domain_name();
+    $os_name           = Uname::operating_system_name();
+    $os_release        = Uname::operating_system_release();
+    $os_codename       = Uname::operating_system_codename();
+    $os_description    = Uname::operating_system_description();
+
+    # Meta functions.
+    @base_names  = Uname::base_names();
+    @all_names   = Uname::all_names();
+    $kernel_name = Uname::value( "kernel_name" );
+
+=head1 DESCRIPTION
+
+B<Uname.pm> resembles functionality found in C<POSIX::uname()> function or in C<uname> program.
+However, both C<POSIX::uname()> and C</bin/uname> have some disadvantages:
+
+=over
+
+=item *
+
+C<uname> may be not available in some environments, for example, in Windows* OS
+(C<uname> may be found in some third-party software packages, like MKS Toolkit or Cygwin, but it is
+not a part of OS).
+
+=item *
+
+There are many different versions of C<uname>. For example, C<uname> on OS X* does not
+recognize options C<-i>, C<-o>, and any long options.
+
+=item *
+
+Different versions of C<uname> may report the same property differently. For example,
+C<uname> on Linux* OS reports machine as C<i686>, while C<uname> on OS X* reports the same machine as
+C<x86>.
+
+=item *
+
+C<POSIX::uname()> returns list of values. I cannot recall what is the fourth element of the list.
+
+=back
+
+=head2 Base Functions
+
+Base property functions provide the information as C<uname> program.
+
+=over
+
+=item B<kernel_name()>
+
+Returns the kernel name, as reported by C<POSIX::uname()>.
+
+=item B<fqdn()>
+
+Returns the FQDN, fully qualified domain name. On some systems C<POSIX::uname()> reports short node
+name (with no domain name), on others C<POSIX::uname()> reports full node name. This
+function strives to return FQDN always (by refining C<POSIX::uname()> with
+C<Net::Domain::hostfqdn()>).
+
+=item B<kernel_release()>
+
+Returns the kernel release string, as reported by C<POSIX::uname()>. Usually the string consists of
+several numbers, separated by dots and dashes, but may also include some non-numeric substrings like
+"smp".
+
+=item B<kernel_version()>
+
+Returns the kernel version string, as reported by C<POSIX::uname()>. It is B<not> several
+dot-separated numbers but much longer string describing the kernel.
+For example, on Linux* OS it includes build date.
+If you look for something identifying the kernel, look at L<kernel_release>.
+
+=item B<machine()>
+
+Returns the machine hardware name, as reported by POSIX::uname(). Not reliable. Different OSes may
+report the same machine hardware name differently. For example, Linux* OS reports C<i686>, while OS X*
+reports C<x86> on the same machine.
+
+=item B<processor()>
+
+Returns the processor type. Not reliable. Usually the same as C<machine>.
+
+=item B<hardware_platform()>
+
+One of: C<i386>, C<x86_64>, C<arm>, C<ppc64le>, C<ppc64>, C<aarch64>, C<mips64>, or C<mips>.
+
+=item B<operating_system()>
+
+One of: C<GNU/Linux>, C<Darwin>, C<MS Windows>, C<FreeBSD>, or C<NetBSD>.
+
+=back
+
+=head2 Auxiliary Functions
+
+Auxiliary functions extend base functions with information not reported by C<uname> program.
+
+Auxiliary functions collect information from different sources. For example, on OS X*, they may
+call C<sw_vers> program to find out OS release; on Linux* OS they may parse C</etc/redhat-release> file,
+etc.
+
+=over
+
+=item B<host_name()>
+
+Returns host name (FQDN with dropped domain part).
+
+=item B<domain_name()>
+
+Returns domain name (FQDN with dropped host part).
+
+=item B<operating_system_name>
+
+Name of operating system or name of Linux* OS distribution, like "Fedora" or
+"Red Hat Enterprise Linux* OS Server".
+
+=item B<operating_system_release>
+
+Release (version) of operating system or Linux* OS distribution. Usually it is a series of
+dot-separated numbers.
+
+=item B<operating_system_codename>
+
+Codename of operating system release or Linux* OS distribution. For example, Fedora 10 is "Cambridge"
+while OS X* 10.4 is "Tiger".
+
+=item B<operating_system_description>
+
+Longer string. Usually it includes all the operating system properties mentioned above -- name,
+release, codename in parentheses.
+
+=back
+
+=head2 Meta Functions
+
+=over
+
+=item B<base_names()>
+
+This function returns the list of base property names.
+
+=item B<all_names()>
+
+This function returns the list of all property names.
+
+=item B<value(> I<name> B<)>
+
+This function returns the value of the property specified by I<name>.
+
+=back
+
+=head1 EXAMPLES
+
+    use Uname;
+
+    print( Uname::operating_system_description(), "\n" );
+
+    foreach my $name ( Uname::all_names() ) {
+        print( "$name=\"" . Uname::value( $name ) . "\"\n" );
+    }; # foreach $name
+
+=head1 SEE ALSO
+
+L<POSIX::uname>, L<uname>.
+
+=cut
+
+# end of file #
+
diff --git a/final/runtime/tools/lib/tools.pm b/final/runtime/tools/lib/tools.pm
new file mode 100644
index 0000000..cbed636
--- /dev/null
+++ b/final/runtime/tools/lib/tools.pm
@@ -0,0 +1,1980 @@
+#
+# This is not a runnable script, it is a Perl module, a collection of variables, subroutines, etc.
+# to be used in other scripts.
+#
+# To get help about exported variables and subroutines, please execute the following command:
+#
+#     perldoc tools.pm
+#
+# or see POD (Plain Old Documentation) imbedded to the source...
+#
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+=head1 NAME
+
+B<tools.pm> -- A collection of subroutines which are widely used in Perl scripts.
+
+=head1 SYNOPSIS
+
+    use FindBin;
+    use lib "$FindBin::Bin/lib";
+    use tools;
+
+=head1 DESCRIPTION
+
+B<Note:> Because this collection is small and intended for widely using in particular project,
+all variables and functions are exported by default.
+
+B<Note:> I have some ideas how to improve this collection, but it is in my long-term plans.
+Current shape is not ideal, but good enough to use.
+
+=cut
+
+package tools;
+
+use strict;
+use warnings;
+
+use vars qw( @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS );
+require Exporter;
+@ISA = qw( Exporter );
+
+my @vars   = qw( $tool );                                                         # Exported variables.
+my @utils  = qw( check_opts validate );                                           # Argument checking.
+my @opts   = qw( get_options );                                                   # Command line parsing.
+my @print  = qw( debug info warning cmdline_error runtime_error question );       # Messages.
+my @name   = qw( get_vol get_dir get_file get_name get_ext cat_file cat_dir );    # File name parts.
+my @file   = qw( which abs_path rel_path real_path make_dir clean_dir copy_dir move_dir del_dir change_dir copy_file move_file del_file );
+my @io     = qw( read_file write_file );                                          # File content.
+my @exec   = qw( execute backticks );                                             # Running programs.
+my @string = qw{ pad };                                                           # String helpers.
+@EXPORT = ( @utils, @opts, @vars, @print, @name, @file, @io, @exec, @string );    # All the groups are exported by default.
+
+use UNIVERSAL    ();
+
+use FindBin;
+use IO::Handle;
+use IO::File;
+use IO::Dir;
+# Not available on some machines: use IO::Zlib;
+
+use Getopt::Long ();
+use Pod::Usage   ();
+use Carp         ();
+use File::Copy   ();
+use File::Path   ();
+use File::Temp   ();
+use File::Spec   ();
+use POSIX        qw{ :fcntl_h :errno_h };
+use Cwd          ();
+use Symbol       ();
+
+use Data::Dumper;
+
+use vars qw( $tool $verbose $timestamps );
+$tool = $FindBin::Script;    # Name of the running script, as found by FindBin.
+
+my @warning = ( sub {}, \&warning, \&runtime_error );    # NOTE(review): presumably handlers indexed by severity (ignore, warn, fail) -- confirm.
+
+
+sub check_opts(\%$;$) {
+
+    # $opts -- reference to the hash of actual options; unknown options are deleted from it.
+    # $good -- reference to an array containing all known option names.
+    # $msg  -- optional message for the warning.
+    my ( $opts, $good, $msg ) = @_;
+    $msg = "unknown option(s) passed" if not defined( $msg );
+
+    # Build the set of known option names.
+    my %known;
+    @known{ @$good } = ();
+
+    # Collect (in sorted order) names of the actual options which are not in the set, and
+    # delete those options from the hash of actual options.
+    my @unknown = grep( !exists( $known{ $_ } ), sort( keys( %$opts ) ) );
+    delete( @$opts{ @unknown } );
+
+    if ( @unknown ) {
+        # Issue a warning with backtrace. The message is prefixed with the name of the
+        # subroutine which called check_opts(). The function never dies because of bad options.
+        my @frame = caller( 1 );
+        local $Carp::CarpLevel = 2;
+        Carp::cluck( $frame[ 3 ] . ": " . $msg . ": " . join( ", ", @unknown ) );
+    }; # if
+
+    return 1;
+
+}; # sub check_opts
+
+
+
+# --------------------------------------------------------------------------------------------------
+# Purpose:
+#     Check subroutine arguments.
+# Synopsis:
+#     my %opts = validate( params => \@_, spec => { ... }, caller => n );
+# Arguments:
+#     params -- A reference to subroutine's actual arguments.
+#     spec   -- Specification of expected arguments.
+#     caller -- Number of extra call frames to skip when finding file and line for error messages.
+# Return value:
+#     A hash of validated options.
+# Description:
+#     I would like to use Params::Validate module, but it is not a part of default Perl
+#     distribution, so I cannot rely on it. This subroutine resembles to some extent to
+#     Params::Validate::validate_with().
+#     Specification of expected arguments:
+#        { $opt => { type => $type, default => $default }, ... }
+#        $opt     -- String, option name.
+#        $type    -- String, expected type(s). Allowed values are "SCALAR", "UNDEF", "BOOLEAN",
+#                    "ARRAYREF", "HASHREF", "CODEREF". Multiple types may listed using bar:
+#                    "SCALAR|ARRAYREF". The type string is case-insensitive.
+#        $default -- Default value for an option. Will be used if option is not specified or
+#                    undefined.
+#
+sub validate(@) {
+
+    # Arguments of validate() itself are passed as ( name => value ) pairs:
+    #     params -- reference to the array of actual arguments; the array is consumed.
+    #     spec   -- reference to the hash describing expected options.
+    #     caller -- optional number added to the depth of the call frame used in messages.
+    my %args   = @_;
+    my $params = $args{ params };
+    my $spec   = $args{ spec };
+    my $depth  = ( $args{ caller } or 0 ) + 1;
+    my %result;    # Validated options are collected here.
+
+    # Errors are reported against the file and line of the chosen call frame. They are
+    # accumulated, so all the problems are reported at once, not only the first one.
+    my ( $file, $line ) = ( caller( $depth ) )[ 1, 2 ];
+    my @errors;
+    my $error =
+        sub ($) {
+            my ( $text ) = @_;
+            push( @errors, "$text at $file line $line.\n" );
+        }; # sub
+
+    # Actual arguments are processed as ( name, value ) pairs.
+    while ( @$params ) {
+        my $opt = shift( @$params );
+        # Unknown option: report it and skip its value.
+        if ( not exists( $spec->{ $opt } ) ) {
+            $error->( "Invalid option `$opt'" );
+            shift( @$params );
+            next;
+        }; # if
+        # The name was the last element, so the option has no value.
+        if ( not @$params ) {
+            $error->( "Option `$opt' does not have a value" );
+            next;
+        }; # if
+        my $val  = shift( @$params );
+        my $rule = $spec->{ $opt };
+        # Check option value type, if type specification exists.
+        if ( exists( $rule->{ type } ) ) {
+            # Actual type is "<ref type>REF" for references, otherwise "SCALAR" or "UNDEF".
+            my $actual = ref( $val ) ne "" ? ref( $val ) . "REF" : defined( $val ) ? "SCALAR" : "UNDEF";
+            # Allowed types are matched case-insensitively; "boolean" means scalar or undef.
+            my @wanted  = split( m{\|}, lc( $rule->{ type } ) );
+            my $pattern = join( "|", map( $_ eq "boolean" ? "scalar|undef" : quotemeta( $_ ), @wanted ) );
+            if ( $actual !~ m{\A(?:$pattern)\z}i ) {
+                my $expected = lc( join( " or ", map( "`$_'", @wanted ) ) );
+                $error->( "Option `$opt' value type is `" . lc( $actual ) . "' but expected to be $expected" );
+                next;
+            }; # if
+        }; # if
+        # Check the value against the list of allowed values, if the list exists.
+        if ( exists( $rule->{ values } )  ) {
+            my @allowed = @{ $rule->{ values } };
+            if ( not grep( $_ eq $val, @allowed ) ) {
+                my $list = join( ", ", map( "`$_'", @allowed ) );
+                $error->( "Option `$opt' value is `$val' but expected to be one of $list" );
+                next;
+            }; # if
+        }; # if
+        $result{ $opt } = $val;
+    }; # while
+
+    # Options which are absent or undefined get default values, if the specification has them.
+    foreach my $opt ( keys( %$spec ) ) {
+        next if defined( $result{ $opt } );
+        next if not exists( $spec->{ $opt }->{ default } );
+        $result{ $opt } = $spec->{ $opt }->{ default };
+    }; # foreach $opt
+
+    # If we found any errors, raise all of them in one exception.
+    die join( "", @errors ) if @errors;
+
+    return %result;
+
+}; # sub validate
+
+# =================================================================================================
+# Get option helpers.
+# =================================================================================================
+
+=head2 Get option helpers.
+
+=cut
+
+# -------------------------------------------------------------------------------------------------
+
+=head3 get_options
+
+B<Synopsis:>
+
+    get_options( @arguments )
+
+B<Description:>
+
+It is a very simple wrapper around Getopt::Long::GetOptions. It passes all arguments to GetOptions,
+and adds definitions for standard help options: --help, --doc, --verbose, and --quiet.
+When GetOptions finishes, this subroutine checks its result; if it indicates a failure, a standard
+error message is issued and the script is terminated.
+
+If --verbose or --quiet option is specified, C<tools.pm_verbose> environment variable is set.
+It is the way to propagate verbose/quiet mode to callee Perl scripts.
+
+=cut
+
+sub get_options {
+
+    # Standard options which are added on top of the option definitions passed by the caller.
+    my @standard = (
+        "h0|usage"        => sub { Pod::Usage::pod2usage( -exitval => 0, -verbose => 0 ); },
+        "h1|h|help"       => sub { Pod::Usage::pod2usage( -exitval => 0, -verbose => 1 ); },
+        "h2|doc|manual"   => sub { Pod::Usage::pod2usage( -exitval => 0, -verbose => 2 ); },
+        "version"         => sub { print( "$tool version $main::VERSION\n" ); exit( 0 ); },
+        # Verbosity and timestamp settings are also stored in the environment.
+        "v|verbose"       => sub { ++ $verbose;     $ENV{ "tools.pm_verbose"    } = $verbose;    },
+        "quiet"           => sub { -- $verbose;     $ENV{ "tools.pm_verbose"    } = $verbose;    },
+        "with-timestamps" => sub { $timestamps = 1; $ENV{ "tools.pm_timestamps" } = $timestamps; },
+    );
+
+    Getopt::Long::Configure( "no_ignore_case" );
+    # Caller arguments are at the end so caller options override the standard ones.
+    Getopt::Long::GetOptions( @standard, @_ ) or cmdline_error();
+
+}; # sub get_options
+
+
+# =================================================================================================
+# Print utilities.
+# =================================================================================================
+
+=pod
+
+=head2 Print utilities.
+
+Each of the print subroutines prepends each line of its output with the name of current script and
+the type of information, for example:
+
+    info( "Writing file..." );
+
+will print
+
+    <script>: (i): Writing file...
+
+while
+
+    warning( "File does not exist!" );
+
+will print
+
+    <script>: (!): File does not exist!
+
+Here are exported items:
+
+=cut
+
+# -------------------------------------------------------------------------------------------------
+
+# Build the text printed by debug(), info(), warning(), runtime_error(), and question().
+#
+# Arguments:
+#     $prefix -- marker placed in parentheses after the tool name (e. g. "i", "!", "x").
+#     \@args  -- messages (the prototype turns the array into a reference); each message is split
+#                into lines on "\n".
+#     $no_eol -- optional; if defined, the very last line of the result is not terminated with "\n".
+#
+# Returns a string, every line of which looks like "[<timestamp> UTC: ]<tool>: (<prefix>) <line>".
+# Note: split() drops empty messages and trailing empty lines, so they produce no output.
+sub _format_message($\@;$) {
+
+    my $prefix  = shift( @_ );
+    my $args    = shift( @_ );
+    my $no_eol  = shift( @_ );  # Do not append "\n" to the last line.
+    my $message = "";
+
+    # Timestamp is calculated once, so all the lines of one message carry the same time.
+    my $ts = "";
+    if ( $timestamps ) {
+        my ( $sec, $min, $hour, $day, $month, $year ) = gmtime();
+        $month += 1;
+        $year  += 1900;
+        $ts = sprintf( "%04d-%02d-%02d %02d:%02d:%02d UTC: ", $year, $month, $day, $hour, $min, $sec );
+    }; # if
+    for my $i ( 1 .. @$args ) {
+        my @lines = split( "\n", $args->[ $i - 1 ] );
+        for my $j ( 1 .. @lines ) {
+            my $line = $lines[ $j - 1 ];
+            my $last_line = ( ( $i == @$args ) and ( $j == @lines ) );
+            # Lines never end with "\n" here (split() removed it), so every line gets "\n" back,
+            # except the very last one when $no_eol is requested. The condition is fully
+            # parenthesized because `or'/`and' bind looser than `?:'.
+            my $eol = ( ( $last_line and defined( $no_eol ) ) ? "" : "\n" );
+            $message .= "$ts$tool: ($prefix) " . $line . $eol;
+        }; # foreach $j
+    }; # foreach $i
+    return $message;
+
+}; # sub _format_message
+
+#--------------------------------------------------------------------------------------------------
+
+=pod
+
+=head3 $verbose
+
+B<Synopsis:>
+
+    $verbose
+
+B<Description:>
+
+Package variable. It determines verbosity level, which affects C<warning()>, C<info()>, and
+C<debug()> subroutines.
+
+The variable gets its initial value from the C<tools.pm_verbose> environment variable if it exists.
+If the environment variable does not exist, the variable is set to 2.
+
+Initial value may be overridden later directly or by C<get_options> function.
+
+=cut
+
+# Default verbosity level is 2; a value found in the environment takes precedence.
+$verbose = 2;
+if ( exists( $ENV{ "tools.pm_verbose" } ) ) {
+    $verbose = $ENV{ "tools.pm_verbose" };
+}; # if
+
+#--------------------------------------------------------------------------------------------------
+
+=pod
+
+=head3 $timestamps
+
+B<Synopsis:>
+
+    $timestamps
+
+B<Description:>
+
+Package variable. It determines whether C<debug()>, C<info()>, C<warning()>, C<runtime_error()>
+subroutines print timestamps or not.
+
+The variable gets its initial value from the C<tools.pm_timestamps> environment variable if it exists.
+If the environment variable does not exist, the variable is set to false.
+
+Initial value may be overridden later directly or by C<get_options()> function.
+
+=cut
+
+# Timestamps are off by default; a value found in the environment takes precedence.
+$timestamps = 0;
+if ( exists( $ENV{ "tools.pm_timestamps" } ) ) {
+    $timestamps = $ENV{ "tools.pm_timestamps" };
+}; # if
+
+# -------------------------------------------------------------------------------------------------
+
+=pod
+
+=head3 debug
+
+B<Synopsis:>
+
+    debug( @messages )
+
+B<Description:>
+
+If verbosity level is 3 or higher, print debug information to the stderr, prepending it with "(#)"
+prefix.
+
+=cut
+
+sub debug(@) {
+
+    # Debug output appears only if verbosity level is 3 or higher.
+    if ( not ( $verbose >= 3 ) ) {
+        return 1;
+    }; # if
+    # Stdout is flushed before anything is written to stderr.
+    STDOUT->flush();
+    my $text = _format_message( "#", @_ );
+    STDERR->print( $text );
+    return 1;
+
+}; # sub debug
+
+#--------------------------------------------------------------------------------------------------
+
+=pod
+
+=head3 info
+
+B<Synopsis:>
+
+    info( @messages )
+
+B<Description:>
+
+If verbosity level is 2 or higher, print information to the stderr, prepending it with "(i)" prefix.
+
+=cut
+
+sub info(@) {
+
+    # Informational messages appear only if verbosity level is 2 or higher.
+    if ( $verbose >= 2 ) {
+        # Stdout is flushed before anything is written to stderr.
+        STDOUT->flush();
+        my $text = _format_message( "i", @_ );
+        STDERR->print( $text );
+    }; # if
+
+}; # sub info
+
+#--------------------------------------------------------------------------------------------------
+
+=head3 warning
+
+B<Synopsis:>
+
+    warning( @messages )
+
+B<Description:>
+
+If verbosity level is 1 or higher, issue a warning, prepending it with "(!)" prefix.
+
+=cut
+
+sub warning(@) {
+
+    # Warnings appear only if verbosity level is 1 or higher.
+    if ( $verbose >= 1 ) {
+        STDOUT->flush();
+        my $text = _format_message( "!", @_ );
+        # warn() is used (not a plain print to stderr), so a __WARN__ handler can intercept it.
+        warn( $text );
+    }; # if
+
+}; # sub warning
+
+# -------------------------------------------------------------------------------------------------
+
+=head3 cmdline_error
+
+B<Synopsis:>
+
+    cmdline_error( @message )
+
+B<Description:>
+
+Print error message and exit the program with status 2.
+
+This function is intended to complain on command line errors, e. g. unknown
+options, invalid arguments, etc.
+
+=cut
+
+sub cmdline_error(;$) {
+
+    my ( $message ) = @_;
+
+    # Normalize the message: no message means empty text, otherwise make sure it ends with a newline.
+    if ( not defined( $message ) ) {
+        $message = "";
+    } elsif ( substr( $message, -1, 1 ) ne "\n" ) {
+        $message .= "\n";
+    }; # if
+    STDOUT->flush();
+    # NOTE(review): POD promises exit status 2, but this sub only calls die(); presumably the
+    # status is set by a handler defined elsewhere -- to be confirmed.
+    die $message . "Try --help option for more information.\n";
+
+}; # sub cmdline_error
+
+# -------------------------------------------------------------------------------------------------
+
+=head3 runtime_error
+
+B<Synopsis:>
+
+    runtime_error( @message )
+
+B<Description:>
+
+Print error message and exits the program with status 3.
+
+This function is intended to complain on runtime errors, e. g.
+directories which are not found, non-writable files, etc.
+
+=cut
+
+sub runtime_error(@) {
+
+    # Flush stdout first, then die with the messages formatted with "(x)" prefix.
+    STDOUT->flush();
+    my $text = _format_message( "x", @_ );
+    die $text;
+
+}; # sub runtime_error
+
+#--------------------------------------------------------------------------------------------------
+
+=head3 question
+
+B<Synopsis:>
+
+    question( $prompt; $answer, $choices  )
+
+B<Description:>
+
+Print $prompt to the stderr, prepending it with "(?)" prefix. Read a line from stdin and chop
+"\n" from the end; the result is the answer.
+
+If $answer is defined, it is treated as first user input.
+
+If $choices is specified, it could be a regexp for validating user input, or a string. In latter
+case it interpreted as list of characters, acceptable (case-insensitive) choices. If user enters
+non-acceptable answer, question continue asking until answer is acceptable.
+If $choices is not specified, any answer is acceptable.
+
+In case of end-of-file (or Ctrl+D pressed by user), $answer is C<undef>.
+
+B<Examples:>
+
+    my $answer;
+    question( "Save file [yn]? ", $answer, "yn" );
+        # We accepts only "y", "Y", "n", or "N".
+    question( "Press enter to continue or Ctrl+C to abort..." );
+        # We are not interested in answer value -- in case of Ctrl+C the script will be terminated,
+        # otherwise we continue execution.
+    question( "File name? ", $answer );
+        # Any answer is acceptable.
+
+=cut
+
+sub question($;\$$) {
+
+    my ( $prompt, $answer, $choices ) = @_;
+    # $answer, if passed, is a reference to caller's scalar (see the prototype). Its current value,
+    # if defined, is treated as the first user input.
+    my $reply  = ( defined( $answer ) ? $$answer : undef );
+    my @prompt = ( $prompt );
+
+    if ( defined( $choices ) and ref( $choices ) ne "Regexp" ) {
+        # It is a string: treat it as a set of acceptable characters and convert it to
+        # a case-insensitive regular expression. (A regular expression is used as is.)
+        my $chars = quotemeta( $choices );
+        $choices = qr/[$chars]/i;
+    }; # if
+
+    while ( 1 ) {
+        STDERR->print( _format_message( "?", @prompt, "no_eol" ) );
+        STDERR->flush();
+        if ( defined( $reply ) ) {
+            # Predefined answer: echo it as if it was typed by user.
+            STDOUT->print( $reply . "\n" );
+        } else {
+            $reply = <STDIN>;
+        }; # if
+        # End-of-file: give up, answer remains undefined.
+        last if not defined( $reply );
+        chomp( $reply );
+        last if not defined( $choices ) or ( $reply =~ m/^$choices$/ );
+        # Answer is not acceptable, ask again.
+        $reply = undef;
+    }; # forever
+    if ( defined( $answer ) ) {
+        $$answer = $reply;
+    }; # if
+
+}; # sub question
+
+# -------------------------------------------------------------------------------------------------
+
+# Returns volume part of path (the first item returned by File::Spec->splitpath()).
+sub get_vol($) {
+
+    my ( $path ) = @_;
+    return ( File::Spec->splitpath( $path ) )[ 0 ];
+
+}; # sub get_vol
+
+# Returns directory part of path (including volume, if any), without trailing directory separator.
+# The path is canonicalized first; path without directory part gives an empty string.
+sub get_dir($) {
+
+    my $path = File::Spec->canonpath( shift( @_ ) );
+    my ( $vol, $dir, undef ) = File::Spec->splitpath( $path );
+    my @dirs = File::Spec->splitdir( $dir );
+    # Directory part returned by splitpath() ends with separator, so the last item is empty. Drop it.
+    pop( @dirs );
+    $dir = File::Spec->catdir( @dirs );
+    # Pass an empty file name (as cat_dir() does), not undef: undef may cause "uninitialized value"
+    # warnings inside File::Spec.
+    $dir = File::Spec->catpath( $vol, $dir, "" );
+    return $dir;
+
+}; # sub get_dir
+
+# Returns file part of path (the last item returned by File::Spec->splitpath()).
+sub get_file($) {
+
+    my ( $path ) = @_;
+    return ( File::Spec->splitpath( $path ) )[ 2 ];
+
+}; # sub get_file
+
+# Returns file part of path with the last suffix (the last dot and everything after it) removed.
+sub get_name($) {
+
+    my ( $path ) = @_;
+    my $name = ( File::Spec->splitpath( $path ) )[ 2 ];
+    $name =~ s{\.[^.]*\z}{};
+    return $name;
+
+}; # sub get_name
+
+# Returns last suffix of file part of path (including leading dot), or an empty string if file
+# part has no suffix.
+sub get_ext($) {
+
+    my ( $path ) = @_;
+    my $name = ( File::Spec->splitpath( $path ) )[ 2 ];
+    return ( ( $name =~ m{(\.[^.]*)\z} ) ? $1 : "" );
+
+}; # sub get_ext
+
+# Concatenates a base path, optional intermediate directories, and a file name into one path.
+# The first argument is the base path (treated as a directory), the last one is the file name.
+sub cat_file(@) {
+
+    my ( $base, @parts ) = @_;
+    my $file = pop( @parts );
+
+    # "no_file" is a true value: the whole base path is a directory, there is no file part in it.
+    my ( $vol, $base_dirs ) = File::Spec->splitpath( $base, "no_file" );
+    my $dirs = File::Spec->catdir( File::Spec->splitdir( $base_dirs ), @parts );
+
+    return File::Spec->catpath( $vol, $dirs, $file );
+
+}; # sub cat_file
+
+# Concatenates a base path and additional directories into one directory path.
+sub cat_dir(@) {
+
+    my ( $base, @parts ) = @_;
+
+    # "no_file" is a true value: the whole base path is a directory, there is no file part in it.
+    my ( $vol, $base_dirs ) = File::Spec->splitpath( $base, "no_file" );
+    my $dirs = File::Spec->catdir( File::Spec->splitdir( $base_dirs ), @parts );
+
+    return File::Spec->catpath( $vol, $dirs, "" );
+
+}; # sub cat_dir
+
+# =================================================================================================
+# File and directory manipulation subroutines.
+# =================================================================================================
+
+=head2 File and directory manipulation subroutines.
+
+=over
+
+=cut
+
+# -------------------------------------------------------------------------------------------------
+
+=item C<which( $file, @options )>
+
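+Searches for the specified executable file in the (specified) directories.
+Returns the full path of the found executable(s). Note: currently no runtime error is raised if no
+executable file is found (that check is disabled in the code); C<undef> (or an empty list if
+C<-all> is specified) is returned instead.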
+
+Options:
+
+=over
+
+=item C<-all> =E<gt> I<bool>
+
+Do not stop on the first found file. Note, that list of full paths is returned in this case.
+
+=item C<-dirs> =E<gt> I<ref_to_array>
+
+Specify directory list to search through. If option is not passed, PATH environment variable
+is used for directory list.
+
+=item C<-exec> =E<gt> I<bool>
+
+Whether check for executable files or not. By default, C<which> searches executable files.
+However, on Cygwin executable check never performed.
+
+=back
+
+Examples:
+
+Look for "echo" in the directories specified in PATH:
+
+    my $echo = which( "echo" );
+
+Look for all occurrences of "cp" in the PATH:
+
+    my @cps = which( "cp", -all => 1 );
+
+Look for the first occurrence of "icc" in the specified directories:
+
+    my $icc = which( "icc", -dirs => [ ".", "/usr/local/bin", "/usr/bin", "/bin" ] );
+
+Look for the C<omp_lib.f> file:
+
+    my @omp_lib = which( "omp_lib.f", -all => 1, -exec => 0, -dirs => [ @include ] );
+
+=cut
+
+sub which($@) {
+
+    my ( $file, %opts ) = @_;
+
+    check_opts( %opts, [ qw( -all -dirs -exec ) ] );
+    if ( $opts{ -all } and not wantarray() ) {
+        # List of paths was requested, but caller is not going to receive a list.
+        local $Carp::CarpLevel = 1;
+        Carp::cluck( "`-all' option passed to `which' but list is not expected" );
+    }; # if
+    # By default only executable files are searched.
+    $opts{ -exec } = 1 if not defined( $opts{ -exec } );
+
+    my $dirs = ( exists( $opts{ -dirs } ) ? $opts{ -dirs } : [ File::Spec->path() ] );
+
+    # File name is tried as is first; on Windows* OS executable suffixes are tried as well.
+    my @exts = ( "" );
+    if ( $^O eq "MSWin32" and $opts{ -exec } ) {
+        # If PATHEXT does not exist, use default value.
+        push(
+            @exts,
+            ( defined( $ENV{ PATHEXT } ) ? split( ";", $ENV{ PATHEXT } ) : qw{ .COM .EXE .BAT .CMD } )
+        );
+    }; # if
+
+    my @found;
+    search:
+    foreach my $dir ( @$dirs ) {
+        foreach my $ext ( @exts ) {
+            my $path = File::Spec->catfile( $dir, $file . $ext );
+            next if not -e $path;
+            # Executable bit is not reliable on Cygwin, do not check it.
+            next if $opts{ -exec } and not -x $path and $^O ne "cygwin";
+            push( @found, $path );
+            last search if not $opts{ -all };
+        }; # foreach $ext
+    }; # foreach $dir
+
+    # TBD: We need to introduce an option for conditional enabling an error if nothing found:
+    # runtime_error( "Could not find \"$file\" executable file in PATH." );
+    # TBD: Issue a warning if more than one file found?
+
+    return ( $opts{ -all } ? @found : $found[ 0 ] );
+
+}; # sub which
+
+# -------------------------------------------------------------------------------------------------
+
+=item C<abs_path( $path, $base )>
+
+Return absolute path for an argument.
+
+Most of the work is done by C<File::Spec->rel2abs()>. C<abs_path()> additionally collapses
+C<dir1/../dir2> to C<dir2>.
+
+This collapsing is purely textual (symbolic links are not resolved), and it is made intentionally.
+For example, on Linux* OS in Bash, if F<link/> is a symbolic link to directory F<some_dir/>
+
+    $ cd link
+    $ cd ..
+
+brings you back to F<link/>'s parent, not to the parent of F<some_dir/>.
+
+=cut
+
+sub abs_path($;$) {
+
+    my ( $path, $base ) = @_;
+    # If base is not specified, PWD environment variable is used.
+    $base = $ENV{ PWD } if not defined( $base );
+    my ( $vol, $dir, $file ) = File::Spec->splitpath( File::Spec->rel2abs( $path, $base ) );
+    # Collapse "/name/.." pairs one by one until nothing left to collapse. It is done textually,
+    # "name" itself must not start with "..".
+    1 while ( $dir =~ s{/(?!\.\.)[^/]*/\.\.(?:/|\z)}{/} );
+    return File::Spec->canonpath( File::Spec->catpath( $vol, $dir, $file ) );
+
+}; # sub abs_path
+
+# -------------------------------------------------------------------------------------------------
+
+=item C<rel_path( $path, $base )>
+
+Return relative path for an argument.
+
+=cut
+
+sub rel_path($;$) {
+
+    my ( $path, $base ) = @_;
+    # Make the path absolute first (see abs_path()), then make it relative to the base.
+    my $abs = abs_path( $path );
+    return File::Spec->abs2rel( $abs, $base );
+
+}; # sub rel_path
+
+# -------------------------------------------------------------------------------------------------
+
+=item C<real_path( $dir )>
+
+Return real absolute path for an argument. In the result all relative components (F<.> and F<..>)
+and I<symbolic links are resolved>.
+
+In most cases it is not what you want. Consider using C<abs_path> first.
+
+C<abs_path> function from B<Cwd> module works with directories only. This function works with files
+as well. But, if file is a symbolic link, function does not resolve it (yet).
+
+The function uses C<runtime_error> to raise an error if something wrong.
+
+=cut
+
+sub real_path($) {
+
+    my $orig_path = shift( @_ );
+    my $dir_path  = $orig_path;    # What is passed to Cwd::abs_path: the path itself if it is
+                                   # a directory, or directory part of the path otherwise.
+    my $real_path;
+    my $message = "";
+    if ( not -e $orig_path ) {
+        $message = "\"$orig_path\" does not exists";
+    } else {
+        # Cwd::abs_path does not work with files, so in this case we should handle file separately.
+        my $file;
+        if ( not -d $orig_path ) {
+            ( my $vol, my $dir, $file ) = File::Spec->splitpath( File::Spec->rel2abs( $orig_path ) );
+            # $orig_path is intentionally left intact: it is used in the error message below.
+            $dir_path = File::Spec->catpath( $vol, $dir, "" );
+        }; # if
+        {
+            # Catch warnings issued by Cwd::abs_path to report them as an error below.
+            local $SIG{ __WARN__ } = sub { $message = $_[ 0 ]; };
+            $real_path = Cwd::abs_path( $dir_path );
+        };
+        # Append file name only if directory was resolved, otherwise failure would be masked.
+        if ( defined( $file ) and defined( $real_path ) ) {
+            $real_path = File::Spec->catfile( $real_path, $file );
+        }; # if
+    }; # if
+    if ( not defined( $real_path ) or $message ne "" ) {
+        # Leave only the reason, drop "stat(...): " prefix and " at <file> line <n>" suffix.
+        $message =~ s/^stat\(.*\): (.*)\s+at .*? line \d+\s*\z/$1/;
+        runtime_error( "Could not find real path for \"$orig_path\"" . ( $message ne "" ? ": $message" : "" ) );
+    }; # if
+    return $real_path;
+
+}; # sub real_path
+
+# -------------------------------------------------------------------------------------------------
+
+=item C<make_dir( $dir, @options )>
+
+Make a directory.
+
+This function makes a directory. If necessary, more than one level can be created.
+If the directory already exists, nothing is done. If directory creation fails or C<$dir> exists
+but it is not a directory, an error is issued.
+
+Options:
+
+=over
+
+=item C<mode>
+
+The numeric mode for new directories, 0777 by default (it is passed to C<File::Path::mkpath>).
+
+=back
+
+=cut
+
+# Creates directory $dir by means of File::Path::mkpath. An existing directory is silently
+# accepted. Any failure is reported through runtime_error().
+sub make_dir($@) {
+
+    my $dir    = shift( @_ );
+    # Options are passed as name => value pairs and checked by validate().
+    # NOTE(review): option names here are `parents' and `mode' (no leading dash), and default mode
+    # is 0777. `parents' is accepted, but its value is not used in this subroutine.
+    my %opts   =
+        validate(
+            params => \@_,
+            spec => {
+                parents => { type => "boolean", default => 1    },
+                mode    => { type => "scalar",  default => 0777 },
+            },
+        );
+
+    my $prefix = "Could not create directory \"$dir\"";
+
+    if ( -e $dir ) {
+        if ( -d $dir ) {
+            # Directory already exists, nothing to do.
+        } else {
+            runtime_error( "$prefix: it exists, but not a directory." );
+        }; # if
+    } else {
+        # mkpath reports errors by dying, so call it inside eval to issue our own error message.
+        eval {
+            File::Path::mkpath( $dir, 0, $opts{ mode } );
+        }; # eval
+        if ( $@ ) {
+            # Strip " at .../tools.pm line N" suffix from the message.
+            $@ =~ s{\s+at (?:[a-zA-Z0-9 /_.]*/)?tools\.pm line \d+\s*}{};
+            runtime_error( "$prefix: $@" );
+        }; # if
+        if ( not -d $dir ) { # Just in case, check it one more time...
+            runtime_error( "$prefix." );
+        }; # if
+    }; # if
+
+}; # sub make_dir
+
+# -------------------------------------------------------------------------------------------------
+
+=item C<copy_dir( $src_dir, $dst_dir, @options )>
+
+Copy directory recursively.
+
+This function copies a directory recursively.
+If source directory does not exist or not a directory, error issues.
+
+Options:
+
+=over
+
+=item C<-overwrite>
+
+Overwrite destination directory, if it exists.
+
+=back
+
+=cut
+
+sub copy_dir($$@) {
+
+    my ( $src, $dst, %opts ) = @_;
+    my $prefix = "Could not copy directory \"$src\" to \"$dst\"";
+
+    # Source must be an existing directory.
+    runtime_error( "$prefix: \"$src\" does not exist." )     if not -e $src;
+    runtime_error( "$prefix: \"$src\" is not a directory." ) if not -d $src;
+
+    # Existing destination must be a directory, and it is deleted only if overwriting is allowed.
+    if ( -e $dst ) {
+        if ( not -d $dst ) {
+            runtime_error( "$prefix: \"$dst\" is not a directory." );
+        } elsif ( $opts{ -overwrite } ) {
+            del_dir( $dst );
+        } else {
+            runtime_error( "$prefix: \"$dst\" already exists." );
+        }; # if
+    }; # if
+
+    # Actual copying is performed by external `cp' command.
+    execute( [ "cp", "-R", $src, $dst ] );
+
+}; # sub copy_dir
+
+# -------------------------------------------------------------------------------------------------
+
+=item C<move_dir( $src_dir, $dst_dir, @options )>
+
+Move directory.
+
+Options:
+
+=over
+
+=item C<-overwrite>
+
+Overwrite destination directory, if it exists.
+
+=back
+
+=cut
+
+sub move_dir($$@) {
+
+    my $src  = shift( @_ );
+    my $dst  = shift( @_ );
+    my %opts = @_;
+    # Error messages must talk about moving, not about copying.
+    my $prefix = "Could not move directory \"$src\" to \"$dst\"";
+
+    # Source must be an existing directory.
+    if ( not -e $src ) {
+        runtime_error( "$prefix: \"$src\" does not exist." );
+    }; # if
+    if ( not -d $src ) {
+        runtime_error( "$prefix: \"$src\" is not a directory." );
+    }; # if
+    # Existing destination must be a directory, and it is deleted only if overwriting is allowed.
+    if ( -e $dst ) {
+        if ( -d $dst ) {
+            if ( $opts{ -overwrite } ) {
+                del_dir( $dst );
+            } else {
+                runtime_error( "$prefix: \"$dst\" already exists." );
+            }; # if
+        } else {
+            runtime_error( "$prefix: \"$dst\" is not a directory." );
+        }; # if
+    }; # if
+
+    # Actual moving is performed by external `mv' command.
+    execute( [ "mv", $src, $dst ] );
+
+}; # sub move_dir
+
+# -------------------------------------------------------------------------------------------------
+
+=item C<clean_dir( $dir, @options )>
+
+Clean a directory: delete all the entries (recursively), but leave the directory.
+
+Options:
+
+=over
+
+=item C<force> =E<gt> I<bool>
+
+If a directory is not writable, try to change permissions first, then clean it.
+
+=item C<skip> =E<gt> I<regexp>
+
+Regexp. If a directory entry matches the regexp, it is skipped, not deleted. (As a consequence,
+a directory containing skipped entries is not deleted.)
+
+=back
+
+=cut
+
+# Forward declaration: the subroutine calls itself recursively, the prototype must be known.
+sub _clean_dir($);
+
+# Recursive worker for clean_dir() and del_dir(): deletes content of directory $dir, but not the
+# directory itself. Options are read from package variable %_clean_dir_opts (`skip' and `force'),
+# which is set up by the callers. Returns the number of skipped entries (including entries skipped
+# in subdirectories). Any failure is reported through runtime_error().
+sub _clean_dir($) {
+    our %_clean_dir_opts;
+    my ( $dir ) = @_;
+    my $skip    = $_clean_dir_opts{ skip };    # Regexp.
+    my $skipped = 0;                           # Number of skipped files.
+    my $prefix  = "Cleaning `$dir' failed:";
+    my @stat    = stat( $dir );
+    my $mode    = $stat[ 2 ];
+    if ( not @stat ) {
+        runtime_error( $prefix, "Cannot stat `$dir': $!" );
+    }; # if
+    # File tests on `_' below reuse the result of the stat() call above.
+    if ( not -d _ ) {
+        runtime_error( $prefix, "It is not a directory." );
+    }; # if
+    if ( not -w _ ) {        # Directory is not writable.
+        if ( not -o _ or not $_clean_dir_opts{ force } ) {
+            runtime_error( $prefix, "Directory is not writable." );
+        }; # if
+        # Directory is not writable but mine. Try to change permissions.
+        chmod( $mode | S_IWUSR, $dir )
+            or runtime_error( $prefix, "Cannot make directory writable: $!" );
+    }; # if
+    # Read all the entries at once (except "." and ".."), then close the handle before deleting.
+    my $handle   = IO::Dir->new( $dir ) or runtime_error( $prefix, "Cannot read directory: $!" );
+    my @entries  = File::Spec->no_upwards( $handle->read() );
+    $handle->close() or runtime_error( $prefix, "Cannot read directory: $!" );
+    foreach my $entry ( @entries ) {
+        my $path = cat_file( $dir, $entry );
+        # Note: regexp is matched against entry name, not against full path.
+        if ( defined( $skip ) and $entry =~ $skip ) {
+            ++ $skipped;
+        } else {
+            if ( -l $path ) {
+                # Symlink is deleted itself, it is never followed.
+                unlink( $path ) or runtime_error( $prefix, "Cannot delete symlink `$path': $!" );
+            } else {
+                # File tests on `_' below reuse the result of this stat() call.
+                stat( $path ) or runtime_error( $prefix, "Cannot stat `$path': $! " );
+                if ( -f _ ) {
+                    del_file( $path );
+                } elsif ( -d _ ) {
+                    my $rc = _clean_dir( $path );
+                    # Subdirectory is deleted only if nothing was skipped inside it.
+                    if ( $rc == 0 ) {
+                        rmdir( $path ) or runtime_error( $prefix, "Cannot delete directory `$path': $!" );
+                    }; # if
+                    $skipped += $rc;
+                } else {
+                    runtime_error( $prefix, "`$path' is neither a file nor a directory." );
+                }; # if
+            }; # if
+        }; # if
+    }; # foreach
+    return $skipped;
+}; # sub _clean_dir
+
+
+sub clean_dir($@) {
+    my $dir = shift( @_ );
+    # Options are passed to recursive _clean_dir() through the package variable; `local' restores
+    # the previous value when this subroutine exits.
+    our %_clean_dir_opts;
+    local %_clean_dir_opts =
+        validate(
+            params => \@_,
+            spec   => {
+                skip  => { type => "regexpref" },
+                force => { type => "boolean"   },
+            },
+        );
+    # Result is the number of skipped entries.
+    return scalar( _clean_dir( $dir ) );
+}; # sub clean_dir
+
+
+# -------------------------------------------------------------------------------------------------
+
+=item C<del_dir( $dir, @options )>
+
+Delete a directory recursively.
+
+This function deletes a directory. If the directory can not be deleted or it is not a directory,
+an error message is issued (and the script exits).
+
+Options:
+
+=over
+
+=back
+
+=cut
+
+sub del_dir($@) {
+
+    my $dir  = shift( @_ );
+    my $prefix = "Deleting directory \"$dir\" failed";
+    # Options are passed to _clean_dir() through the package variable; `local' restores the
+    # previous value when this subroutine exits.
+    our %_clean_dir_opts;
+    local %_clean_dir_opts =
+        validate(
+            params => \@_,
+            spec => {
+                force => { type => "boolean" },
+            },
+        );
+
+    if ( not -e $dir ) {
+        # Nothing to do.
+        return;
+    }; # if
+    if ( not -d $dir ) {
+        runtime_error( "$prefix: it is not a directory." );
+    }; # if
+    # Delete the content first, then the directory itself.
+    _clean_dir( $dir );
+    # Report the reason of failure as well.
+    rmdir( $dir ) or runtime_error( "$prefix: $!" );
+
+}; # sub del_dir
+
+# -------------------------------------------------------------------------------------------------
+
+=item C<change_dir( $dir )>
+
+Change current directory.
+
+If any error occurred, error issues and script exits.
+
+=cut
+
+sub change_dir($) {
+
+    my ( $dir ) = @_;
+
+    # Cwd::chdir is used rather than built-in chdir.
+    Cwd::chdir( $dir ) or runtime_error( "Could not chdir to \"$dir\": $!" );
+
+}; # sub change_dir
+
+
+# -------------------------------------------------------------------------------------------------
+
+=item C<copy_file( $src_file, $dst_file, @options )>
+
+Copy file.
+
+This function copies a file. If source does not exist or is not a file, error issues.
+
+Options:
+
+=over
+
+=item C<-overwrite>
+
+Overwrite destination file, if it exists.
+
+=back
+
+=cut
+
+sub copy_file($$@) {
+
+    my ( $src, $dst, %opts ) = @_;
+    my $prefix = "Could not copy file \"$src\" to \"$dst\"";
+
+    # Source must be an existing plain file.
+    runtime_error( "$prefix: \"$src\" does not exist." ) if not -e $src;
+    runtime_error( "$prefix: \"$src\" is not a file." )  if not -f $src;
+
+    # Existing destination must be a plain file, and it is deleted only if overwriting is allowed.
+    if ( -e $dst ) {
+        if ( not -f $dst ) {
+            runtime_error( "$prefix: \"$dst\" is not a file." );
+        } elsif ( $opts{ -overwrite } ) {
+            del_file( $dst );
+        } else {
+            runtime_error( "$prefix: \"$dst\" already exists." );
+        }; # if
+    }; # if
+
+    File::Copy::copy( $src, $dst ) or runtime_error( "$prefix: $!" );
+    # On Windows* OS File::Copy preserves file attributes, but on Linux* OS it doesn't.
+    # So we should do it manually...
+    if ( $^O =~ m/^linux\z/ ) {
+        my $mode = ( stat( $src ) )[ 2 ]
+            or runtime_error( "$prefix: cannot get status info for source file." );
+        chmod( $mode, $dst )
+            or runtime_error( "$prefix: cannot change mode of destination file." );
+    }; # if
+
+}; # sub copy_file
+
+# -------------------------------------------------------------------------------------------------
+
+# Moves file $src to $dst. The only option is `-overwrite': if it is true, existing destination
+# file does not cause an error. Any failure is reported through runtime_error().
+sub move_file($$@) {
+
+    my ( $src, $dst, %opts ) = @_;
+    my $prefix = "Could not move file \"$src\" to \"$dst\"";
+
+    check_opts( %opts, [ qw( -overwrite ) ] );
+
+    # Source must be an existing plain file.
+    runtime_error( "$prefix: \"$src\" does not exist." ) if not -e $src;
+    runtime_error( "$prefix: \"$src\" is not a file." )  if not -f $src;
+
+    # Existing destination must be a plain file, and overwriting must be allowed explicitly.
+    if ( -e $dst ) {
+        if ( not -f $dst ) {
+            runtime_error( "$prefix: \"$dst\" is not a file." );
+        }; # if
+        if ( not $opts{ -overwrite } ) {
+            runtime_error( "$prefix: \"$dst\" already exists." );
+        }; # if
+    }; # if
+
+    File::Copy::move( $src, $dst ) or runtime_error( "$prefix: $!" );
+
+}; # sub move_file
+
+# -------------------------------------------------------------------------------------------------
+
+# Deletes a file or files. Argument is either a file name or a reference to an array of file names.
+# Missing files are not considered as errors; any other failure is reported through
+# runtime_error().
+sub del_file($) {
+    my ( $files ) = @_;
+    my @names = ( ref( $files ) eq "" ? ( $files ) : @$files );
+    foreach my $name ( @names ) {
+        debug( "Deleting file `$name'..." );
+        next if unlink( $name ) != 0;
+        # Ignore ENOENT, because the goal is achieved; report any other error.
+        next if $! == ENOENT;
+        runtime_error( "Deleting file `$name' failed: $!" );
+    }; # foreach $name
+}; # sub del_file
+
+# -------------------------------------------------------------------------------------------------
+
+=back
+
+=cut
+
+# =================================================================================================
+# File I/O subroutines.
+# =================================================================================================
+
+=head2 File I/O subroutines.
+
+=cut
+
+#--------------------------------------------------------------------------------------------------
+
+=head3 read_file
+
+B<Synopsis:>
+
+    read_file( $file, @options )
+
+B<Description:>
+
+Read file and return its content. In scalar context function returns a scalar, in list context
+function returns list of lines.
+
+Note: If the last line of the file does not terminate with a newline, the function will append it.
+
+B<Arguments:>
+
+=over
+
+=item B<$file>
+
+A name or handle of file to read from.
+
+=back
+
+B<Options:>
+
+=over
+
+=item B<-binary>
+
+If true, file treats as a binary file: no newline conversion, no truncating trailing space, no
+newline removing performed. Entire file returned as a scalar.
+
+=item B<-bulk>
+
+This option is allowed only in binary mode. Option's value should be a reference to a scalar.
+If option present, file content placed to pointee scalar and function returns true (1).
+
+=item B<-chomp>
+
+If true, newline characters are removed from file content. By default newline characters remain.
+This option is not applicable in binary mode.
+
+=item B<-keep_trailing_space>
+
+If true, trailing spaces remain at the ends of lines. By default all trailing spaces are removed.
+This option is not applicable in binary mode.
+
+=back
+
+B<Examples:>
+
+Return file as single line, remove trailing spaces.
+
+    my $bulk = read_file( "message.txt" );
+
+Return file as list of lines with removed trailing space and
+newline characters.
+
+    my @bulk = read_file( "message.txt", -chomp => 1 );
+
+Read a binary file:
+
+    my $bulk = read_file( "message.txt", -binary => 1 );
+
+Read a big binary file:
+
+    my $bulk;
+    read_file( "big_binary_file", -binary => 1, -bulk => \$bulk );
+
+Read from standard input:
+
+    my @bulk = read_file( \*STDIN );
+
+=cut
+
+sub read_file($@) {
+
+    # Read a file given by name or by handle and return its content: a single scalar in binary
+    # mode or in scalar context, a list of lines otherwise. Failures to open or close the file
+    # are reported through the handler selected by the -error option (runtime_error by default).
+
+    my $file = shift( @_ );  # The name or handle of file to read from.
+    my %opts = @_;           # Options.
+
+    my $name;                       # File name used in error messages.
+    my $handle;                     # Handle to read from.
+    my @bulk;                       # File content.
+    my $error = \&runtime_error;    # Error handler, may be replaced according to -error option.
+
+    my @binopts = qw( -binary -error -bulk );                       # Options available in binary mode.
+    my @txtopts = qw( -binary -error -keep_trailing_space -chomp -layer ); # Options available in text (non-binary) mode.
+    check_opts( %opts, [ @binopts, @txtopts ] );
+    if ( $opts{ -binary } ) {
+        check_opts( %opts, [ @binopts ], "these options cannot be used with -binary" );
+    } else {
+        check_opts( %opts, [ @txtopts ], "these options cannot be used without -binary" );
+    }; # if
+
+    # Select the error handler: "warning" downgrades errors to warnings, "ignore" drops them,
+    # a reference to an array collects error messages into that array.
+    if ( not exists( $opts{ -error } ) ) {
+        $opts{ -error } = "error";
+    }; # if
+    if ( $opts{ -error } eq "warning" ) {
+        $error = \&warning;
+    } elsif( $opts{ -error } eq "ignore" ) {
+        $error = sub {};
+    } elsif ( ref( $opts{ -error } ) eq "ARRAY" ) {
+        $error = sub { push( @{ $opts{ -error } }, $_[ 0 ] ); };
+    }; # if
+
+    if ( ( ref( $file ) eq "GLOB" ) or UNIVERSAL::isa( $file, "IO::Handle" ) ) {
+        # Caller passed a handle, its name is not known.
+        $name = "unknown";
+        $handle = $file;
+    } else {
+        $name = $file;
+        # In text mode files with ".gz" extension are read through IO::Zlib.
+        if ( get_ext( $file ) eq ".gz" and not $opts{ -binary } ) {
+            $handle = IO::Zlib->new( $name, "rb" );
+        } else {
+            $handle = IO::File->new( $name, "r" );
+        }; # if
+        if ( not defined( $handle ) ) {
+            $error->( "File \"$name\" could not be opened for input: $!" );
+        }; # if
+    }; # if
+    if ( defined( $handle ) ) {
+        if ( $opts{ -binary } ) {
+            binmode( $handle );
+            local $/ = undef;   # Set input record separator to undef to read entire file as one line.
+            if ( exists( $opts{ -bulk } ) ) {
+                ${ $opts{ -bulk } } = $handle->getline();
+            } else {
+                $bulk[ 0 ] = $handle->getline();
+            }; # if
+        } else {
+            if ( defined( $opts{ -layer } ) ) {
+                binmode( $handle, $opts{ -layer } );
+            }; # if
+            @bulk = $handle->getlines();
+            # Special trick for UTF-8 files: Delete BOM, if any.
+            # An empty file gives no lines at all, so check there is a first line to look at.
+            if ( ( @bulk > 0 ) and defined( $opts{ -layer } ) and $opts{ -layer } eq ":utf8" ) {
+                if ( substr( $bulk[ 0 ], 0, 1 ) eq "\x{FEFF}" ) {
+                    substr( $bulk[ 0 ], 0, 1 ) = "";
+                }; # if
+            }; # if
+        }; # if
+        $handle->close()
+            or $error->( "File \"$name\" could not be closed after input: $!" );
+    } else {
+        # File was not opened (and error handler did not die): return empty content.
+        if ( $opts{ -binary } and exists( $opts{ -bulk } ) ) {
+            ${ $opts{ -bulk } } = "";
+        }; # if
+    }; # if
+    if ( $opts{ -binary } ) {
+        if ( exists( $opts{ -bulk } ) ) {
+            return 1;
+        } else {
+            return $bulk[ 0 ];
+        }; # if
+    } else {
+        # Make sure the last line is terminated with newline.
+        if ( ( @bulk > 0 ) and ( substr( $bulk[ -1 ], -1, 1 ) ne "\n" ) ) {
+            $bulk[ -1 ] .= "\n";
+        }; # if
+        if ( not $opts{ -keep_trailing_space } ) {
+            s/\s+\n\z/\n/ foreach @bulk;
+        }; # if
+        if ( $opts{ -chomp } ) {
+            chomp( @bulk );
+        }; # if
+        if ( wantarray() ) {
+            return @bulk;
+        } else {
+            return join( "", @bulk );
+        }; # if
+    }; # if
+
+}; # sub read_file
+
+#--------------------------------------------------------------------------------------------------
+
+=head3 write_file
+
+B<Synopsis:>
+
+    write_file( $file, $bulk, @options )
+
+B<Description:>
+
+Write file.
+
+B<Arguments:>
+
+=over
+
+=item B<$file>
+
+The name or handle of file to write to.
+
+=item B<$bulk>
+
+Bulk to write to a file. Can be a scalar, or a reference to scalar or an array.
+
+=back
+
+B<Options:>
+
+=over
+
+=item B<-backup>
+
+If true, create a backup copy of file overwritten. Backup copy is placed into the same directory.
+The name of backup copy is the same as the name of file with `~' appended. By default backup copy
+is not created.
+
+=item B<-append>
+
+If true, the text will be added to existing file.
+
+=back
+
+B<Examples:>
+
+    write_file( "message.txt", \$bulk );
+        # Write file, take content from a scalar.
+
+    write_file( "message.txt", \@bulk, -backup => 1 );
+        # Write file, take content from an array, create a backup copy.
+
+=cut
+
+sub write_file($$@) {
+
+    # Write $bulk (a scalar, or a reference to a scalar or to an array) to $file (a name or
+    # a handle). In non-binary mode every written piece is terminated with a newline.
+
+    my ( $file, $bulk, %opts ) = @_;
+
+    check_opts( %opts, [ qw( -append -backup -binary -layer ) ] );
+
+    my $binary = $opts{ -binary };
+    my ( $name, $handle );
+
+    if ( ( ref( $file ) eq "GLOB" ) or UNIVERSAL::isa( $file, "IO::Handle" ) ) {
+        # Caller passed a handle, its name is not known.
+        ( $name, $handle ) = ( "unknown", $file );
+    } else {
+        $name = $file;
+        # Keep a backup copy (with `~' appended to the name) of an existing file, if requested.
+        if ( $opts{ -backup } and ( -f $name ) ) {
+            copy_file( $name, $name . "~", -overwrite => 1 );
+        }; # if
+        $handle = IO::File->new( $name, ( $opts{ -append } ? "a" : "w" ) )
+            or runtime_error( "File \"$name\" could not be opened for output: $!" );
+    }; # if
+
+    if ( $binary ) {
+        binmode( $handle );
+    } elsif ( $opts{ -layer } ) {
+        binmode( $handle, $opts{ -layer } );
+    }; # if
+
+    # Bring all the accepted forms of $bulk to a plain list of pieces to write.
+    my $kind = ref( $bulk );
+    my @pieces;
+    if ( $kind eq "" ) {
+        @pieces = ( $bulk );
+    } elsif ( $kind eq "SCALAR" ) {
+        @pieces = ( $$bulk );
+    } elsif ( $kind eq "ARRAY" ) {
+        @pieces = @$bulk;
+    } else {
+        Carp::croak( "write_file: \$bulk must be a scalar or reference to (scalar or array)" );
+    }; # if
+
+    foreach my $piece ( @pieces ) {
+        next if not defined( $piece );    # Undefined pieces are silently skipped.
+        $handle->print( $piece );
+        if ( not $binary and ( substr( $piece, -1 ) ne "\n" ) ) {
+            $handle->print( "\n" );
+        }; # if
+    }; # foreach $piece
+
+    $handle->close()
+        or runtime_error( "File \"$name\" could not be closed after output: $!" );
+
+}; # sub write_file
+
+#--------------------------------------------------------------------------------------------------
+
+=cut
+
+# =================================================================================================
+# Execution subroutines.
+# =================================================================================================
+
+=head2 Execution subroutines.
+
+=over
+
+=cut
+
+#--------------------------------------------------------------------------------------------------
+
+sub _pre {
+
+    # Prepare redirection of one standard stream before running a child program.
+    #
+    # $arg is a reference to a hash describing the stream. Keys read: "mode", "handle", and
+    # (optionally) "redir". If "redir" key does not exist, nothing is done and 0 is returned.
+    # Otherwise the original handle is duped (to be restored later), the handle is re-opened
+    # onto the redirection target, and keys "save_handle", "temp_handle", and "temp_name" are
+    # stored into the hash.
+
+    my $arg = shift( @_ );
+
+    # If redirection is not required, exit.
+    if ( not exists( $arg->{ redir } ) ) {
+        return 0;
+    }; # if
+
+    # Input parameters.
+    my $mode   = $arg->{ mode   }; # Mode, "<" (input ) or ">" (output).
+    my $handle = $arg->{ handle }; # Handle to manipulate.
+    my $redir  = $arg->{ redir  }; # Data, a file name if a scalar, or file contents, if a reference.
+
+    # Output parameters.
+    my $save_handle;    # Duplicate of the original handle.
+    my $temp_handle;    # Handle of the redirection target.
+    my $temp_name;      # Name of the redirection target; stays undefined if $redir is a handle.
+
+    # Save original handle (by duping it).
+    $save_handle = Symbol::gensym();
+    $handle->flush();
+    open( $save_handle, $mode . "&" . $handle->fileno() )
+        or die( "Cannot dup filehandle: $!" );
+
+    # Prepare a file to IO.
+    if ( UNIVERSAL::isa( $redir, "IO::Handle" ) or ( ref( $redir ) eq "GLOB" ) ) {
+        # $redir is a reference to an object of IO::Handle class (or its descendant), or
+        # a reference to a glob. Use this handle as is.
+        $temp_handle = $redir;
+    } elsif ( ref( $redir ) ) {
+        # $redir is a reference to content to be read/written.
+        # Prepare temp file.
+        ( $temp_handle, $temp_name ) =
+            File::Temp::tempfile(
+                "$tool.XXXXXXXX",
+                DIR    => File::Spec->tmpdir(),
+                SUFFIX => ".tmp",
+                UNLINK => 1
+            );
+        if ( not defined( $temp_handle ) ) {
+            runtime_error( "Could not create temp file." );
+        }; # if
+        if ( $mode eq "<" ) {
+            # It is a file to be read by child, prepare file content to be read.
+            $temp_handle->print( ref( $redir ) eq "SCALAR" ? ${ $redir } : @{ $redir } );
+            $temp_handle->flush();
+            seek( $temp_handle, 0, 0 );
+                # Unfortunately, I could not use OO interface to seek.
+                # ActivePerl 5.6.1 complains on both forms:
+                #    $temp_handle->seek( 0 );    # As declared in IO::Seekable.
+                #    $temp_handle->setpos( 0 );  # As described in documentation.
+        } elsif ( $mode eq ">" ) {
+            # It is a file for output. Clear output variable.
+            # (For any other mode the variable is left untouched.)
+            if ( ref( $redir ) eq "SCALAR" ) {
+                ${ $redir } = "";
+            } else {
+                @{ $redir } = ();
+            }; # if
+        }; # if
+    } else {
+        # $redir is a name of file to be read/written.
+        # Just open file. Undefined name means the null device.
+        if ( defined( $redir ) ) {
+            $temp_name = $redir;
+        } else {
+            $temp_name = File::Spec->devnull();
+        }; # if
+        $temp_handle = IO::File->new( $temp_name, $mode )
+            or runtime_error( "file \"$temp_name\" could not be opened for " . ( $mode eq "<" ? "input" : "output" ) . ": $!" );
+    }; # if
+
+    # Redirect handle to temp file.
+    open( $handle, $mode . "&" . $temp_handle->fileno() )
+        or die( "Cannot dup filehandle: $!" );
+
+    # Save output parameters.
+    $arg->{ save_handle } = $save_handle;
+    $arg->{ temp_handle } = $temp_handle;
+    $arg->{ temp_name   } = $temp_name;
+
+}; # sub _pre
+
+
+sub _post {
+
+    # Undo the redirection of one standard stream after the child program has finished.
+    #
+    # $arg is a reference to the same hash which was passed to _pre(). If _pre() saved no
+    # handle (no redirection was requested), nothing is done and 0 is returned. Otherwise
+    # captured output (if any) is read back into the caller's variable, the original handle is
+    # restored, and keys "save_handle", "temp_handle", and "temp_name" are deleted.
+
+    my $arg = shift( @_ );
+
+    # Input parameters.
+    my $mode   = $arg->{ mode   }; # Mode, "<" or ">".
+    my $handle = $arg->{ handle }; # Handle to save and set.
+    my $redir  = $arg->{ redir  }; # Data, a file name if a scalar, or file contents, if a reference.
+
+    # Parameters saved during preprocessing.
+    my $save_handle = $arg->{ save_handle };
+    my $temp_handle = $arg->{ temp_handle };
+
+    # If no handle was saved, exit.
+    if ( not $save_handle ) {
+        return 0;
+    }; # if
+
+    # Close handle.
+    $handle->close()
+        or die( "$!" );
+
+    # If $redir is a handle (an IO::Handle object or a reference to a glob), _pre() did not open
+    # anything but just borrowed that handle. A borrowed handle belongs to somebody else, so it
+    # must be neither rewound nor closed here.
+    my $borrowed = ( UNIVERSAL::isa( $redir, "IO::Handle" ) or ( ref( $redir ) eq "GLOB" ) );
+
+    # Read the content of temp file, if necessary, and close temp file.
+    if ( ( $mode ne "<" ) and ref( $redir ) and not $borrowed ) {
+        $temp_handle->flush();
+        seek( $temp_handle, 0, 0 );
+        if ( $^O =~ m/MSWin/ ) {
+            binmode( $temp_handle, ":crlf" );
+        }; # if
+        if ( ref( $redir ) eq "SCALAR" ) {
+            ${ $redir } .= join( "", $temp_handle->getlines() );
+        } elsif ( ref( $redir ) eq "ARRAY" ) {
+            push( @{ $redir }, $temp_handle->getlines() );
+        }; # if
+    }; # if
+    if ( not $borrowed ) {
+        $temp_handle->close()
+            or die( "$!" );
+    }; # if
+
+    # Restore handle to original value.
+    $save_handle->flush();
+    open( $handle, $mode . "&" . $save_handle->fileno() )
+        or die( "Cannot dup filehandle: $!" );
+
+    # Close save handle.
+    $save_handle->close()
+        or die( "$!" );
+
+    # Delete parameters saved during preprocessing.
+    delete( $arg->{ save_handle } );
+    delete( $arg->{ temp_handle } );
+    delete( $arg->{ temp_name   } );
+
+}; # sub _post
+
+#--------------------------------------------------------------------------------------------------
+
+=item C<execute( [ @command ], @options )>
+
+Execute specified program or shell command.
+
+Program is specified by reference to an array, that array is passed to C<system()> function which
+executes the command. See L<perlfunc> for details how C<system()> interprets various forms of
+C<@command>.
+
+By default, in case of any error, an error message is issued and the script is terminated (by runtime_error()).
+Function returns an exit code of program.
+
+Alternatively, the function may return exit status of the program (see C<-ignore_status>) or signal
+(see C<-ignore_signal>) so caller may analyze it and continue execution.
+
+Options:
+
+=over
+
+=item C<-stdin>
+
+Redirect stdin of program. The value of option can be:
+
+=over
+
+=item C<undef>
+
+Stdin of child is attached to null device.
+
+=item a string
+
+Stdin of child is attached to a file with name specified by option.
+
+=item a reference to a scalar
+
+A dereferenced scalar is written to a temp file, and child's stdin is attached to that file.
+
+=item a reference to an array
+
+A dereferenced array is written to a temp file, and child's stdin is attached to that file.
+
+=back
+
+=item C<-stdout>
+
+Redirect stdout. Possible values are the same as for C<-stdin> option. The only difference is
+reference specifies a variable receiving program's output.
+
+=item C<-stderr>
+
+It is similar to C<-stdout>, but redirects stderr. There is only one additional value:
+
+=over
+
+=item an empty string
+
+means that stderr should be redirected to the same place where stdout is redirected to.
+
+=back
+
+=item C<-append>
+
+Redirected stream will not overwrite previous content of file (or variable).
+Note, that option affects both stdout and stderr.
+
+=item C<-ignore_status>
+
+By default, the subroutine raises an error and exits the script if the program returns a non-zero
+status. If this option is true, no error is raised. Instead, the status is returned as the function
+result (and $@ is set to the error message).
+
+=item C<-ignore_signal>
+
+By default, the subroutine raises an error and exits the script if the program dies with a signal.
+If this option is true, no error is raised in such a case. Instead, the signal number is returned
+(as a negative value), and the error message is placed to C<$@> variable.
+
+If command is not even started, -256 is returned.
+
+=back
+
+Examples:
+
+    execute( [ "cmd.exe", "/c", "dir" ] );
+        # Execute NT shell with specified options, no redirections are
+        # made.
+
+    my $output;
+    execute( [ "cvs", "-n", "-q", "update", "." ], -stdout => \$output );
+        # Execute "cvs -n -q update ." command, output is saved
+        # in $output variable.
+
+    my @output;
+    execute( [ qw( cvs -n -q update . ) ], -stdout => \@output, -stderr => undef );
+        # Execute specified command,  output is saved in @output
+        # variable, stderr stream is redirected to null device
+        # (/dev/null in Linux* OS and nul in Windows* OS).
+
+=cut
+
+sub execute($@) {
+
+    # Run a program with optional redirection of its standard streams.
+    #
+    # $command is a reference to an array passed to system(). Returns 0 on success, exit status
+    # of the program if it is non-zero, negated signal number if the program was killed by
+    # a signal, or -256 if system() failed to run the program. In the last three cases
+    # runtime_error() is called unless the corresponding -ignore_* option is true; error message
+    # is left in $@.
+
+    my $command = shift( @_ );
+    my %opts    = @_;
+
+    # Validate $command first: it is dereferenced below, and dereferencing a non-reference
+    # under "strict refs" would die with a far less helpful message.
+    if ( ref( $command ) ne "ARRAY" ) {
+        Carp::croak( "execute: \$command must be a reference to array" );
+    }; # if
+
+    check_opts( %opts, [ qw( -stdin -stdout -stderr -append -ignore_status -ignore_signal ) ] );
+
+    my $stdin  = { handle => \*STDIN,  mode => "<" };
+    my $stdout = { handle => \*STDOUT, mode => ">" };
+    my $stderr = { handle => \*STDERR, mode => ">" };
+    my $streams = {
+        stdin  => $stdin,
+        stdout => $stdout,
+        stderr => $stderr
+    }; # $streams
+
+    for my $stream ( qw( stdin stdout stderr ) ) {
+        if ( exists( $opts{ "-$stream" } ) ) {
+            if ( ref( $opts{ "-$stream" } ) !~ m/\A(|SCALAR|ARRAY)\z/ ) {
+                Carp::croak( "execute: -$stream option: must have value of scalar, or reference to (scalar or array)." );
+            }; # if
+            $streams->{ $stream }->{ redir } = $opts{ "-$stream" };
+        }; # if
+        # -append affects output streams only.
+        if ( $opts{ -append } and ( $streams->{ $stream }->{ mode } ) eq ">" ) {
+            $streams->{ $stream }->{ mode } = ">>";
+        }; # if
+    }; # foreach $stream
+
+    _pre( $stdin  );
+    _pre( $stdout );
+    # Empty string as -stderr value means "redirect stderr to the same place as stdout".
+    if ( defined( $stderr->{ redir } ) and not ref( $stderr->{ redir } ) and ( $stderr->{ redir } eq "" ) ) {
+        if ( exists( $stdout->{ redir } ) ) {
+            $stderr->{ redir } = $stdout->{ temp_handle };
+        } else {
+            # NOTE(review): this is the STDOUT glob itself, not a reference to it, while _pre()
+            # recognizes only IO::Handle objects and references to globs as handles. Confirm
+            # that this branch redirects stderr as intended.
+            $stderr->{ redir } = ${ $stdout->{ handle } };
+        }; # if
+    }; # if
+    _pre( $stderr );
+    my $rc = system( @$command );
+    # Save error indicators immediately, _post() may clobber them.
+    my $errno = $!;
+    my $child = $?;
+    _post( $stderr );
+    _post( $stdout );
+    _post( $stdin  );
+
+    my $exit = 0;
+    my $signal_num  = $child & 127;     # Low 7 bits of wait status: signal number.
+    my $exit_status = $child >> 8;      # High byte of wait status: exit code.
+    $@ = "";
+
+    if ( $rc == -1 ) {
+        # Program was not even started.
+        $@ = "\"$command->[ 0 ]\" failed: $errno";
+        $exit = -256;
+        if ( not $opts{ -ignore_signal } ) {
+            runtime_error( $@ );
+        }; # if
+    } elsif ( $signal_num != 0 ) {
+        $@ = "\"$command->[ 0 ]\" failed due to signal $signal_num.";
+        $exit = - $signal_num;
+        if ( not $opts{ -ignore_signal } ) {
+            runtime_error( $@ );
+        }; # if
+    } elsif ( $exit_status != 0 ) {
+        $@ = "\"$command->[ 0 ]\" returned non-zero status $exit_status.";
+        $exit = $exit_status;
+        if ( not $opts{ -ignore_status } ) {
+            runtime_error( $@ );
+        }; # if
+    }; # if
+
+    return $exit;
+
+}; # sub execute
+
+#--------------------------------------------------------------------------------------------------
+
+=item C<backticks( [ @command ], @options )>
+
+Run specified program or shell command and return output.
+
+In scalar context entire output is returned in a single string. In list context list of strings
+is returned. Function issues an error and exits script if any error occurs.
+
+=cut
+
+
+sub backticks($@) {
+
+    # Run $command by means of execute() and return everything it printed to stdout: a list of
+    # lines in list context, one joined string in scalar context.
+
+    my ( $command, %opts ) = @_;
+
+    check_opts( %opts, [ qw( -chomp ) ] );
+
+    my @lines;
+    execute( $command, -stdout => \@lines );
+    chomp( @lines ) if $opts{ -chomp };
+
+    if ( wantarray() ) {
+        return @lines;
+    }; # if
+    return join( "", @lines );
+
+}; # sub backticks
+
+#--------------------------------------------------------------------------------------------------
+
+sub pad($$$) {
+    # Pad $str on the right with repetitions of $pad up to $length characters. The last
+    # repetition of $pad is truncated, if necessary, so the result is exactly $length characters
+    # long. A string which is already long enough is returned as is. Empty $pad cannot extend
+    # the string, so in such a case the string is returned as is too.
+    my ( $str, $length, $pad ) = @_;
+    my $lstr = length( $str );    # Length of source string.
+    my $lpad = length( $pad );    # Length of pad.
+    # Check $lpad to avoid division by zero below.
+    if ( $lstr < $length and $lpad > 0 ) {
+        my $count = int( ( $length - $lstr ) / $lpad );     # Number of whole pad repetitions.
+        my $tail  = $length - ( $lstr + $lpad * $count );   # Length of the last, partial, pad.
+        $str = $str . ( $pad x $count ) . substr( $pad, 0, $tail );
+    }; # if
+    return $str;
+}; # sub pad
+
+# --------------------------------------------------------------------------------------------------
+
+=back
+
+=cut
+
+#--------------------------------------------------------------------------------------------------
+
+return 1;
+
+#--------------------------------------------------------------------------------------------------
+
+=cut
+
+# End of file.
diff --git a/final/runtime/tools/message-converter.pl b/final/runtime/tools/message-converter.pl
new file mode 100755
index 0000000..d72acf0
--- /dev/null
+++ b/final/runtime/tools/message-converter.pl
@@ -0,0 +1,774 @@
+#!/usr/bin/perl
+
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+use strict;
+use warnings;
+
+use File::Glob ":glob";
+use Encode qw{ encode };
+
+use FindBin;
+use lib "$FindBin::Bin/lib";
+
+use tools;
+
+our $VERSION = "0.04";
+my $escape      = qr{%};                      # Character which starts a placeholder.
+my $placeholder = qr{(\d)\$(s|l?[du])};       # Placeholder body: argument position and type.
+my $target_os;
+
+# Catalog sections, in order of appearance. Each section has short name (used in generated
+# identifiers), long name, set number (starting from 1), and base number (set number shifted
+# left by 16 bits).
+my @sections = qw{ meta strings formats messages hints };
+my $sections =
+    {
+        meta     => { short => "prp" }, # "prp" stands for "property".
+        strings  => { short => "str" },
+        formats  => { short => "fmt" },
+        messages => { short => "msg" },
+        hints    => { short => "hnt" },
+    };
+foreach my $idx ( 0 .. $#sections ) {
+    my $props = $sections->{ $sections[ $idx ] };
+    $props->{ long } = $sections[ $idx ];
+    $props->{ set  } = $idx + 1;
+    $props->{ base } = ( $idx + 1 ) << 16;
+}; # foreach $idx
+
+# Properties of Meta section.
+my @properties = qw{ Language Country LangId Version Revision };
+
+
+sub _generate_comment($$$) {
+
+    # Build a two-line "do not edit" banner for a generated file. Each line is enclosed into
+    # $open and $close, comment delimiters of the generated file's language.
+
+    my ( $data, $open, $close ) = @_;
+    my $source = get_file( $data->{ "%meta" }->{ source } );
+    my $stamp  = localtime();    # Scalar context: human-readable date and time.
+    return
+        "$open Do not edit this file! $close\n" .
+        "$open The file was generated from $source by $tool on $stamp. $close\n";
+
+}; # sub _generate_comment
+
+
+sub msg2sgn($) {
+
+    # Convert message string to signature. Signature is a list of placeholders in sorted order.
+    # For example, signature of "%1$s value \"%2$s\" is invalid." is "%1$s %2$s".
+    # A position which is not used by the message is shown with "-" instead of type.
+
+    my ( $msg ) = @_;
+    my @slots;    # Placeholders indexed by argument position (minus one).
+    pos( $msg ) = 0;
+    while ( $msg =~ m{\G.*?$escape$placeholder}g ) {
+        my ( $position, $type ) = ( $1, $2 );
+        $slots[ $position - 1 ] = "%" . $position . "\$" . $type;
+    }; # while
+    foreach my $idx ( 0 .. $#slots ) {
+        next if defined( $slots[ $idx ] );
+        $slots[ $idx ] = "%" . ( $idx + 1 ) . "\$-";
+    }; # foreach $idx
+    return join( " ", @slots );
+
+}; # sub msg2sgn
+
+
+sub msg2src($) {
+
+    # Convert message string to a C string constant. For "win" target OS placeholders are
+    # rewritten from "%N$type" form to "%N!type!" form; for other targets message is returned
+    # as is.
+
+    my $text = shift( @_ );
+    $text =~ s{$escape$placeholder}{\%$1!$2!}g if $target_os eq "win";
+    return $text;
+
+}; # sub msg2src
+
+
+# Backslash sequences which stand for special characters. Any other character following
+# a backslash stands for itself.
+my $special = { "n" => "\n", "t" => "\t" };
+
+sub msg2mc($) {
+    # Convert message string for a message compiler source: get windows style placeholders,
+    # then replace backslash sequences with characters they denote.
+    my $text = msg2src( $_[ 0 ] );
+    $text =~ s{\\(.)}{
+        my $char = $1;
+        exists( $special->{ $char } ) ? $special->{ $char } : $char
+    }ge;
+    return $text;
+}; # sub msg2mc
+
+
+
+sub parse_message($) {
+
+    # Check all the %-sequences of the message. Returns undef if every escape character is
+    # followed by a well-formed placeholder, or an error message quoting the offending fragment
+    # otherwise. Argument positions are counted from 1, so position 0 is rejected as well.
+
+    my ( $msg ) = @_;
+    pos( $msg ) = 0;
+    # Skip to the next escape character; stop when there are no more of them.
+    while ( $msg =~ m{\G.*?$escape}gc ) {
+        my $start = pos( $msg );    # Offset of the text right after the escape character.
+        # Mind the order: $1 is looked at only if the placeholder has matched.
+        if ( $msg !~ m{\G$placeholder}gc or $1 eq "0" ) {
+            return "Bad %-sequence near \"%" . substr( $msg, $start, 7 ) . "\"";
+        }; # if
+    }; # while
+    return undef;
+
+}; # sub parse_message
+
+
+sub parse_source($) {
+
+    # Parse message catalog source file $name. Any error in the file is reported by means of
+    # runtime_error(). Returns a reference to a hash:
+    #     * For every section (see @sections) there is a key with the section's name, its value
+    #       is a reference to an array of [ identifier, message ] pairs in order of appearance.
+    #       For "meta" section the pairs are [ property, value ], in order of @properties.
+    #     * Key "%meta" refers to a hash of meta properties plus "source", the name of the file.
+
+    my ( $name ) = @_;
+
+    my @bulk = read_file( $name, -layer => ":utf8" );
+    my $data = {};
+
+    my $n = 0;         # Line number.
+    my $obsolete = 0;  # Counter of obsolete entries.
+    my $last_idx;      # Index of the message a continuation line is appended to, if any.
+    my %idents;        # Identifiers already defined in the current section.
+    my %seen;          # Sections which have already been started.
+    my $section;       # Name of the current section.
+
+    my $error =
+        sub {
+            my ( $n, $line, $msg ) = @_;
+            runtime_error( "Error parsing $name line $n: " . "$msg:\n" . "    $line" );
+        }; # sub
+
+    foreach my $line ( @bulk ) {
+        ++ $n;
+        # Skip empty lines and comments.
+        if ( $line =~ m{\A\s*(\n|#)} ) {
+            $last_idx = undef;
+            next;
+        }; # if
+        # Parse section header.
+        if ( $line =~ m{\A-\*-\s*([A-Z_]*)\s*-\*-\s*\n\z}i ) {
+            $section = ( lc( $1 ) );
+            if ( not grep( $section eq $_, @sections ) ) {
+                $error->( $n, $line, "Unknown section \"$section\" specified" );
+            }; # if
+            # Track section headers themselves: a section may have no messages at all, and
+            # properties of meta section are not stored under the section's name.
+            if ( $seen{ $section } ) {
+                $error->( $n, $line, "Multiple sections of the same type specified" );
+            }; # if
+            $seen{ $section } = 1;
+            %idents = ();     # Clean list of known message identifiers.
+            # A continuation line must not stick to a message of the previous section.
+            $last_idx = undef;
+            next;
+        }; # if
+        if ( not defined( $section ) ) {
+            $error->( $n, $line, "Section heading expected" );
+        }; # if
+        # Parse section body.
+        if ( $section eq "meta" ) {
+            if ( $line =~ m{\A([A-Z_][A-Z_0-9]*)\s+"(.*)"\s*?\n?\z}i ) {
+                # Parse meta properties (such as Language, Country, and LangId).
+                my ( $property, $value ) = ( $1, $2 );
+                if ( not grep( $_ eq $property , @properties ) ) {
+                    $error->( $n, $line, "Unknown property \"$property\" specified" );
+                }; # if
+                if ( exists( $data->{ "%meta" }->{ $property } ) ) {
+                    $error->( $n, $line, "Property \"$property\" has already been specified" );
+                }; # if
+                $data->{ "%meta" }->{ $property } = $value;
+                $last_idx = undef;
+                next;
+            }; # if
+            $error->( $n, $line, "Property line expected" );
+        }; # if
+        # Parse message.
+        if ( $line =~ m{\A([A-Z_][A-Z_0-9]*)\s+"(.*)"\s*?\n?\z}i ) {
+            my ( $ident, $message ) = ( $1, $2 );
+            if ( $ident eq "OBSOLETE" ) {
+                # If id is "OBSOLETE", add a unique suffix. It provides convenient way to mark
+                # obsolete messages.
+                ++ $obsolete;
+                $ident .= $obsolete;
+            }; # if
+            if ( exists( $idents{ $ident } ) ) {
+                $error->( $n, $line, "Identifier \"$ident\" is redefined" );
+            }; # if
+            # Check %-sequences.
+            my $err = parse_message( $message );
+            if ( $err ) {
+                $error->( $n, $line, $err );
+            }; # if
+            # Save message.
+            push( @{ $data->{ $section } }, [ $ident, $message ] );
+            $idents{ $ident } = 1;
+            $last_idx = @{ $data->{ $section } } - 1;
+            next;
+        }; # if
+        # Parse continuation line.
+        if ( $line =~ m{\A\s*"(.*)"\s*\z} ) {
+            my $message = $1;
+            if ( not defined( $last_idx )  ) {
+                $error->( $n, $line, "Unexpected continuation line" );
+            }; # if
+            # Check %-sequences.
+            my $err = parse_message( $message );
+            if ( $err ) {
+                $error->( $n, $line, $err );
+            }; # if
+            # Save continuation.
+            $data->{ $section }->[ $last_idx ]->[ 1 ] .= $message;
+            next;
+        }; # if
+        $error->( $n, $line, "Message definition expected" );
+    }; # foreach
+    $data->{ "%meta" }->{ source } = $name;
+    # Make sure every section exists, even if it is empty.
+    foreach my $section ( @sections ) {
+        if ( not exists( $data->{ $section } ) ) {
+            $data->{ $section } = [];
+        }; # if
+    }; # foreach $section
+
+    # All the properties are required. Save them as messages of meta section.
+    foreach my $property ( @properties ) {
+        if ( not defined( $data->{ "%meta" }->{ $property } ) ) {
+            runtime_error(
+                "Error parsing $name: " .
+                    "Required \"$property\" property is not specified"
+            );
+        }; # if
+        push( @{ $data->{ meta } }, [ $property, $data->{ "%meta" }->{ $property } ] );
+    }; # foreach
+
+    return $data;
+
+}; # sub parse_source
+
+
+sub generate_enum($$$) {
+
+    # Generate a file with C enum: one enumerator per catalog message, named with $prefix and
+    # short name of section. Every section starts with its base number.
+
+    my ( $data, $file, $prefix ) = @_;
+
+    my $banner = _generate_comment( $data, "//", "//" );
+    my @out =
+        (
+            $banner,
+            "\n",
+            "enum ${prefix}_id {\n\n",
+            "    // A special id for absence of message.\n",
+            "    ${prefix}_null = 0,\n\n",
+        );
+
+    foreach my $section ( @sections ) {
+        my $props = $sections->{ $section };            # Section properties.
+        my $stem  = $prefix . "_" . $props->{ short };  # Common prefix of section's enumerators.
+        push( @out, "    // Set #$props->{ set }, $props->{ long }.\n" );
+        push( @out, "    ${stem}_first = $props->{ base },\n" );
+        foreach my $item ( @{ $data->{ $section } } ) {
+            push( @out, "    ${stem}_" . $item->[ 0 ] . ",\n" );
+        }; # foreach $item
+        push( @out, "    ${stem}_last,\n\n" );
+    }; # foreach $section
+
+    push(
+        @out,
+        "    ${prefix}_xxx_lastest\n\n",
+        "}; // enum ${prefix}_id\n",
+        "\n",
+        "typedef enum ${prefix}_id  ${prefix}_id_t;\n",
+        "\n",
+        "\n",
+        "// end of file //\n"
+    );
+
+    my $bulk = join( "", @out );
+    write_file( $file, \$bulk );
+
+}; # sub generate_enum
+
+
+sub generate_signature($$) {
+
+    # Generate catalog signature file: for every message, its identifier and signature
+    # (see msg2sgn), grouped by sections.
+
+    my ( $data, $file ) = @_;
+    my $bulk = "// message catalog signature file //\n\n";
+
+    foreach my $section ( @sections ) {
+        my $title = uc( $sections->{ $section }->{ long } );
+        $bulk .= "-*- " . $title . "-*-\n\n";
+        foreach my $item ( @{ $data->{ $section } } ) {
+            $bulk .= sprintf( "%-40s %s\n", $item->[ 0 ], msg2sgn( $item->[ 1 ] ) );
+        }; # foreach $item
+        $bulk .= "\n";
+    }; # foreach $section
+
+    $bulk .= "// end of file //\n";
+
+    write_file( $file, \$bulk );
+
+}; # sub generate_signature
+
+
+sub generate_default($$$) {
+
+    # Generate a file with C definitions of the default (built-in) catalog: an array of strings
+    # per section, an array of sections, and a table referring to the array of sections.
+    # Names of the arrays are built with $prefix.
+
+    my ( $data, $file, $prefix ) = @_;
+    my $bulk = "";
+
+    $bulk .=
+        _generate_comment( $data, "//", "//" ) .
+        "\n";
+
+    # Arrays of strings, one per section. Both the first and the last elements are NULL.
+    foreach my $section ( @sections ) {
+        $bulk .=
+            "static char const *\n" .
+            "__${prefix}_default_${section}" . "[] =\n" .
+            "    {\n" .
+            "        NULL,\n";
+        foreach my $item ( @{ $data->{ $section } } ) {
+            my ( undef, $msg ) = @$item;
+            $bulk .= "        \"" . msg2src( $msg ) . "\",\n";
+        }; # foreach $item
+        $bulk .=
+            "        NULL\n" .
+            "    };\n" .
+            "\n";
+    }; # foreach $section
+
+    # Array of sections: number of messages and array of strings for every section.
+    $bulk .=
+        "struct kmp_i18n_section {\n" .
+        "    int           size;\n" .
+        "    char const ** str;\n" .
+        "}; // struct kmp_i18n_section\n" .
+        "typedef struct kmp_i18n_section  kmp_i18n_section_t;\n" .
+        "\n" .
+        "static kmp_i18n_section_t\n" .
+        "__${prefix}_sections[] =\n" .
+        "    {\n" .
+        "        { 0, NULL },\n";
+    foreach my $section ( @sections ) {
+        $bulk .=
+            "        { " . @{ $data->{ $section } } . ", __${prefix}_default_${section} },\n";
+    }; # foreach $section
+    $bulk .=
+        "        { 0, NULL }\n" .
+        "    };\n" .
+        "\n";
+
+    # The table. It must refer to the array of sections by the very name the array was declared
+    # with above, otherwise generated code is not compilable for any prefix but "kmp_i18n".
+    $bulk .=
+        "struct kmp_i18n_table {\n" .
+        "    int                   size;\n" .
+        "    kmp_i18n_section_t *  sect;\n" .
+        "}; // struct kmp_i18n_table\n" .
+        "typedef struct kmp_i18n_table  kmp_i18n_table_t;\n" .
+        "\n" .
+        "static kmp_i18n_table_t __kmp_i18n_default_table =\n" .
+        "    {\n" .
+        "        " . @sections . ",\n" .
+        "        __${prefix}_sections\n" .
+        "    };\n" .
+        "\n" .
+        "// end of file //\n";
+
+    write_file( $file, \$bulk );
+
+}; # sub generate_default
+
+
+# Generate a message file for the Unix-style message compiler: a "$quote" directive, then one
+# "$set" block per section with messages numbered from 1. The file is written with the :utf8 layer.
+#     $data -- parsed catalog; $data->{ $section } is a list of [ identifier, message ] items.
+#     $file -- name of the file to write.
+sub generate_message_unix($$) {
+
+    my ( $data, $file ) = @_;
+    my $rule = "\$ " . ( "-" x 78 ) . "\n";     # Comment line separating sections.
+
+    my $text =
+        _generate_comment( $data, "\$", "\$" ) . "\n" .
+        "\$quote \"\n\n";
+
+    foreach my $section ( @sections ) {
+        my $set = $sections->{ $section }->{ set };
+        $text .= $rule . "\$ $section\n" . $rule . "\n";
+        $text .= "\$set $set\n" . "\n";
+        my $number = 1;                         # Message numbers are 1-based within a set.
+        foreach my $item ( @{ $data->{ $section } } ) {
+            $text .= $number . " \"" . msg2src( $item->[ 1 ] ) . "\"\n";
+            ++ $number;
+        }; # foreach $item
+        $text .= "\n";
+    }; # foreach $section
+
+    $text .= "\n" . "\$ end of file \$\n";
+
+    write_file( $file, \$text, -layer => ":utf8" );
+
+}; # sub generate_message_unix
+
+
+# Generate a message file for the Windows* OS message compiler: language and facility
+# declarations followed by one MessageId entry per message. The text is encoded to UTF-16LE and
+# written in binary mode.
+#     $data -- parsed catalog; language name and id are taken from $data->{ "%meta" }.
+#     $file -- name of the file to write.
+sub generate_message_windows($$) {
+
+    my ( $data, $file ) = @_;
+    my $language = $data->{ "%meta" }->{ Language };
+    my $langid   = $data->{ "%meta" }->{ LangId };
+
+    # Header: comment and language declaration.
+    my $text =
+        _generate_comment( $data, ";", ";" ) . "\n" .
+        "LanguageNames = ($language=$langid:msg_$langid)\n" .
+        "\n";
+
+    # Facilities: short section name mapped to the section set number.
+    $text .= "FacilityNames=(\n";
+    foreach my $section ( @sections ) {
+        my ( $short, $set ) = @{ $sections->{ $section } }{ qw( short set ) };
+        $text .= " $short=$set\n";
+    }; # foreach $section
+    $text .= ")\n\n";
+
+    # Messages, numbered from 1 within each facility.
+    foreach my $section ( @sections ) {
+        my $facility = $sections->{ $section }->{ short };
+        my $number   = 1;
+        foreach my $item ( @{ $data->{ $section } } ) {
+            my $body = msg2mc( $item->[ 1 ] );
+            $text .= join( "\n",
+                "MessageId=$number",
+                "Facility=$facility",
+                "Language=$language",
+                $body,
+                ".",
+                "",
+                ""
+            );
+            ++ $number;
+        }; # foreach $item
+    }; # foreach $section
+
+    $text .= "\n" . "; end of file ;\n";
+
+    my $bytes = encode( "UTF-16LE", $text ); # Convert text to UTF-16LE used in Windows* OS.
+    write_file( $file, \$bytes, -binary => 1 );
+
+}; # sub generate_message_windows
+
+
+#
+# Parse command line.
+#
+
+# Names of the input and output files; an output file stays undefined unless its option is given.
+my ( $input_file, $enum_file, $signature_file, $default_file, $message_file );
+my $id;                 # Defined if --id/--lang-id is given: print language id of the catalog.
+my $prefix = "";        # Prefix for generated C identifiers.
+get_options(
+    "os=s"             => \$target_os,
+    "enum-file=s"      => \$enum_file,
+    "signature-file=s" => \$signature_file,
+    "default-file=s"   => \$default_file,
+    "message-file=s"   => \$message_file,
+    "id|lang-id"       => \$id,
+    "prefix=s"         => \$prefix,
+);
+# Exactly one source file must remain on the command line after option parsing.
+cmdline_error( "No source file specified -- nothing to do" ) if @ARGV == 0;
+cmdline_error( "Too many source files specified" )           if @ARGV > 1;
+( $input_file ) = @ARGV;
+
+
+# Message file generator for each supported target OS.
+my %message_generators = (
+    lin => \&generate_message_unix,
+    mac => \&generate_message_unix,
+    win => \&generate_message_windows,
+);
+my $generate_message = $message_generators{ $target_os };
+if ( not defined( $generate_message ) ) {
+    runtime_error( "OS \"$target_os\" is not supported" );
+}; # if
+
+
+#
+# Do the work.
+#
+
+# Parse the catalog once, then produce every output requested on the command line.
+my $data = parse_source( $input_file );
+print( $data->{ "%meta" }->{ LangId }, "\n" )       if defined( $id );
+generate_enum( $data, $enum_file, $prefix )         if defined( $enum_file );
+generate_signature( $data, $signature_file )        if defined( $signature_file );
+generate_default( $data, $default_file, $prefix )   if defined( $default_file );
+$generate_message->( $data, $message_file )         if defined( $message_file );
+
+exit( 0 );
+
+__END__
+
+=pod
+
+=head1 NAME
+
+B<message-converter.pl> -- Convert a message catalog source file into other text forms.
+
+=head1 SYNOPSIS
+
+B<message-converter.pl> I<option>... <file>
+
+=head1 OPTIONS
+
+=over
+
+=item B<--enum-file=>I<file>
+
+Generate enum file named I<file>.
+
+=item B<--default-file=>I<file>
+
+Generate default messages file named I<file>.
+
+=item B<--lang-id>
+
+Print language identifier of the message catalog source file.
+
+=item B<--message-file=>I<file>
+
+Generate message file.
+
+=item B<--signature-file=>I<file>
+
+Generate signature file.
+
+Signatures are used for checking compatibility. For example, to check a primary
+catalog and its translation to another language, signatures of both catalogs should be generated
+and compared. If signatures are identical, catalogs are compatible.
+
+=item B<--prefix=>I<prefix>
+
+Prefix to be used for all C identifiers (type and variable names) in enum and default messages
+files.
+
+=item B<--os=>I<str>
+
+Specify the name of the OS for which message formats are to be converted. If not specified
+explicitly, the value of the LIBOMP_OS environment variable is used. If LIBOMP_OS is not defined,
+the host OS is detected.
+
+Depending on OS, B<message-converter.pl> converts message formats to GNU style or MS style.
+
+=item Standard Options
+
+=over
+
+=item B<--doc>
+
+=item B<--manual>
+
+Print full documentation and exit.
+
+=item B<--help>
+
+Print short help message and exit.
+
+=item B<--version>
+
+Print version string and exit.
+
+=back
+
+=back
+
+=head1 ARGUMENTS
+
+=over
+
+=item I<file>
+
+A name of input file.
+
+=back
+
+=head1 DESCRIPTION
+
+=head2 Message Catalog File Format
+
+It is plain text file in UTF-8 encoding. Empty lines and lines beginning with sharp sign (C<#>) are
+ignored. EBNF syntax of content:
+
+    catalog    = { section };
+    section    = header body;
+    header     = "-*- " section-id " -*-" "\n";
+    body       = { message };
+    message    = message-id string "\n" { string "\n" };
+    section-id = identifier;
+    message-id = "OBSOLETE" | identifier;
+    identifier = letter { letter | digit | "_" };
+    string     = """ { character } """;
+
+Identifier starts with letter, with following letters, digits, and underscores. Identifiers are
+case-sensitive. Section identifiers are fixed: C<META>, C<STRINGS>, C<FORMATS>, C<MESSAGES> and
+C<HINTS>. Message identifiers must be unique within section. Special C<OBSOLETE> pseudo-identifier
+may be used many times.
+
+String is a C string literal which must not cross line boundaries.
+Long messages may occupy multiple lines, a string per line.
+
+Message may include printf-like GNU-style placeholders for arguments: C<%I<n>$I<t>>,
+where I<n> is argument number (C<1>, C<2>, ...),
+I<t> -- argument type, C<s> (string) or C<d> (32-bit integer).
+
+See also comments in F<i18n/en_US.txt>.
+
+=head2 Output Files
+
+This script can generate 4 different text files from a single source:
+
+=over
+
+=item Enum file.
+
+Enum file is a C include file, containing definitions of message identifiers, e. g.:
+
+    enum kmp_i18n_id {
+
+        // Set #1, meta.
+        kmp_i18n_prp_first = 65536,
+        kmp_i18n_prp_Language,
+        kmp_i18n_prp_Country,
+        kmp_i18n_prp_LangId,
+        kmp_i18n_prp_Version,
+        kmp_i18n_prp_Revision,
+        kmp_i18n_prp_last,
+
+        // Set #2, strings.
+        kmp_i18n_str_first = 131072,
+        kmp_i18n_str_Error,
+        kmp_i18n_str_UnknownFile,
+        kmp_i18n_str_NotANumber,
+        ...
+
+        // Set #3, formats.
+        ...
+
+        kmp_i18n_xxx_lastest
+
+    }; // enum kmp_i18n_id
+
+    typedef enum kmp_i18n_id  kmp_i18n_id_t;
+
+=item Default messages file.
+
+Default messages file is a C include file containing default messages to be embedded into
+application (and used if external message catalog does not exist or could not be opened):
+
+    static char const *
+    __kmp_i18n_default_meta[] =
+        {
+            NULL,
+            "English",
+            "USA",
+            "1033",
+            "2",
+            "20090806",
+            NULL
+        };
+
+    static char const *
+    __kmp_i18n_default_strings[] =
+        {
+            NULL,
+            "Error",
+            "(unknown file)",
+            "not a number",
+            ...
+            NULL
+        };
+
+    ...
+
+=item Message file.
+
+Message file is an input for message compiler, F<gencat> on Linux* OS and OS X*, or F<mc.exe> on
+Windows* OS.
+
+Here is the example of Linux* OS message file:
+
+    $quote "
+    1 "Japanese"
+    2 "Japan"
+    3 "1041"
+    4 "2"
+    5 "Based on English message catalog revision 20090806"
+    ...
+
+Example of Windows* OS message file:
+
+    LanguageNames = (Japanese=1041:msg_1041)
+
+    FacilityNames = (
+     prp=1
+     str=2
+     fmt=3
+     ...
+    )
+
+    MessageId=1
+    Facility=prp
+    Language=Japanese
+    Japanese
+    .
+
+    ...
+
+=item Signature.
+
+Signature is a processed source file: comments stripped, strings deleted, but placeholders kept and
+sorted.
+
+    -*- FORMATS-*-
+
+    Info                                     %1$d %2$s
+    Warning                                  %1$d %2$s
+    Fatal                                    %1$d %2$s
+    SysErr                                   %1$d %2$s
+    Hint                                     %1$- %2$s
+    Pragma                                   %1$s %2$s %3$s %4$s
+
+The purpose of signatures -- compare two message source files for compatibility. If signatures of
+two message sources are the same, binary message catalogs will be compatible.
+
+=back
+
+=head1 EXAMPLES
+
+Generate include file containing message identifiers:
+
+    $ message-converter.pl --enum-file=kmp_i18n_id.inc en_US.txt
+
+Generate include file containing default messages:
+
+    $ message-converter.pl --default-file=kmp_i18n_default.inc en_US.txt
+
+Generate input file for message compiler, Linux* OS example:
+
+    $ message-converter.pl --message-file=ru_RU.UTF-8.msg ru_RU.txt
+
+Generate input file for message compiler, Windows* OS example:
+
+    > message-converter.pl --message-file=ru_RU.UTF-8.mc ru_RU.txt
+
+=cut
+
+# end of file #
+
diff --git a/final/runtime/tools/summarizeStats.py b/final/runtime/tools/summarizeStats.py
new file mode 100644
index 0000000..f2c5f5e
--- /dev/null
+++ b/final/runtime/tools/summarizeStats.py
@@ -0,0 +1,323 @@
+#!/usr/bin/python
+
+import pandas as pd
+import numpy as np
+import re
+import sys
+import os
+import argparse
+import matplotlib
+from matplotlib import pyplot as plt
+from matplotlib.projections.polar import PolarAxes
+from matplotlib.projections import register_projection
+
+"""
+Read the stats file produced by the OpenMP runtime
+and produce a processed summary
+
+The radar_factory original code was taken from
+matplotlib.org/examples/api/radar_chart.html
+We added support to handle negative values for radar charts
+"""
+
+def radar_factory(num_vars, frame='circle'):
+    """Create a radar chart with num_vars axes."""
+    # calculate evenly-spaced axis angles
+    theta = 2*np.pi * np.linspace(0, 1-1./num_vars, num_vars)
+    # rotate theta such that the first axis is at the top
+    #theta += np.pi/2
+
+    def draw_poly_frame(self, x0, y0, r):
+        # TODO: use transforms to convert (x, y) to (r, theta)
+        verts = [(r*np.cos(t) + x0, r*np.sin(t) + y0) for t in theta]
+        return plt.Polygon(verts, closed=True, edgecolor='k')
+
+    def draw_circle_frame(self, x0, y0, r):
+        return plt.Circle((x0, y0), r)
+
+    frame_dict = {'polygon': draw_poly_frame, 'circle': draw_circle_frame}
+    if frame not in frame_dict:
+        raise ValueError, 'unknown value for `frame`: %s' % frame
+
+    class RadarAxes(PolarAxes):
+        """
+        Class for creating a radar chart (a.k.a. a spider or star chart)
+
+        http://en.wikipedia.org/wiki/Radar_chart
+        """
+        name = 'radar'
+        # use 1 line segment to connect specified points
+        RESOLUTION = 1
+        # define draw_frame method
+        draw_frame = frame_dict[frame]
+
+        def fill(self, *args, **kwargs):
+            """Override fill so that line is closed by default"""
+            closed = kwargs.pop('closed', True)
+            return super(RadarAxes, self).fill(closed=closed, *args, **kwargs)
+
+        def plot(self, *args, **kwargs):
+            """Override plot so that line is closed by default"""
+            lines = super(RadarAxes, self).plot(*args, **kwargs)
+            #for line in lines:
+            #    self._close_line(line)
+
+        def set_varlabels(self, labels):
+            self.set_thetagrids(theta * 180/np.pi, labels,fontsize=14)
+
+        def _gen_axes_patch(self):
+            x0, y0 = (0.5, 0.5)
+            r = 0.5
+            return self.draw_frame(x0, y0, r)
+
+    register_projection(RadarAxes)
+    return theta
+
+# Code to read the raw stats
+def extractSI(s):
+    """Convert a measurement with a range suffix into a suitably scaled value"""
+    du     = s.split()
+    num    = float(du[0])
+    units  = du[1] if len(du) == 2 else ' '
+    # http://physics.nist.gov/cuu/Units/prefixes.html
+    factor = {'Y':  1e24,
+              'Z':  1e21,
+              'E':  1e18,
+              'P':  1e15,
+              'T':  1e12,
+              'G':  1e9,
+              'M':  1e6,
+              'k':  1e3,
+              ' ':  1  ,
+              'm': -1e3, # Yes, I do mean that, see below for the explanation.
+              'u': -1e6,
+              'n': -1e9,
+              'p': -1e12,
+              'f': -1e15,
+              'a': -1e18,
+              'z': -1e21,
+              'y': -1e24}[units[0]]
+    # Minor trickery here is an attempt to preserve accuracy by using a single
+    # divide, rather than  multiplying by 1/x, which introduces two roundings
+    # since 1/10 is not representable perfectly in IEEE floating point. (Not
+    # that this really matters, other than for cleanliness, since we're likely
+    # reading numbers with at most five decimal digits of precision).
+    return  num*factor if factor > 0 else num/-factor
+
+def readData(f):
+    line = f.readline()
+    fieldnames = [x.strip() for x in line.split(',')]
+    line = f.readline().strip()
+    data = []
+    while line != "":
+        if line[0] != '#':
+            fields = line.split(',')
+            data.append ((fields[0].strip(), [extractSI(v) for v in fields[1:]]))
+        line = f.readline().strip()
+    # Man, working out this next incantation out was non-trivial!
+    # They really want you to be snarfing data in csv or some other
+    # format they understand!
+    res = pd.DataFrame.from_items(data, columns=fieldnames[1:], orient='index')
+    return res
+
+def readTimers(f):
+    """Skip lines with leading #"""
+    line = f.readline()
+    while line[0] == '#':
+        line = f.readline()
+    line = line.strip()
+    if line == "Statistics on exit\n" or "Aggregate for all threads\n":
+        line = f.readline()
+    return readData(f)
+
+def readCounters(f):
+    """Read the counter table from f; it is parsed by readData, like the timer table."""
+    return readData(f)
+
+def readFile(fname):
+    """Read the statistics from the file. Return a dict with keys "timers", "counters" """
+    res = {}
+    try:
+        with open(fname) as f:
+            res["timers"]   = readTimers(f)
+            res["counters"] = readCounters(f)
+            return res
+    except (OSError, IOError):
+        print "Cannot open " + fname
+        return None
+
+def usefulValues(l):
+    """I.e. values which are neither null nor zero"""
+    return [p and q for (p,q) in zip (pd.notnull(l), l != 0.0)]
+
+def uselessValues(l):
+    """I.e. values which are null or zero"""
+    return [not p for p in usefulValues(l)]
+
+# Names of the statistics groups processed by main(); they are also the keys
+# of the dict returned by readFile().
+interestingStats = ("counters", "timers")
+# For each statistics group: (y-axis label, chart title), as used by
+# drawChart() and multiAppBarChartSettings().
+statProperties   = {"counters" : ("Count", "Counter Statistics"),
+                    "timers"   : ("Time (ticks)", "Timer Statistics")
+                   }
+
+def drawChart(data, kind, filebase):
+    """Draw a summary bar chart for the requested data frame into the specified file"""
+    data["Mean"].plot(kind="bar", logy=True, grid=True, colormap="GnBu",
+                      yerr=data["SD"], ecolor="black")
+    plt.xlabel("OMP Constructs")
+    plt.ylabel(statProperties[kind][0])
+    plt.title (statProperties[kind][1])
+    plt.tight_layout()
+    plt.savefig(filebase+"_"+kind)
+
+def normalizeValues(data, countField, factor):
+    """Normalize values into a rate by dividing them all by the given factor"""
+    data[[k for k in data.keys() if k != countField]] /= factor
+
+
+def setRadarFigure(titles):
+    """Set the attributes for the radar plots"""
+    fig = plt.figure(figsize=(9,9))
+    rect = [0.1, 0.1, 0.8, 0.8]
+    labels = [0.2, 0.4, 0.6, 0.8, 1, 2, 3, 4, 5, 10]
+    matplotlib.rcParams.update({'font.size':13})
+    theta = radar_factory(len(titles))
+    ax = fig.add_axes(rect, projection='radar')
+    ax.set_rgrids(labels)
+    ax.set_varlabels(titles)
+    ax.text(theta[2], 1, "Linear->Log", horizontalalignment='center', color='green', fontsize=18)
+    return {'ax':ax, 'theta':theta}
+
+
+def drawRadarChart(data, kind, filebase, params, color):
+    """Draw the radar plots"""
+    tmp_lin = data * 0
+    tmp_log = data * 0
+    for key in data.keys():
+        if data[key] >= 1:
+           tmp_log[key] = np.log10(data[key])
+        else:
+           tmp_lin[key] = (data[key])
+    params['ax'].plot(params['theta'], tmp_log, color='b', label=filebase+"_"+kind+"_log")
+    params['ax'].plot(params['theta'], tmp_lin, color='r', label=filebase+"_"+kind+"_linear")
+    params['ax'].legend(loc='best', bbox_to_anchor=(1.4,1.2))
+    params['ax'].set_rlim((0, np.ceil(max(tmp_log))))
+
+def multiAppBarChartSettings(ax, plt, index, width, n, tmp, s):
+    ax.set_yscale('log')
+    ax.legend()
+    ax.set_xticks(index + width * n / 2)
+    ax.set_xticklabels(tmp[s]['Total'].keys(), rotation=50, horizontalalignment='right')
+    plt.xlabel("OMP Constructs")
+    plt.ylabel(statProperties[s][0])
+    plt.title(statProperties[s][1])
+    plt.tight_layout()
+
+def derivedTimerStats(data):
+    stats = {}
+    for key in data.keys():
+        if key == 'OMP_worker_thread_life':
+            totalRuntime = data['OMP_worker_thread_life']
+        elif key in ('FOR_static_iterations', 'OMP_PARALLEL_args',
+                     'OMP_set_numthreads', 'FOR_dynamic_iterations'):
+            break
+        else:
+            stats[key] = 100 * data[key] / totalRuntime
+    return stats
+
+def compPie(data):
+    compKeys = {}
+    nonCompKeys = {}
+    for key in data.keys():
+        if key in ('OMP_critical', 'OMP_single', 'OMP_serial',
+                   'OMP_parallel', 'OMP_master', 'OMP_task_immediate',
+                   'OMP_task_taskwait', 'OMP_task_taskyield', 'OMP_task_taskgroup',
+                   'OMP_task_join_bar', 'OMP_task_plain_bar', 'OMP_task_taskyield'):
+            compKeys[key] = data[key]
+        else:
+            nonCompKeys[key] = data[key]
+    print "comp keys:", compKeys, "\n\n non comp keys:", nonCompKeys
+    return [compKeys, nonCompKeys]
+
+def drawMainPie(data, filebase, colors):
+    sizes = [sum(data[0].values()), sum(data[1].values())]
+    explode = [0,0]
+    labels = ["Compute - " + "%.2f" % sizes[0], "Non Compute - " + "%.2f" % sizes[1]]
+    patches = plt.pie(sizes, explode, colors=colors, startangle=90)
+    plt.title("Time Division")
+    plt.axis('equal')
+    plt.legend(patches[0], labels, loc='best', bbox_to_anchor=(-0.1,1), fontsize=16)
+    plt.savefig(filebase+"_main_pie", bbox_inches='tight')
+
+def drawSubPie(data, tag, filebase, colors):
+    explode = []
+    labels = data.keys()
+    sizes = data.values()
+    total = sum(sizes)
+    percent = []
+    for i in range(len(sizes)):
+        explode.append(0)
+        percent.append(100 * sizes[i] / total)
+        labels[i] = labels[i] + " - %.2f" % percent[i]
+    patches = plt.pie(sizes, explode=explode, colors=colors, startangle=90)
+    plt.title(tag+"(Percentage of Total:"+" %.2f" % (sum(data.values()))+")")
+    plt.tight_layout()
+    plt.axis('equal')
+    plt.legend(patches[0], labels, loc='best', bbox_to_anchor=(-0.1,1), fontsize=16)
+    plt.savefig(filebase+"_"+tag, bbox_inches='tight')
+
+def main():
+    parser = argparse.ArgumentParser(description='''This script takes a list
+        of files containing each of which contain output from a stats-gathering
+        enabled OpenMP runtime library.  Each stats file is read, parsed, and
+        used to produce a summary of the statistics''')
+    parser.add_argument('files', nargs='+',
+        help='files to parse which contain stats-gathering output')
+    command_args = parser.parse_args()
+    colors = ['orange', 'b', 'r', 'yellowgreen', 'lightsage', 'lightpink',
+              'green', 'purple', 'yellow', 'cyan', 'mediumturquoise',
+              'olive']
+    stats = {}
+    matplotlib.rcParams.update({'font.size':22})
+    for s in interestingStats:
+        fig, ax = plt.subplots()
+        width = 0.45
+        n = 0
+        index = 0
+
+        for f in command_args.files:
+            filebase = os.path.splitext(f)[0]
+            tmp = readFile(f)
+            data = tmp[s]['Total']
+            """preventing repetition by removing rows similar to Total_OMP_work
+                as Total_OMP_work['Total'] is same as OMP_work['Total']"""
+            if s == 'counters':
+                elapsedTime = tmp["timers"]["Mean"]["OMP_worker_thread_life"]
+                normalizeValues(tmp["counters"], "SampleCount",
+                    elapsedTime / 1.e9)
+                """Plotting radar charts"""
+                params = setRadarFigure(data.keys())
+                chartType = "radar"
+                drawRadarChart(data, s, filebase, params, colors[n])
+                """radar Charts finish here"""
+                plt.savefig(filebase+"_"+s+"_"+chartType, bbox_inches='tight')
+            elif s == 'timers':
+                print "overheads in "+filebase
+                numThreads = tmp[s]['SampleCount']['Total_OMP_parallel']
+                for key in data.keys():
+                    if key[0:5] == 'Total':
+                        del data[key]
+                stats[filebase] = derivedTimerStats(data)
+                dataSubSet = compPie(stats[filebase])
+                drawMainPie(dataSubSet, filebase, colors)
+                plt.figure(0)
+                drawSubPie(dataSubSet[0], "Computational Time", filebase, colors)
+                plt.figure(1)
+                drawSubPie(dataSubSet[1], "Non Computational Time", filebase, colors)
+                with open('derivedStats_{}.csv'.format(filebase), 'w') as f:
+                    f.write('================={}====================\n'.format(filebase))
+                    f.write(pd.DataFrame(stats[filebase].items()).to_csv()+'\n')
+            n += 1
+    plt.close()
+
+if __name__ == "__main__":
+    main()
diff --git a/final/www/README.txt b/final/www/README.txt
new file mode 100644
index 0000000..edbba19
--- /dev/null
+++ b/final/www/README.txt
@@ -0,0 +1,116 @@
+
+               README for the LLVM* OpenMP* Runtime Library
+               ============================================
+
+How to Build Documentation
+==========================
+
+The main documentation is in Doxygen* format, and this distribution
+should come with pre-built PDF documentation in doc/Reference.pdf.
+However, an HTML version can be built by executing:
+
+% doxygen doc/doxygen/config
+
+in the runtime directory.
+
+That will produce HTML documentation in the doc/doxygen/generated
+directory, which can be accessed by pointing a web browser at the
+index.html file there.
+
+If you don't have Doxygen installed, you can download it from
+www.doxygen.org.
+
+
+How to Build the LLVM* OpenMP* Runtime Library
+==============================================
+In-tree build:
+
+$ cd where-you-want-to-live
+Check out openmp into llvm/projects
+$ cd where-you-want-to-build
+$ mkdir build && cd build
+$ cmake path/to/llvm -DCMAKE_C_COMPILER=<C compiler> -DCMAKE_CXX_COMPILER=<C++ compiler>
+$ make omp
+
+Out-of-tree build:
+
+$ cd where-you-want-to-live
+Check out openmp
+$ cd where-you-want-to-live/openmp
+$ mkdir build && cd build
+$ cmake path/to/openmp -DCMAKE_C_COMPILER=<C compiler> -DCMAKE_CXX_COMPILER=<C++ compiler>
+$ make
+
+For details about building, please look at README.rst.
+
+Architectures Supported
+=======================
+* IA-32 architecture
+* Intel(R) 64 architecture
+* Intel(R) Many Integrated Core Architecture
+* ARM* architecture
+* Aarch64 (64-bit ARM) architecture
+* IBM(R) Power architecture (big endian)
+* IBM(R) Power architecture (little endian)
+* MIPS and MIPS64 architectures
+
+Supported RTL Build Configurations
+==================================
+
+Supported Architectures: IA-32 architecture, Intel(R) 64, and
+Intel(R) Many Integrated Core Architecture
+
+              ----------------------------------------------
+              |   icc/icl     |    gcc      |   clang      |
+--------------|---------------|----------------------------|
+| Linux* OS   |   Yes(1,5)    |  Yes(2,4)   | Yes(4,6,7)   |
+| FreeBSD*    |   No          |  No         | Yes(4,6,7,8) |
+| OS X*       |   Yes(1,3,4)  |  No         | Yes(4,6,7)   |
+| Windows* OS |   Yes(1,4)    |  No         | No           |
+------------------------------------------------------------
+
+(1) On IA-32 architecture and Intel(R) 64, icc/icl versions 12.x are
+    supported (12.1 is recommended).
+(2) GCC* version 4.7 is supported.
+(3) For icc on OS X*, OS X* version 10.5.8 is supported.
+(4) Intel(R) Many Integrated Core Architecture not supported.
+(5) On Intel(R) Many Integrated Core Architecture, icc/icl versions 13.0
+    or later are required.
+(6) Clang* version 3.3 is supported.
+(7) Clang* currently does not offer a software-implemented 128 bit extended
+    precision type.  Thus, all entry points reliant on this type are removed
+    from the library and cannot be called in the user program.  The following
+    functions are not available:
+    __kmpc_atomic_cmplx16_*
+    __kmpc_atomic_float16_*
+    __kmpc_atomic_*_fp
+(8) Community contribution provided AS IS, not tested by Intel.
+
+Supported Architectures: IBM(R) Power 7 and Power 8
+
+              -----------------------------
+              |   gcc      |   clang      |
+--------------|------------|--------------|
+| Linux* OS   |  Yes(1,2)  | Yes(3,4)     |
+-------------------------------------------
+
+(1) On Power 7, gcc version 4.8.2 is supported.
+(2) On Power 8, gcc version 4.8.2 is supported.
+(3) On Power 7, clang version 3.7 is supported.
+(4) On Power 8, clang version 3.7 is supported.
+
+
+Front-end Compilers that work with this RTL
+===========================================
+
+The following compilers are known to do compatible code generation for
+this RTL: clang (from the OpenMP development branch at
+http://clang-omp.github.io/ ), Intel compilers, GCC.  See the documentation
+for more details.
+
+-----------------------------------------------------------------------
+
+Notices
+=======
+
+*Other names and brands may be claimed as the property of others.
diff --git a/final/www/Reference.pdf b/final/www/Reference.pdf
new file mode 100644
index 0000000..e97c40c
--- /dev/null
+++ b/final/www/Reference.pdf
Binary files differ
diff --git a/final/www/content.css b/final/www/content.css
new file mode 100644
index 0000000..dca6a32
--- /dev/null
+++ b/final/www/content.css
@@ -0,0 +1,27 @@
+html { margin: 0px; } body { margin: 8px; }
+
+html, body {
+  padding:0px;
+  font-size:small; font-family:"Lucida Grande", "Lucida Sans Unicode", Arial, Verdana, Helvetica, sans-serif; background-color: #fff; color: #222;
+  line-height:1.5;
+}
+
+h1, h2, h3, tt { color: #000 }
+
+h1 { padding-top:0px; margin-top:0px;}
+h2 { color:#333333; padding-top:0.5em; }
+h3 { padding-top: 0.5em; margin-bottom: -0.25em; color:#2d58b7}
+li { padding-bottom: 0.5em; }
+ul { padding-left:1.5em; }
+
+/* Slides */
+IMG.img_slide {
+    display: block;
+    margin-left: auto;
+    margin-right: auto
+}
+
+.itemTitle { color:#2d58b7 }
+
+/* Tables */
+tr { vertical-align:top }
diff --git a/final/www/index.html b/final/www/index.html
new file mode 100644
index 0000000..3d1f977
--- /dev/null
+++ b/final/www/index.html
@@ -0,0 +1,226 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
+          "http://www.w3.org/TR/html4/strict.dtd">
+<!-- Material used from: HTML 4.01 specs: http://www.w3.org/TR/html401/ -->
+<html>
+<head>
+  <META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
+  <title>OpenMP* : Support for the OpenMP language</title>
+  <link type="text/css" rel="stylesheet" href="menu.css">
+  <link type="text/css" rel="stylesheet" href="content.css">
+</head>
+
+<body>
+<div id="menu">
+  <div>
+    <a href="http://llvm.org/">LLVM Home</a>
+  </div>
+
+  <div class="submenu">
+    <label>OpenMP Info</label>
+    <a href="/index.html">About</a>
+  </div>
+
+  <div class="submenu">
+    <label>Quick Links</label>
+    <a href="http://lists.llvm.org/mailman/listinfo/openmp-dev">openmp-dev</a>
+    <a href="http://lists.llvm.org/mailman/listinfo/openmp-commits">openmp-commits</a>
+    <a href="http://llvm.org/bugs/">Bug Reports</a>
+    <a href="https://github.com/llvm/llvm-project/tree/master/openmp">Browse Sources</a>
+  </div>
+</div>
+
+<div id="content">
+  <!--*********************************************************************-->
+  <h1>OpenMP&reg;: Support for the OpenMP language</h1>
+  <!--*********************************************************************-->
+
+  <p>The OpenMP subproject of LLVM contains the
+     components required to build an executable OpenMP program that are
+     outside the compiler itself.
+  </p>
+
+  <p>Here you can find:
+    <ul>
+      <li>
+        the code for the runtime library against which
+        code compiled by <tt>clang -fopenmp</tt> must be linked before it
+        can run.
+      </li>
+      <li>
+        the library that supports offload to target devices (in
+        "libomptarget").
+      </li>
+    </ul>
+  </p>
+
+  <p>Support for the parts of the OpenMP 4.0 (and later) language that are not
+  associated with the "target" constructs is contained in the
+  "runtime" directory. Support for offloading computation via the
+  "target" directive is in the separate "libomptarget" directory.
+  </p>
+
+  <p>All of the code here is <a
+     href="http://llvm.org/docs/DeveloperPolicy.html#license">dual licensed</a>
+     under the MIT license and the UIUC License (a BSD-like license).
+     The LICENSE.txt file at the top of the OpenMP project contains
+     the license text and associated patent grants.
+  </p>
+
+  <!--=====================================================================-->
+  <h2 id="dir-structure">Status</h2>
+  <!--=====================================================================-->
+
+   <p>With the release of Clang 3.8.0, OpenMP 3.1 support is enabled in
+   Clang by default, and the OpenMP runtime is therefore built as a
+   normal part of the Clang build, and distributed with the binary
+   distributions. You do not, therefore, need explicitly to check out this code, or
+   build it out of tree; a normal Clang check out and build will
+   automatically include building these runtime libraries.
+   </p>
+
+  <!--=====================================================================-->
+  <h2 id="goals">Features and Goals</h2>
+  <!--=====================================================================-->
+
+    <ul>
+        <li>Support for the <a href="http://www.openmp.org/mp-documents/OpenMP3.1.pdf">OpenMP
+          3.1 standard (PDF)</a> has been achieved in the Clang 3.8.0
+          release.
+        </li>
+
+        <li>Support for the
+ <a href="http://www.openmp.org/mp-documents/OpenMP4.0.0.pdf">OpenMP
+          4.0 standard (PDF)</a> and <a href="http://www.openmp.org/mp-documents/OpenMP4.5.pdf">OpenMP
+          4.5 standard (PDF)</a> is now being implemented. (Some OpenMP 4.0
+          and 4.5 features are already available).</li>
+        <li>High performance.</li>
+        <li>ABI compatibility with <a href="http://gcc.gnu.org">GCC</a> and
+        <a href="http://software.intel.com/en-us/intel-compilers">Intel's
+        existing OpenMP compilers</a>.
+        We currently have binary compatibility with OpenMP
+        3.1 code compiled by gcc 4.9; however, we do not have support
+        for OpenMP 4.0 code that uses task cancellation when compiled
+        by gcc 4.9. How we will support such code remains a research issue.
+        </li>
+    </ul>
+
+  <!--=====================================================================-->
+  <h2 id="why">Why have the runtime code here?</h2>
+  <!--=====================================================================-->
+
+  <p>It makes sense to have the runtime sources in the same place
+    (and with the same license) as the compiler.
+  </p>
+
+  <!--=====================================================================-->
+  <h2 id="requirements">Platform Support</h2>
+  <!--=====================================================================-->
+
+   <p>The runtime can be built with gcc, icc or clang. However, note
+   that a runtime built with clang cannot be guaranteed to work with
+   OpenMP code compiled by the other compilers, since clang does not support
+   a 128-bit float type, and cannot therefore generate the code used
+   for reductions of that type (which may occur in user code compiled
+   by the other compilers).
+   </p>
+
+  <p>The OpenMP runtime is known to work on
+    <ul>
+      <li>ARM&reg;&nbsp; architecture processors</li>
+      <li>PowerPC&trade;&nbsp; processors</li>
+      <li>32 and 64 bit X86
+        processors when compiled with clang, with the Intel compiler
+        or with gcc, and also the Intel&reg;&nbsp;Xeon Phi&trade; product family, when compiled with
+        the Intel compiler.
+      </li>
+      <li>MIPS and MIPS64</li>
+    </ul>
+    Ports to other architectures and operating systems are welcome.
+  </p>
+
+  <p>A full OS and architecture compatibility matrix is in
+    <a href="README.txt">README.txt</a>.
+  </p>
+
+
+  <!--=====================================================================-->
+  <h2>Get it and get involved!</h2>
+  <!--=====================================================================-->
+
+  <p>First please review our
+     <a href="http://llvm.org/docs/DeveloperPolicy.html">Developer's Policy</a>.</p>
+
+  <p>To check out the code, use:</p>
+
+  <ul>
+  <li><code>git clone https://github.com/llvm/llvm-project.git</code></li>
+  </ul>
+
+  <p>In-tree build:</p>
+  <ul>
+    <li><code>cd llvm-project</code></li>
+    <li><code>mkdir build &amp;&amp; cd build</code></li>
+    <li><code>cmake ../llvm -DLLVM_ENABLE_PROJECTS=openmp -DCMAKE_C_COMPILER=&lt;C compiler&gt; -DCMAKE_CXX_COMPILER=&lt;C++ compiler&gt;</code></li>
+    <li><code>make omp</code></li>
+  </ul>
+
+  <p>Out-of-tree build:</p>
+  <ul>
+    <li><code>cd llvm-project</code></li>
+    <li><code>mkdir build-openmp &amp;&amp; cd build-openmp</code></li>
+    <li><code>cmake ../openmp -DCMAKE_C_COMPILER=&lt;C compiler&gt; -DCMAKE_CXX_COMPILER=&lt;C++ compiler&gt;</code></li>
+    <li><code>make</code></li>
+  </ul>
+
+  <p>Full details of how to build are in the
+    <a href="README.txt">README.txt</a> and README.rst in the source code repository.
+  </p>
+
+  <!--=====================================================================-->
+  <h3>Notes</h3>
+  <!--=====================================================================-->
+
+<p>
+
+</p>
+
+  <p>Send discussions to the
+  <a href="http://lists.llvm.org/mailman/listinfo/openmp-dev">OpenMP mailing list</a>.</p>
+
+
+  <!--=====================================================================-->
+  <h2>Design Documents</h2>
+  <!--=====================================================================-->
+
+<ul>
+<li><a href="Reference.pdf">Runtime design (PDF)</a></li>
+</ul>
+
+  <!--=====================================================================-->
+  <h2>Copyright notices</h2>
+  <!--=====================================================================-->
+<ul>
+<li>
+  The OpenMP name and the OpenMP logo are registered trademarks of the
+  OpenMP Architecture Review Board.
+</li>
+<li>
+  Intel is a trademark of Intel Corporation in the U.S. and/or other
+  countries.
+</li>
+<li>
+  PowerPC is a trademark of IBM Corporation in the U.S. and/or other
+  countries.
+</li>
+<li>
+  ARM is a trademark of ARM Corporation in the U.S. and/or
+  other countries.
+</li>
+<li>
+  MIPS is a trademark of MIPS Computer Systems in the U.S. and/or
+  other countries.
+</li>
+</ul>
+</div>
+</body>
+</html>
diff --git a/final/www/menu.css b/final/www/menu.css
new file mode 100644
index 0000000..4a887b1
--- /dev/null
+++ b/final/www/menu.css
@@ -0,0 +1,39 @@
+/***************/
+/* page layout */
+/***************/
+
+[id=menu] {
+	position:fixed; /* menu is positioned relative to the viewport */
+	width:25ex;
+}
+[id=content] {
+	/* *****  EDIT THE left VALUE BELOW IF CONTENT OVERLAPS MENU ***** */
+	position:absolute;
+  left:29ex; /* content starts to the right of the 25ex-wide menu */
+	padding-right:4ex;
+}
+
+/**************/
+/* menu style */
+/**************/
+
+#menu .submenu {
+	padding-top:1em; /* vertical gap above each submenu group */
+	display:block;
+}
+
+#menu label {
+	display:block; /* each group heading takes its own line */
+	font-weight: bold;
+	text-align: center;
+	background-color: rgb(192,192,192); /* darker grey than the links below */
+}
+#menu a {
+	padding:0 .2em;
+	display:block; /* stack the links vertically, one per line */
+	text-align: center;
+	background-color: rgb(235,235,235);
+}
+#menu a:visited {
+	color:rgb(100,50,100); /* text color for already-visited links */
+}
